diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,42423 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9978900335112324, + "eval_steps": 504, + "global_step": 6042, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004964627032394191, + "grad_norm": 0.48562569977411674, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.8809, + "step": 1 + }, + { + "epoch": 0.0004964627032394191, + "eval_loss": 0.9288526773452759, + "eval_runtime": 135.0527, + "eval_samples_per_second": 224.749, + "eval_steps_per_second": 28.1, + "step": 1 + }, + { + "epoch": 0.0009929254064788382, + "grad_norm": 0.49840719985370363, + "learning_rate": 5.000000000000001e-07, + "loss": 0.8877, + "step": 2 + }, + { + "epoch": 0.0014893881097182574, + "grad_norm": 0.5073931939592379, + "learning_rate": 7.5e-07, + "loss": 0.912, + "step": 3 + }, + { + "epoch": 0.0019858508129576764, + "grad_norm": 0.4677170410046681, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.953, + "step": 4 + }, + { + "epoch": 0.002482313516197096, + "grad_norm": 0.48958186047439367, + "learning_rate": 1.25e-06, + "loss": 0.9888, + "step": 5 + }, + { + "epoch": 0.002978776219436515, + "grad_norm": 0.47591967932406565, + "learning_rate": 1.5e-06, + "loss": 0.9378, + "step": 6 + }, + { + "epoch": 0.0034752389226759338, + "grad_norm": 0.4775015628908847, + "learning_rate": 1.75e-06, + "loss": 0.9636, + "step": 7 + }, + { + "epoch": 0.003971701625915353, + "grad_norm": 0.4636218067902225, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.9179, + "step": 8 + }, + { + "epoch": 0.004468164329154772, + "grad_norm": 0.4627241033337522, + "learning_rate": 2.25e-06, + "loss": 0.9676, + "step": 9 + }, + { + "epoch": 0.004964627032394192, + "grad_norm": 0.4866597760611659, + "learning_rate": 2.5e-06, + "loss": 0.9696, + "step": 10 + }, + { + "epoch": 0.00546108973563361, + "grad_norm": 0.4791346551664399, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.907, + "step": 11 + }, + { + "epoch": 0.00595755243887303, + "grad_norm": 0.4769913096433881, + "learning_rate": 3e-06, + "loss": 0.8921, + "step": 12 + }, + { + "epoch": 0.006454015142112449, + "grad_norm": 0.34294431274413834, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.926, + "step": 13 + }, + { + "epoch": 0.0069504778453518675, + "grad_norm": 0.34641042017011436, + "learning_rate": 3.5e-06, + "loss": 0.9275, + "step": 14 + }, + { + "epoch": 0.007446940548591287, + "grad_norm": 0.34905473483457555, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8703, + "step": 15 + }, + { + "epoch": 0.007943403251830706, + "grad_norm": 0.3172635186340447, + "learning_rate": 4.000000000000001e-06, + "loss": 0.9291, + "step": 16 + }, + { + "epoch": 0.008439865955070125, + "grad_norm": 0.2879283799693857, + "learning_rate": 4.25e-06, + "loss": 0.8477, + "step": 17 + }, + { + "epoch": 0.008936328658309544, + "grad_norm": 0.27723518517408907, + "learning_rate": 4.5e-06, + "loss": 0.8719, + "step": 18 + }, + { + "epoch": 0.009432791361548964, + "grad_norm": 0.3064644489149753, + "learning_rate": 4.75e-06, + "loss": 0.8433, + "step": 19 + }, + { + "epoch": 0.009929254064788383, + "grad_norm": 0.29412214176584833, + "learning_rate": 5e-06, + "loss": 0.859, + "step": 20 + }, + { + "epoch": 0.010425716768027803, + "grad_norm": 0.2822069566103716, + "learning_rate": 4.999999808957543e-06, + "loss": 0.8315, + "step": 21 + }, + { + "epoch": 0.01092217947126722, + "grad_norm": 0.2959170002125076, + "learning_rate": 4.9999992358301984e-06, + "loss": 0.8518, + "step": 22 + }, + { + "epoch": 0.01141864217450664, + "grad_norm": 0.2847305128418995, + "learning_rate": 4.9999982806180555e-06, + "loss": 0.9263, + "step": 23 + }, + { + "epoch": 0.01191510487774606, + "grad_norm": 0.2552021781953879, + "learning_rate": 4.99999694332126e-06, + "loss": 0.8462, + "step": 24 + }, + { + "epoch": 0.012411567580985479, + "grad_norm": 0.24750889112085914, + "learning_rate": 4.9999952239400165e-06, + "loss": 0.8324, + "step": 25 + }, + { + "epoch": 0.012908030284224898, + "grad_norm": 0.23795567702694292, + "learning_rate": 4.9999931224745864e-06, + "loss": 0.8167, + "step": 26 + }, + { + "epoch": 0.013404492987464317, + "grad_norm": 0.25300589875266577, + "learning_rate": 4.9999906389252926e-06, + "loss": 0.8979, + "step": 27 + }, + { + "epoch": 0.013900955690703735, + "grad_norm": 0.25172921500056106, + "learning_rate": 4.9999877732925135e-06, + "loss": 0.8962, + "step": 28 + }, + { + "epoch": 0.014397418393943155, + "grad_norm": 0.24565475616345825, + "learning_rate": 4.999984525576688e-06, + "loss": 0.8919, + "step": 29 + }, + { + "epoch": 0.014893881097182574, + "grad_norm": 0.23769439461231937, + "learning_rate": 4.999980895778312e-06, + "loss": 0.8506, + "step": 30 + }, + { + "epoch": 0.015390343800421993, + "grad_norm": 0.23162836109371954, + "learning_rate": 4.999976883897939e-06, + "loss": 0.9284, + "step": 31 + }, + { + "epoch": 0.01588680650366141, + "grad_norm": 0.2291685811405672, + "learning_rate": 4.999972489936185e-06, + "loss": 0.8594, + "step": 32 + }, + { + "epoch": 0.016383269206900832, + "grad_norm": 0.22080073693898697, + "learning_rate": 4.9999677138937185e-06, + "loss": 0.8606, + "step": 33 + }, + { + "epoch": 0.01687973191014025, + "grad_norm": 0.2168698203296081, + "learning_rate": 4.999962555771272e-06, + "loss": 0.8781, + "step": 34 + }, + { + "epoch": 0.01737619461337967, + "grad_norm": 0.22362106702009418, + "learning_rate": 4.999957015569632e-06, + "loss": 0.8172, + "step": 35 + }, + { + "epoch": 0.01787265731661909, + "grad_norm": 0.21848314476434746, + "learning_rate": 4.999951093289645e-06, + "loss": 0.823, + "step": 36 + }, + { + "epoch": 0.018369120019858506, + "grad_norm": 0.21837246279483064, + "learning_rate": 4.9999447889322195e-06, + "loss": 0.8576, + "step": 37 + }, + { + "epoch": 0.018865582723097928, + "grad_norm": 0.21667015902394673, + "learning_rate": 4.999938102498315e-06, + "loss": 0.8648, + "step": 38 + }, + { + "epoch": 0.019362045426337345, + "grad_norm": 0.2202153889422801, + "learning_rate": 4.9999310339889554e-06, + "loss": 0.9336, + "step": 39 + }, + { + "epoch": 0.019858508129576766, + "grad_norm": 0.22077174681701484, + "learning_rate": 4.9999235834052204e-06, + "loss": 0.8857, + "step": 40 + }, + { + "epoch": 0.020354970832816184, + "grad_norm": 0.20143789350780875, + "learning_rate": 4.9999157507482485e-06, + "loss": 0.8106, + "step": 41 + }, + { + "epoch": 0.020851433536055605, + "grad_norm": 0.21195838433230027, + "learning_rate": 4.999907536019238e-06, + "loss": 0.8221, + "step": 42 + }, + { + "epoch": 0.021347896239295023, + "grad_norm": 0.20602305261000692, + "learning_rate": 4.999898939219443e-06, + "loss": 0.8578, + "step": 43 + }, + { + "epoch": 0.02184435894253444, + "grad_norm": 0.21153710576925158, + "learning_rate": 4.999889960350179e-06, + "loss": 0.8217, + "step": 44 + }, + { + "epoch": 0.022340821645773862, + "grad_norm": 0.21344887165712584, + "learning_rate": 4.9998805994128166e-06, + "loss": 0.8838, + "step": 45 + }, + { + "epoch": 0.02283728434901328, + "grad_norm": 0.21002096278796756, + "learning_rate": 4.999870856408787e-06, + "loss": 0.8782, + "step": 46 + }, + { + "epoch": 0.0233337470522527, + "grad_norm": 0.20886971438867197, + "learning_rate": 4.999860731339579e-06, + "loss": 0.8295, + "step": 47 + }, + { + "epoch": 0.02383020975549212, + "grad_norm": 0.22095165664633118, + "learning_rate": 4.999850224206741e-06, + "loss": 0.8687, + "step": 48 + }, + { + "epoch": 0.02432667245873154, + "grad_norm": 0.21253844707230096, + "learning_rate": 4.999839335011878e-06, + "loss": 0.8782, + "step": 49 + }, + { + "epoch": 0.024823135161970957, + "grad_norm": 0.21604040244789194, + "learning_rate": 4.999828063756655e-06, + "loss": 0.8385, + "step": 50 + }, + { + "epoch": 0.025319597865210375, + "grad_norm": 0.21102395354985817, + "learning_rate": 4.999816410442794e-06, + "loss": 0.8903, + "step": 51 + }, + { + "epoch": 0.025816060568449796, + "grad_norm": 0.2062210328133959, + "learning_rate": 4.999804375072076e-06, + "loss": 0.8717, + "step": 52 + }, + { + "epoch": 0.026312523271689214, + "grad_norm": 0.22296430893025995, + "learning_rate": 4.999791957646341e-06, + "loss": 0.8369, + "step": 53 + }, + { + "epoch": 0.026808985974928635, + "grad_norm": 0.2172722843835763, + "learning_rate": 4.9997791581674855e-06, + "loss": 0.9088, + "step": 54 + }, + { + "epoch": 0.027305448678168052, + "grad_norm": 0.2097980373922686, + "learning_rate": 4.999765976637467e-06, + "loss": 0.8559, + "step": 55 + }, + { + "epoch": 0.02780191138140747, + "grad_norm": 0.20153765827962083, + "learning_rate": 4.9997524130583e-06, + "loss": 0.8233, + "step": 56 + }, + { + "epoch": 0.02829837408464689, + "grad_norm": 0.20769665963331085, + "learning_rate": 4.999738467432057e-06, + "loss": 0.8321, + "step": 57 + }, + { + "epoch": 0.02879483678788631, + "grad_norm": 0.21216448449114564, + "learning_rate": 4.999724139760869e-06, + "loss": 0.847, + "step": 58 + }, + { + "epoch": 0.02929129949112573, + "grad_norm": 0.2063639919391254, + "learning_rate": 4.999709430046926e-06, + "loss": 0.8064, + "step": 59 + }, + { + "epoch": 0.029787762194365148, + "grad_norm": 0.21452334497936051, + "learning_rate": 4.999694338292478e-06, + "loss": 0.8838, + "step": 60 + }, + { + "epoch": 0.03028422489760457, + "grad_norm": 0.20061672335186068, + "learning_rate": 4.999678864499828e-06, + "loss": 0.7818, + "step": 61 + }, + { + "epoch": 0.030780687600843987, + "grad_norm": 0.2046523532499392, + "learning_rate": 4.999663008671344e-06, + "loss": 0.8581, + "step": 62 + }, + { + "epoch": 0.03127715030408341, + "grad_norm": 0.20186628257277983, + "learning_rate": 4.999646770809449e-06, + "loss": 0.8445, + "step": 63 + }, + { + "epoch": 0.03177361300732282, + "grad_norm": 0.2012482614306437, + "learning_rate": 4.9996301509166225e-06, + "loss": 0.8138, + "step": 64 + }, + { + "epoch": 0.03227007571056224, + "grad_norm": 0.20616591148685806, + "learning_rate": 4.999613148995406e-06, + "loss": 0.854, + "step": 65 + }, + { + "epoch": 0.032766538413801664, + "grad_norm": 0.20274972415160764, + "learning_rate": 4.999595765048399e-06, + "loss": 0.8223, + "step": 66 + }, + { + "epoch": 0.033263001117041086, + "grad_norm": 0.20826308052556233, + "learning_rate": 4.9995779990782556e-06, + "loss": 0.8669, + "step": 67 + }, + { + "epoch": 0.0337594638202805, + "grad_norm": 0.20529737714316182, + "learning_rate": 4.999559851087694e-06, + "loss": 0.856, + "step": 68 + }, + { + "epoch": 0.03425592652351992, + "grad_norm": 0.2098299357116322, + "learning_rate": 4.999541321079486e-06, + "loss": 0.895, + "step": 69 + }, + { + "epoch": 0.03475238922675934, + "grad_norm": 0.21085272972883423, + "learning_rate": 4.9995224090564645e-06, + "loss": 0.8135, + "step": 70 + }, + { + "epoch": 0.035248851929998756, + "grad_norm": 0.20815593231474888, + "learning_rate": 4.9995031150215194e-06, + "loss": 0.8325, + "step": 71 + }, + { + "epoch": 0.03574531463323818, + "grad_norm": 0.2030397777365442, + "learning_rate": 4.9994834389776e-06, + "loss": 0.8028, + "step": 72 + }, + { + "epoch": 0.0362417773364776, + "grad_norm": 0.2093993861940969, + "learning_rate": 4.999463380927713e-06, + "loss": 0.8222, + "step": 73 + }, + { + "epoch": 0.03673824003971701, + "grad_norm": 0.20310206636138728, + "learning_rate": 4.9994429408749235e-06, + "loss": 0.8987, + "step": 74 + }, + { + "epoch": 0.037234702742956434, + "grad_norm": 0.19635518255581516, + "learning_rate": 4.999422118822357e-06, + "loss": 0.8257, + "step": 75 + }, + { + "epoch": 0.037731165446195855, + "grad_norm": 0.20613686945426812, + "learning_rate": 4.999400914773193e-06, + "loss": 0.8519, + "step": 76 + }, + { + "epoch": 0.038227628149435276, + "grad_norm": 0.2148909458983486, + "learning_rate": 4.999379328730676e-06, + "loss": 0.8431, + "step": 77 + }, + { + "epoch": 0.03872409085267469, + "grad_norm": 0.20131996731845622, + "learning_rate": 4.999357360698103e-06, + "loss": 0.8298, + "step": 78 + }, + { + "epoch": 0.03922055355591411, + "grad_norm": 0.19799106731473673, + "learning_rate": 4.999335010678831e-06, + "loss": 0.8438, + "step": 79 + }, + { + "epoch": 0.03971701625915353, + "grad_norm": 0.20461179643215718, + "learning_rate": 4.999312278676276e-06, + "loss": 0.8501, + "step": 80 + }, + { + "epoch": 0.04021347896239295, + "grad_norm": 0.19950036085187078, + "learning_rate": 4.999289164693913e-06, + "loss": 0.802, + "step": 81 + }, + { + "epoch": 0.04070994166563237, + "grad_norm": 0.20647834483552271, + "learning_rate": 4.999265668735274e-06, + "loss": 0.8004, + "step": 82 + }, + { + "epoch": 0.04120640436887179, + "grad_norm": 0.20535506020956135, + "learning_rate": 4.99924179080395e-06, + "loss": 0.8398, + "step": 83 + }, + { + "epoch": 0.04170286707211121, + "grad_norm": 0.20379519814931177, + "learning_rate": 4.999217530903592e-06, + "loss": 0.8493, + "step": 84 + }, + { + "epoch": 0.042199329775350625, + "grad_norm": 0.19569632518773314, + "learning_rate": 4.999192889037905e-06, + "loss": 0.7781, + "step": 85 + }, + { + "epoch": 0.042695792478590046, + "grad_norm": 0.2022603888395999, + "learning_rate": 4.999167865210656e-06, + "loss": 0.8202, + "step": 86 + }, + { + "epoch": 0.04319225518182947, + "grad_norm": 0.2061084529311429, + "learning_rate": 4.999142459425671e-06, + "loss": 0.8112, + "step": 87 + }, + { + "epoch": 0.04368871788506888, + "grad_norm": 0.20386460325471262, + "learning_rate": 4.999116671686832e-06, + "loss": 0.8314, + "step": 88 + }, + { + "epoch": 0.0441851805883083, + "grad_norm": 0.2001689787137877, + "learning_rate": 4.9990905019980795e-06, + "loss": 0.8493, + "step": 89 + }, + { + "epoch": 0.044681643291547724, + "grad_norm": 0.19704962097184103, + "learning_rate": 4.999063950363413e-06, + "loss": 0.8125, + "step": 90 + }, + { + "epoch": 0.045178105994787145, + "grad_norm": 0.20288945888095392, + "learning_rate": 4.999037016786891e-06, + "loss": 0.8529, + "step": 91 + }, + { + "epoch": 0.04567456869802656, + "grad_norm": 0.20036286496607184, + "learning_rate": 4.999009701272632e-06, + "loss": 0.818, + "step": 92 + }, + { + "epoch": 0.04617103140126598, + "grad_norm": 0.19540166087377867, + "learning_rate": 4.998982003824807e-06, + "loss": 0.8044, + "step": 93 + }, + { + "epoch": 0.0466674941045054, + "grad_norm": 0.19654628372196933, + "learning_rate": 4.998953924447652e-06, + "loss": 0.8276, + "step": 94 + }, + { + "epoch": 0.047163956807744815, + "grad_norm": 0.20737336312858767, + "learning_rate": 4.998925463145456e-06, + "loss": 0.792, + "step": 95 + }, + { + "epoch": 0.04766041951098424, + "grad_norm": 0.2059319669770031, + "learning_rate": 4.998896619922571e-06, + "loss": 0.8635, + "step": 96 + }, + { + "epoch": 0.04815688221422366, + "grad_norm": 0.20528654692184178, + "learning_rate": 4.9988673947834045e-06, + "loss": 0.8605, + "step": 97 + }, + { + "epoch": 0.04865334491746308, + "grad_norm": 0.20472541684877216, + "learning_rate": 4.998837787732422e-06, + "loss": 0.87, + "step": 98 + }, + { + "epoch": 0.04914980762070249, + "grad_norm": 0.20212979201560113, + "learning_rate": 4.998807798774151e-06, + "loss": 0.8236, + "step": 99 + }, + { + "epoch": 0.049646270323941914, + "grad_norm": 0.19931273766196522, + "learning_rate": 4.998777427913172e-06, + "loss": 0.8541, + "step": 100 + }, + { + "epoch": 0.050142733027181335, + "grad_norm": 0.2060934643730322, + "learning_rate": 4.998746675154129e-06, + "loss": 0.8765, + "step": 101 + }, + { + "epoch": 0.05063919573042075, + "grad_norm": 0.19905720321276965, + "learning_rate": 4.99871554050172e-06, + "loss": 0.8193, + "step": 102 + }, + { + "epoch": 0.05113565843366017, + "grad_norm": 0.19654055207721635, + "learning_rate": 4.998684023960705e-06, + "loss": 0.804, + "step": 103 + }, + { + "epoch": 0.05163212113689959, + "grad_norm": 0.1982198351814607, + "learning_rate": 4.998652125535901e-06, + "loss": 0.7891, + "step": 104 + }, + { + "epoch": 0.052128583840139006, + "grad_norm": 0.39942501787277024, + "learning_rate": 4.998619845232181e-06, + "loss": 0.8563, + "step": 105 + }, + { + "epoch": 0.05262504654337843, + "grad_norm": 0.1955611794394933, + "learning_rate": 4.998587183054481e-06, + "loss": 0.8287, + "step": 106 + }, + { + "epoch": 0.05312150924661785, + "grad_norm": 0.20389389297242477, + "learning_rate": 4.9985541390077915e-06, + "loss": 0.8284, + "step": 107 + }, + { + "epoch": 0.05361797194985727, + "grad_norm": 0.21045434439040575, + "learning_rate": 4.998520713097164e-06, + "loss": 0.8608, + "step": 108 + }, + { + "epoch": 0.054114434653096684, + "grad_norm": 0.20116892551504365, + "learning_rate": 4.998486905327704e-06, + "loss": 0.8359, + "step": 109 + }, + { + "epoch": 0.054610897356336105, + "grad_norm": 0.21230325768339284, + "learning_rate": 4.9984527157045825e-06, + "loss": 0.8218, + "step": 110 + }, + { + "epoch": 0.055107360059575526, + "grad_norm": 0.19875350435158548, + "learning_rate": 4.998418144233023e-06, + "loss": 0.8115, + "step": 111 + }, + { + "epoch": 0.05560382276281494, + "grad_norm": 0.19722218299068556, + "learning_rate": 4.998383190918309e-06, + "loss": 0.8409, + "step": 112 + }, + { + "epoch": 0.05610028546605436, + "grad_norm": 0.19442312486929395, + "learning_rate": 4.998347855765783e-06, + "loss": 0.7861, + "step": 113 + }, + { + "epoch": 0.05659674816929378, + "grad_norm": 0.19986199846633768, + "learning_rate": 4.998312138780845e-06, + "loss": 0.8412, + "step": 114 + }, + { + "epoch": 0.057093210872533204, + "grad_norm": 0.19565952095909392, + "learning_rate": 4.998276039968953e-06, + "loss": 0.8415, + "step": 115 + }, + { + "epoch": 0.05758967357577262, + "grad_norm": 0.19154584275348677, + "learning_rate": 4.998239559335627e-06, + "loss": 0.8631, + "step": 116 + }, + { + "epoch": 0.05808613627901204, + "grad_norm": 0.19665878881747706, + "learning_rate": 4.99820269688644e-06, + "loss": 0.8598, + "step": 117 + }, + { + "epoch": 0.05858259898225146, + "grad_norm": 0.2028047172295385, + "learning_rate": 4.998165452627025e-06, + "loss": 0.8683, + "step": 118 + }, + { + "epoch": 0.059079061685490875, + "grad_norm": 0.2037639166726157, + "learning_rate": 4.998127826563077e-06, + "loss": 0.8112, + "step": 119 + }, + { + "epoch": 0.059575524388730296, + "grad_norm": 0.1959085397655898, + "learning_rate": 4.998089818700344e-06, + "loss": 0.8463, + "step": 120 + }, + { + "epoch": 0.06007198709196972, + "grad_norm": 0.19554313432021866, + "learning_rate": 4.998051429044638e-06, + "loss": 0.814, + "step": 121 + }, + { + "epoch": 0.06056844979520914, + "grad_norm": 0.21327067999617655, + "learning_rate": 4.998012657601823e-06, + "loss": 0.8369, + "step": 122 + }, + { + "epoch": 0.06106491249844855, + "grad_norm": 0.19482066937524098, + "learning_rate": 4.997973504377826e-06, + "loss": 0.8503, + "step": 123 + }, + { + "epoch": 0.06156137520168797, + "grad_norm": 0.1921263735287261, + "learning_rate": 4.99793396937863e-06, + "loss": 0.8235, + "step": 124 + }, + { + "epoch": 0.062057837904927395, + "grad_norm": 0.1912317820741509, + "learning_rate": 4.99789405261028e-06, + "loss": 0.7959, + "step": 125 + }, + { + "epoch": 0.06255430060816682, + "grad_norm": 0.1930012329719747, + "learning_rate": 4.997853754078873e-06, + "loss": 0.8312, + "step": 126 + }, + { + "epoch": 0.06305076331140623, + "grad_norm": 0.1854436986074747, + "learning_rate": 4.997813073790571e-06, + "loss": 0.7915, + "step": 127 + }, + { + "epoch": 0.06354722601464564, + "grad_norm": 0.19266710662549197, + "learning_rate": 4.997772011751589e-06, + "loss": 0.8953, + "step": 128 + }, + { + "epoch": 0.06404368871788507, + "grad_norm": 0.18944442325778477, + "learning_rate": 4.9977305679682044e-06, + "loss": 0.7999, + "step": 129 + }, + { + "epoch": 0.06454015142112449, + "grad_norm": 0.19586350088433074, + "learning_rate": 4.99768874244675e-06, + "loss": 0.8091, + "step": 130 + }, + { + "epoch": 0.0650366141243639, + "grad_norm": 0.19084476465979933, + "learning_rate": 4.997646535193618e-06, + "loss": 0.8733, + "step": 131 + }, + { + "epoch": 0.06553307682760333, + "grad_norm": 0.1884614630341233, + "learning_rate": 4.997603946215262e-06, + "loss": 0.8106, + "step": 132 + }, + { + "epoch": 0.06602953953084274, + "grad_norm": 0.1932654239334205, + "learning_rate": 4.9975609755181875e-06, + "loss": 0.9149, + "step": 133 + }, + { + "epoch": 0.06652600223408217, + "grad_norm": 0.1872082345438042, + "learning_rate": 4.997517623108964e-06, + "loss": 0.7794, + "step": 134 + }, + { + "epoch": 0.06702246493732159, + "grad_norm": 0.1873912754282765, + "learning_rate": 4.997473888994215e-06, + "loss": 0.8577, + "step": 135 + }, + { + "epoch": 0.067518927640561, + "grad_norm": 0.18175316620522922, + "learning_rate": 4.997429773180627e-06, + "loss": 0.8226, + "step": 136 + }, + { + "epoch": 0.06801539034380043, + "grad_norm": 0.17684485991325427, + "learning_rate": 4.997385275674942e-06, + "loss": 0.7802, + "step": 137 + }, + { + "epoch": 0.06851185304703984, + "grad_norm": 0.1824706517270434, + "learning_rate": 4.99734039648396e-06, + "loss": 0.7985, + "step": 138 + }, + { + "epoch": 0.06900831575027926, + "grad_norm": 0.1799259476989057, + "learning_rate": 4.997295135614539e-06, + "loss": 0.8036, + "step": 139 + }, + { + "epoch": 0.06950477845351868, + "grad_norm": 0.17823278052011338, + "learning_rate": 4.997249493073598e-06, + "loss": 0.8167, + "step": 140 + }, + { + "epoch": 0.0700012411567581, + "grad_norm": 0.18252365188173084, + "learning_rate": 4.997203468868113e-06, + "loss": 0.8295, + "step": 141 + }, + { + "epoch": 0.07049770385999751, + "grad_norm": 0.19736868569754917, + "learning_rate": 4.997157063005117e-06, + "loss": 0.8908, + "step": 142 + }, + { + "epoch": 0.07099416656323694, + "grad_norm": 0.17947955446070774, + "learning_rate": 4.997110275491702e-06, + "loss": 0.8115, + "step": 143 + }, + { + "epoch": 0.07149062926647635, + "grad_norm": 0.19402407753710812, + "learning_rate": 4.997063106335021e-06, + "loss": 0.8454, + "step": 144 + }, + { + "epoch": 0.07198709196971577, + "grad_norm": 0.1887305860550766, + "learning_rate": 4.99701555554228e-06, + "loss": 0.8945, + "step": 145 + }, + { + "epoch": 0.0724835546729552, + "grad_norm": 0.17558377313650073, + "learning_rate": 4.99696762312075e-06, + "loss": 0.8009, + "step": 146 + }, + { + "epoch": 0.07298001737619461, + "grad_norm": 0.1718469916803291, + "learning_rate": 4.9969193090777526e-06, + "loss": 0.8143, + "step": 147 + }, + { + "epoch": 0.07347648007943403, + "grad_norm": 0.17617757928734015, + "learning_rate": 4.996870613420675e-06, + "loss": 0.8217, + "step": 148 + }, + { + "epoch": 0.07397294278267345, + "grad_norm": 0.1706971696924589, + "learning_rate": 4.996821536156958e-06, + "loss": 0.7952, + "step": 149 + }, + { + "epoch": 0.07446940548591287, + "grad_norm": 0.18103564748682863, + "learning_rate": 4.996772077294103e-06, + "loss": 0.8214, + "step": 150 + }, + { + "epoch": 0.0749658681891523, + "grad_norm": 0.18622213669793447, + "learning_rate": 4.9967222368396686e-06, + "loss": 0.8843, + "step": 151 + }, + { + "epoch": 0.07546233089239171, + "grad_norm": 0.1707461817930418, + "learning_rate": 4.9966720148012714e-06, + "loss": 0.7733, + "step": 152 + }, + { + "epoch": 0.07595879359563112, + "grad_norm": 0.15640519366869854, + "learning_rate": 4.996621411186589e-06, + "loss": 0.7887, + "step": 153 + }, + { + "epoch": 0.07645525629887055, + "grad_norm": 0.1624602295916605, + "learning_rate": 4.996570426003354e-06, + "loss": 0.8471, + "step": 154 + }, + { + "epoch": 0.07695171900210997, + "grad_norm": 0.18290118651942194, + "learning_rate": 4.996519059259358e-06, + "loss": 0.8467, + "step": 155 + }, + { + "epoch": 0.07744818170534938, + "grad_norm": 0.15912757799549243, + "learning_rate": 4.996467310962453e-06, + "loss": 0.8005, + "step": 156 + }, + { + "epoch": 0.07794464440858881, + "grad_norm": 0.1613389987950997, + "learning_rate": 4.996415181120547e-06, + "loss": 0.7892, + "step": 157 + }, + { + "epoch": 0.07844110711182822, + "grad_norm": 0.1570677524283997, + "learning_rate": 4.996362669741609e-06, + "loss": 0.8286, + "step": 158 + }, + { + "epoch": 0.07893756981506764, + "grad_norm": 0.16537699681870446, + "learning_rate": 4.996309776833661e-06, + "loss": 0.8273, + "step": 159 + }, + { + "epoch": 0.07943403251830707, + "grad_norm": 0.17043045232874193, + "learning_rate": 4.99625650240479e-06, + "loss": 0.7966, + "step": 160 + }, + { + "epoch": 0.07993049522154648, + "grad_norm": 0.15656286097485822, + "learning_rate": 4.9962028464631365e-06, + "loss": 0.8332, + "step": 161 + }, + { + "epoch": 0.0804269579247859, + "grad_norm": 0.1776462779512006, + "learning_rate": 4.9961488090169015e-06, + "loss": 0.819, + "step": 162 + }, + { + "epoch": 0.08092342062802532, + "grad_norm": 0.15898779205316405, + "learning_rate": 4.996094390074345e-06, + "loss": 0.8326, + "step": 163 + }, + { + "epoch": 0.08141988333126474, + "grad_norm": 0.16851878425262465, + "learning_rate": 4.996039589643782e-06, + "loss": 0.8636, + "step": 164 + }, + { + "epoch": 0.08191634603450416, + "grad_norm": 0.14941991731340487, + "learning_rate": 4.995984407733588e-06, + "loss": 0.7804, + "step": 165 + }, + { + "epoch": 0.08241280873774358, + "grad_norm": 0.16021285793408913, + "learning_rate": 4.995928844352198e-06, + "loss": 0.8614, + "step": 166 + }, + { + "epoch": 0.08290927144098299, + "grad_norm": 0.1592609781313818, + "learning_rate": 4.995872899508103e-06, + "loss": 0.8355, + "step": 167 + }, + { + "epoch": 0.08340573414422242, + "grad_norm": 0.14893046373707292, + "learning_rate": 4.995816573209854e-06, + "loss": 0.8146, + "step": 168 + }, + { + "epoch": 0.08390219684746184, + "grad_norm": 0.14994377964296016, + "learning_rate": 4.995759865466059e-06, + "loss": 0.8073, + "step": 169 + }, + { + "epoch": 0.08439865955070125, + "grad_norm": 0.15057311866648648, + "learning_rate": 4.995702776285385e-06, + "loss": 0.8195, + "step": 170 + }, + { + "epoch": 0.08489512225394068, + "grad_norm": 0.1462021842716386, + "learning_rate": 4.995645305676558e-06, + "loss": 0.8054, + "step": 171 + }, + { + "epoch": 0.08539158495718009, + "grad_norm": 0.15612484218678557, + "learning_rate": 4.995587453648359e-06, + "loss": 0.8538, + "step": 172 + }, + { + "epoch": 0.0858880476604195, + "grad_norm": 0.15826299784330564, + "learning_rate": 4.995529220209633e-06, + "loss": 0.7967, + "step": 173 + }, + { + "epoch": 0.08638451036365893, + "grad_norm": 0.14934453023970579, + "learning_rate": 4.9954706053692766e-06, + "loss": 0.8191, + "step": 174 + }, + { + "epoch": 0.08688097306689835, + "grad_norm": 0.14487398595953221, + "learning_rate": 4.995411609136252e-06, + "loss": 0.7913, + "step": 175 + }, + { + "epoch": 0.08737743577013776, + "grad_norm": 0.2361253677063094, + "learning_rate": 4.995352231519572e-06, + "loss": 0.8506, + "step": 176 + }, + { + "epoch": 0.08787389847337719, + "grad_norm": 0.1542240974761206, + "learning_rate": 4.995292472528315e-06, + "loss": 0.8198, + "step": 177 + }, + { + "epoch": 0.0883703611766166, + "grad_norm": 0.1533907925673139, + "learning_rate": 4.9952323321716114e-06, + "loss": 0.8095, + "step": 178 + }, + { + "epoch": 0.08886682387985602, + "grad_norm": 0.14883299168355707, + "learning_rate": 4.995171810458654e-06, + "loss": 0.8492, + "step": 179 + }, + { + "epoch": 0.08936328658309545, + "grad_norm": 0.14717378708658913, + "learning_rate": 4.995110907398693e-06, + "loss": 0.7989, + "step": 180 + }, + { + "epoch": 0.08985974928633486, + "grad_norm": 0.1432697352230233, + "learning_rate": 4.995049623001036e-06, + "loss": 0.8112, + "step": 181 + }, + { + "epoch": 0.09035621198957429, + "grad_norm": 0.14420342048963478, + "learning_rate": 4.994987957275048e-06, + "loss": 0.8113, + "step": 182 + }, + { + "epoch": 0.0908526746928137, + "grad_norm": 0.1484614971553416, + "learning_rate": 4.994925910230156e-06, + "loss": 0.7928, + "step": 183 + }, + { + "epoch": 0.09134913739605312, + "grad_norm": 0.14303299403206174, + "learning_rate": 4.994863481875842e-06, + "loss": 0.821, + "step": 184 + }, + { + "epoch": 0.09184560009929255, + "grad_norm": 0.14990402029979874, + "learning_rate": 4.9948006722216456e-06, + "loss": 0.7919, + "step": 185 + }, + { + "epoch": 0.09234206280253196, + "grad_norm": 0.15676689121485238, + "learning_rate": 4.9947374812771675e-06, + "loss": 0.8475, + "step": 186 + }, + { + "epoch": 0.09283852550577137, + "grad_norm": 0.14836071218925942, + "learning_rate": 4.994673909052067e-06, + "loss": 0.81, + "step": 187 + }, + { + "epoch": 0.0933349882090108, + "grad_norm": 0.14051092402883178, + "learning_rate": 4.994609955556057e-06, + "loss": 0.7896, + "step": 188 + }, + { + "epoch": 0.09383145091225022, + "grad_norm": 0.13964829094096, + "learning_rate": 4.994545620798914e-06, + "loss": 0.8095, + "step": 189 + }, + { + "epoch": 0.09432791361548963, + "grad_norm": 0.13916395003469767, + "learning_rate": 4.994480904790469e-06, + "loss": 0.7818, + "step": 190 + }, + { + "epoch": 0.09482437631872906, + "grad_norm": 0.13992841292678654, + "learning_rate": 4.994415807540616e-06, + "loss": 0.8511, + "step": 191 + }, + { + "epoch": 0.09532083902196847, + "grad_norm": 0.1382517091115567, + "learning_rate": 4.9943503290593e-06, + "loss": 0.804, + "step": 192 + }, + { + "epoch": 0.09581730172520789, + "grad_norm": 0.13828160588381802, + "learning_rate": 4.99428446935653e-06, + "loss": 0.7945, + "step": 193 + }, + { + "epoch": 0.09631376442844732, + "grad_norm": 0.14059132138612335, + "learning_rate": 4.9942182284423715e-06, + "loss": 0.8127, + "step": 194 + }, + { + "epoch": 0.09681022713168673, + "grad_norm": 0.14342500046524775, + "learning_rate": 4.994151606326949e-06, + "loss": 0.7869, + "step": 195 + }, + { + "epoch": 0.09730668983492616, + "grad_norm": 0.15328257822811497, + "learning_rate": 4.994084603020444e-06, + "loss": 0.7969, + "step": 196 + }, + { + "epoch": 0.09780315253816557, + "grad_norm": 0.15036264302541547, + "learning_rate": 4.9940172185330975e-06, + "loss": 0.8021, + "step": 197 + }, + { + "epoch": 0.09829961524140499, + "grad_norm": 0.13631880610103, + "learning_rate": 4.993949452875208e-06, + "loss": 0.8062, + "step": 198 + }, + { + "epoch": 0.09879607794464441, + "grad_norm": 0.1370149045617051, + "learning_rate": 4.993881306057131e-06, + "loss": 0.7964, + "step": 199 + }, + { + "epoch": 0.09929254064788383, + "grad_norm": 0.14003683139998516, + "learning_rate": 4.993812778089283e-06, + "loss": 0.7829, + "step": 200 + }, + { + "epoch": 0.09978900335112324, + "grad_norm": 0.13364321729142267, + "learning_rate": 4.993743868982137e-06, + "loss": 0.7435, + "step": 201 + }, + { + "epoch": 0.10028546605436267, + "grad_norm": 0.1432644408384139, + "learning_rate": 4.993674578746225e-06, + "loss": 0.8404, + "step": 202 + }, + { + "epoch": 0.10078192875760209, + "grad_norm": 0.14022940831071956, + "learning_rate": 4.9936049073921365e-06, + "loss": 0.7916, + "step": 203 + }, + { + "epoch": 0.1012783914608415, + "grad_norm": 0.1431480895121353, + "learning_rate": 4.99353485493052e-06, + "loss": 0.7825, + "step": 204 + }, + { + "epoch": 0.10177485416408093, + "grad_norm": 0.1537315485269007, + "learning_rate": 4.993464421372081e-06, + "loss": 0.8018, + "step": 205 + }, + { + "epoch": 0.10227131686732034, + "grad_norm": 0.13149782403488902, + "learning_rate": 4.993393606727587e-06, + "loss": 0.748, + "step": 206 + }, + { + "epoch": 0.10276777957055976, + "grad_norm": 0.13795599982306228, + "learning_rate": 4.993322411007857e-06, + "loss": 0.7745, + "step": 207 + }, + { + "epoch": 0.10326424227379918, + "grad_norm": 0.14263948789480718, + "learning_rate": 4.993250834223774e-06, + "loss": 0.8286, + "step": 208 + }, + { + "epoch": 0.1037607049770386, + "grad_norm": 0.1322915151191568, + "learning_rate": 4.9931788763862774e-06, + "loss": 0.7765, + "step": 209 + }, + { + "epoch": 0.10425716768027801, + "grad_norm": 0.1411982251684298, + "learning_rate": 4.993106537506365e-06, + "loss": 0.7978, + "step": 210 + }, + { + "epoch": 0.10475363038351744, + "grad_norm": 0.14287120534895575, + "learning_rate": 4.993033817595092e-06, + "loss": 0.8119, + "step": 211 + }, + { + "epoch": 0.10525009308675685, + "grad_norm": 0.1457313365849308, + "learning_rate": 4.992960716663572e-06, + "loss": 0.8391, + "step": 212 + }, + { + "epoch": 0.10574655578999628, + "grad_norm": 0.1374862675337484, + "learning_rate": 4.992887234722978e-06, + "loss": 0.8284, + "step": 213 + }, + { + "epoch": 0.1062430184932357, + "grad_norm": 0.13532768487001728, + "learning_rate": 4.992813371784542e-06, + "loss": 0.7933, + "step": 214 + }, + { + "epoch": 0.10673948119647511, + "grad_norm": 0.14619055294203817, + "learning_rate": 4.99273912785955e-06, + "loss": 0.8232, + "step": 215 + }, + { + "epoch": 0.10723594389971454, + "grad_norm": 0.13334770368704213, + "learning_rate": 4.992664502959351e-06, + "loss": 0.7987, + "step": 216 + }, + { + "epoch": 0.10773240660295395, + "grad_norm": 0.1386527129272213, + "learning_rate": 4.99258949709535e-06, + "loss": 0.8385, + "step": 217 + }, + { + "epoch": 0.10822886930619337, + "grad_norm": 0.13307297628479753, + "learning_rate": 4.99251411027901e-06, + "loss": 0.7945, + "step": 218 + }, + { + "epoch": 0.1087253320094328, + "grad_norm": 0.1405179423764166, + "learning_rate": 4.992438342521851e-06, + "loss": 0.8591, + "step": 219 + }, + { + "epoch": 0.10922179471267221, + "grad_norm": 0.14156609276835416, + "learning_rate": 4.992362193835456e-06, + "loss": 0.8333, + "step": 220 + }, + { + "epoch": 0.10971825741591162, + "grad_norm": 0.135514778316713, + "learning_rate": 4.992285664231462e-06, + "loss": 0.7784, + "step": 221 + }, + { + "epoch": 0.11021472011915105, + "grad_norm": 0.13722582342328407, + "learning_rate": 4.992208753721564e-06, + "loss": 0.8339, + "step": 222 + }, + { + "epoch": 0.11071118282239047, + "grad_norm": 0.1354247771990746, + "learning_rate": 4.992131462317518e-06, + "loss": 0.828, + "step": 223 + }, + { + "epoch": 0.11120764552562988, + "grad_norm": 0.13994897440630472, + "learning_rate": 4.992053790031136e-06, + "loss": 0.8453, + "step": 224 + }, + { + "epoch": 0.11170410822886931, + "grad_norm": 0.13384734719071256, + "learning_rate": 4.9919757368742895e-06, + "loss": 0.8066, + "step": 225 + }, + { + "epoch": 0.11220057093210872, + "grad_norm": 0.13127250934166673, + "learning_rate": 4.991897302858908e-06, + "loss": 0.7916, + "step": 226 + }, + { + "epoch": 0.11269703363534815, + "grad_norm": 0.14039582163447972, + "learning_rate": 4.9918184879969765e-06, + "loss": 0.8656, + "step": 227 + }, + { + "epoch": 0.11319349633858757, + "grad_norm": 0.1470635342507985, + "learning_rate": 4.991739292300544e-06, + "loss": 0.8278, + "step": 228 + }, + { + "epoch": 0.11368995904182698, + "grad_norm": 0.14174626233120474, + "learning_rate": 4.991659715781712e-06, + "loss": 0.823, + "step": 229 + }, + { + "epoch": 0.11418642174506641, + "grad_norm": 0.13862626411027507, + "learning_rate": 4.991579758452644e-06, + "loss": 0.8278, + "step": 230 + }, + { + "epoch": 0.11468288444830582, + "grad_norm": 0.13959224442195398, + "learning_rate": 4.991499420325558e-06, + "loss": 0.8252, + "step": 231 + }, + { + "epoch": 0.11517934715154524, + "grad_norm": 0.13884826767573266, + "learning_rate": 4.991418701412735e-06, + "loss": 0.8404, + "step": 232 + }, + { + "epoch": 0.11567580985478466, + "grad_norm": 0.1389057416612581, + "learning_rate": 4.991337601726509e-06, + "loss": 0.8106, + "step": 233 + }, + { + "epoch": 0.11617227255802408, + "grad_norm": 0.13638785848050972, + "learning_rate": 4.991256121279277e-06, + "loss": 0.8069, + "step": 234 + }, + { + "epoch": 0.11666873526126349, + "grad_norm": 0.1338672028203034, + "learning_rate": 4.991174260083491e-06, + "loss": 0.8019, + "step": 235 + }, + { + "epoch": 0.11716519796450292, + "grad_norm": 0.14097764226298978, + "learning_rate": 4.991092018151663e-06, + "loss": 0.846, + "step": 236 + }, + { + "epoch": 0.11766166066774233, + "grad_norm": 0.13455102270597805, + "learning_rate": 4.991009395496361e-06, + "loss": 0.8056, + "step": 237 + }, + { + "epoch": 0.11815812337098175, + "grad_norm": 0.14555871706504694, + "learning_rate": 4.9909263921302135e-06, + "loss": 0.8075, + "step": 238 + }, + { + "epoch": 0.11865458607422118, + "grad_norm": 0.14461201544386468, + "learning_rate": 4.990843008065905e-06, + "loss": 0.8704, + "step": 239 + }, + { + "epoch": 0.11915104877746059, + "grad_norm": 0.1419964925817203, + "learning_rate": 4.9907592433161815e-06, + "loss": 0.792, + "step": 240 + }, + { + "epoch": 0.1196475114807, + "grad_norm": 0.13759060310549415, + "learning_rate": 4.990675097893843e-06, + "loss": 0.7836, + "step": 241 + }, + { + "epoch": 0.12014397418393943, + "grad_norm": 0.14605868485481172, + "learning_rate": 4.9905905718117505e-06, + "loss": 0.836, + "step": 242 + }, + { + "epoch": 0.12064043688717885, + "grad_norm": 0.1373934253609785, + "learning_rate": 4.990505665082824e-06, + "loss": 0.7723, + "step": 243 + }, + { + "epoch": 0.12113689959041828, + "grad_norm": 0.13865838334876657, + "learning_rate": 4.9904203777200375e-06, + "loss": 0.7898, + "step": 244 + }, + { + "epoch": 0.12163336229365769, + "grad_norm": 0.1390399518577959, + "learning_rate": 4.990334709736428e-06, + "loss": 0.7869, + "step": 245 + }, + { + "epoch": 0.1221298249968971, + "grad_norm": 0.14077068851100413, + "learning_rate": 4.990248661145087e-06, + "loss": 0.8108, + "step": 246 + }, + { + "epoch": 0.12262628770013653, + "grad_norm": 0.1337469353425711, + "learning_rate": 4.9901622319591665e-06, + "loss": 0.8248, + "step": 247 + }, + { + "epoch": 0.12312275040337595, + "grad_norm": 0.13685347748788432, + "learning_rate": 4.9900754221918766e-06, + "loss": 0.8252, + "step": 248 + }, + { + "epoch": 0.12361921310661536, + "grad_norm": 0.13611377917936257, + "learning_rate": 4.989988231856483e-06, + "loss": 0.7827, + "step": 249 + }, + { + "epoch": 0.12411567580985479, + "grad_norm": 0.13974150757425385, + "learning_rate": 4.989900660966312e-06, + "loss": 0.8795, + "step": 250 + }, + { + "epoch": 0.1246121385130942, + "grad_norm": 0.13278222798977063, + "learning_rate": 4.9898127095347475e-06, + "loss": 0.8178, + "step": 251 + }, + { + "epoch": 0.12510860121633363, + "grad_norm": 0.13505628901465874, + "learning_rate": 4.989724377575231e-06, + "loss": 0.7909, + "step": 252 + }, + { + "epoch": 0.12560506391957305, + "grad_norm": 0.1366672871248607, + "learning_rate": 4.989635665101263e-06, + "loss": 0.8262, + "step": 253 + }, + { + "epoch": 0.12610152662281246, + "grad_norm": 0.14405211149440367, + "learning_rate": 4.989546572126402e-06, + "loss": 0.8606, + "step": 254 + }, + { + "epoch": 0.12659798932605187, + "grad_norm": 0.14227617759563377, + "learning_rate": 4.9894570986642655e-06, + "loss": 0.8379, + "step": 255 + }, + { + "epoch": 0.1270944520292913, + "grad_norm": 0.1469637596335691, + "learning_rate": 4.989367244728526e-06, + "loss": 0.8578, + "step": 256 + }, + { + "epoch": 0.12759091473253073, + "grad_norm": 0.13275558591812953, + "learning_rate": 4.989277010332917e-06, + "loss": 0.7559, + "step": 257 + }, + { + "epoch": 0.12808737743577014, + "grad_norm": 0.14791426685196862, + "learning_rate": 4.989186395491229e-06, + "loss": 0.844, + "step": 258 + }, + { + "epoch": 0.12858384013900956, + "grad_norm": 0.13244945426039428, + "learning_rate": 4.989095400217312e-06, + "loss": 0.8062, + "step": 259 + }, + { + "epoch": 0.12908030284224897, + "grad_norm": 0.13778000814954997, + "learning_rate": 4.9890040245250725e-06, + "loss": 0.8399, + "step": 260 + }, + { + "epoch": 0.1295767655454884, + "grad_norm": 0.143797201748363, + "learning_rate": 4.9889122684284765e-06, + "loss": 0.7978, + "step": 261 + }, + { + "epoch": 0.1300732282487278, + "grad_norm": 0.13685020353927613, + "learning_rate": 4.988820131941547e-06, + "loss": 0.7712, + "step": 262 + }, + { + "epoch": 0.13056969095196724, + "grad_norm": 0.13664331543020586, + "learning_rate": 4.988727615078365e-06, + "loss": 0.7458, + "step": 263 + }, + { + "epoch": 0.13106615365520666, + "grad_norm": 0.1499586905570859, + "learning_rate": 4.988634717853071e-06, + "loss": 0.8261, + "step": 264 + }, + { + "epoch": 0.13156261635844607, + "grad_norm": 0.13982088441840343, + "learning_rate": 4.988541440279862e-06, + "loss": 0.8105, + "step": 265 + }, + { + "epoch": 0.13205907906168549, + "grad_norm": 0.13605433414869017, + "learning_rate": 4.988447782372996e-06, + "loss": 0.7822, + "step": 266 + }, + { + "epoch": 0.1325555417649249, + "grad_norm": 0.13252713875584649, + "learning_rate": 4.988353744146784e-06, + "loss": 0.7532, + "step": 267 + }, + { + "epoch": 0.13305200446816434, + "grad_norm": 0.140502196927792, + "learning_rate": 4.988259325615601e-06, + "loss": 0.8733, + "step": 268 + }, + { + "epoch": 0.13354846717140376, + "grad_norm": 0.13822527446456542, + "learning_rate": 4.988164526793877e-06, + "loss": 0.7752, + "step": 269 + }, + { + "epoch": 0.13404492987464317, + "grad_norm": 0.13966558823867067, + "learning_rate": 4.988069347696098e-06, + "loss": 0.8037, + "step": 270 + }, + { + "epoch": 0.13454139257788258, + "grad_norm": 0.14070793181761732, + "learning_rate": 4.987973788336814e-06, + "loss": 0.801, + "step": 271 + }, + { + "epoch": 0.135037855281122, + "grad_norm": 0.13548015995353577, + "learning_rate": 4.987877848730627e-06, + "loss": 0.7917, + "step": 272 + }, + { + "epoch": 0.1355343179843614, + "grad_norm": 0.13914099615548198, + "learning_rate": 4.987781528892201e-06, + "loss": 0.7604, + "step": 273 + }, + { + "epoch": 0.13603078068760086, + "grad_norm": 0.1439655316233177, + "learning_rate": 4.987684828836257e-06, + "loss": 0.8128, + "step": 274 + }, + { + "epoch": 0.13652724339084027, + "grad_norm": 0.13334244239454465, + "learning_rate": 4.987587748577574e-06, + "loss": 0.7762, + "step": 275 + }, + { + "epoch": 0.13702370609407968, + "grad_norm": 0.13033286848134193, + "learning_rate": 4.98749028813099e-06, + "loss": 0.794, + "step": 276 + }, + { + "epoch": 0.1375201687973191, + "grad_norm": 0.1263283049211498, + "learning_rate": 4.987392447511398e-06, + "loss": 0.7245, + "step": 277 + }, + { + "epoch": 0.1380166315005585, + "grad_norm": 0.1346177373563915, + "learning_rate": 4.987294226733753e-06, + "loss": 0.825, + "step": 278 + }, + { + "epoch": 0.13851309420379793, + "grad_norm": 0.13797046885885128, + "learning_rate": 4.987195625813066e-06, + "loss": 0.7799, + "step": 279 + }, + { + "epoch": 0.13900955690703737, + "grad_norm": 0.13671450378929442, + "learning_rate": 4.987096644764407e-06, + "loss": 0.8186, + "step": 280 + }, + { + "epoch": 0.13950601961027678, + "grad_norm": 0.13733243307369106, + "learning_rate": 4.986997283602903e-06, + "loss": 0.8646, + "step": 281 + }, + { + "epoch": 0.1400024823135162, + "grad_norm": 0.13818658844648463, + "learning_rate": 4.986897542343741e-06, + "loss": 0.8053, + "step": 282 + }, + { + "epoch": 0.1404989450167556, + "grad_norm": 0.1376323705656438, + "learning_rate": 4.9867974210021634e-06, + "loss": 0.8298, + "step": 283 + }, + { + "epoch": 0.14099540771999503, + "grad_norm": 0.13132512407016342, + "learning_rate": 4.986696919593473e-06, + "loss": 0.7656, + "step": 284 + }, + { + "epoch": 0.14149187042323447, + "grad_norm": 0.135693807444876, + "learning_rate": 4.986596038133029e-06, + "loss": 0.8259, + "step": 285 + }, + { + "epoch": 0.14198833312647388, + "grad_norm": 0.14566445753533755, + "learning_rate": 4.986494776636251e-06, + "loss": 0.8459, + "step": 286 + }, + { + "epoch": 0.1424847958297133, + "grad_norm": 0.14082471675828692, + "learning_rate": 4.986393135118614e-06, + "loss": 0.8634, + "step": 287 + }, + { + "epoch": 0.1429812585329527, + "grad_norm": 0.13675780982750446, + "learning_rate": 4.9862911135956525e-06, + "loss": 0.7724, + "step": 288 + }, + { + "epoch": 0.14347772123619212, + "grad_norm": 0.1387994309453966, + "learning_rate": 4.986188712082959e-06, + "loss": 0.7802, + "step": 289 + }, + { + "epoch": 0.14397418393943154, + "grad_norm": 0.13354684023511523, + "learning_rate": 4.986085930596184e-06, + "loss": 0.8134, + "step": 290 + }, + { + "epoch": 0.14447064664267098, + "grad_norm": 0.14594678291314142, + "learning_rate": 4.985982769151035e-06, + "loss": 0.799, + "step": 291 + }, + { + "epoch": 0.1449671093459104, + "grad_norm": 0.14334448087535562, + "learning_rate": 4.985879227763281e-06, + "loss": 0.8697, + "step": 292 + }, + { + "epoch": 0.1454635720491498, + "grad_norm": 0.13442178316157122, + "learning_rate": 4.985775306448743e-06, + "loss": 0.794, + "step": 293 + }, + { + "epoch": 0.14596003475238922, + "grad_norm": 0.14958434833387912, + "learning_rate": 4.985671005223308e-06, + "loss": 0.8104, + "step": 294 + }, + { + "epoch": 0.14645649745562864, + "grad_norm": 0.14089328532302936, + "learning_rate": 4.985566324102913e-06, + "loss": 0.8334, + "step": 295 + }, + { + "epoch": 0.14695296015886805, + "grad_norm": 0.14930089897209584, + "learning_rate": 4.98546126310356e-06, + "loss": 0.8472, + "step": 296 + }, + { + "epoch": 0.1474494228621075, + "grad_norm": 0.13838634635947197, + "learning_rate": 4.9853558222413025e-06, + "loss": 0.8081, + "step": 297 + }, + { + "epoch": 0.1479458855653469, + "grad_norm": 0.13657833547456122, + "learning_rate": 4.985250001532258e-06, + "loss": 0.8046, + "step": 298 + }, + { + "epoch": 0.14844234826858632, + "grad_norm": 0.13416933719640858, + "learning_rate": 4.9851438009925985e-06, + "loss": 0.7718, + "step": 299 + }, + { + "epoch": 0.14893881097182574, + "grad_norm": 0.13305701512405635, + "learning_rate": 4.985037220638556e-06, + "loss": 0.7332, + "step": 300 + }, + { + "epoch": 0.14943527367506515, + "grad_norm": 0.12255084911798615, + "learning_rate": 4.9849302604864176e-06, + "loss": 0.7301, + "step": 301 + }, + { + "epoch": 0.1499317363783046, + "grad_norm": 0.1349582502588026, + "learning_rate": 4.9848229205525325e-06, + "loss": 0.8268, + "step": 302 + }, + { + "epoch": 0.150428199081544, + "grad_norm": 0.13941501409088294, + "learning_rate": 4.984715200853305e-06, + "loss": 0.8187, + "step": 303 + }, + { + "epoch": 0.15092466178478342, + "grad_norm": 0.13492743679489108, + "learning_rate": 4.9846071014051985e-06, + "loss": 0.7948, + "step": 304 + }, + { + "epoch": 0.15142112448802283, + "grad_norm": 0.1652920462764793, + "learning_rate": 4.984498622224734e-06, + "loss": 0.834, + "step": 305 + }, + { + "epoch": 0.15191758719126225, + "grad_norm": 0.13639513644830636, + "learning_rate": 4.984389763328491e-06, + "loss": 0.8263, + "step": 306 + }, + { + "epoch": 0.15241404989450166, + "grad_norm": 0.13894535392298077, + "learning_rate": 4.984280524733107e-06, + "loss": 0.8399, + "step": 307 + }, + { + "epoch": 0.1529105125977411, + "grad_norm": 0.1427172968985807, + "learning_rate": 4.984170906455277e-06, + "loss": 0.8176, + "step": 308 + }, + { + "epoch": 0.15340697530098052, + "grad_norm": 0.13916573482217912, + "learning_rate": 4.984060908511755e-06, + "loss": 0.84, + "step": 309 + }, + { + "epoch": 0.15390343800421993, + "grad_norm": 0.13640696194757118, + "learning_rate": 4.983950530919352e-06, + "loss": 0.8378, + "step": 310 + }, + { + "epoch": 0.15439990070745935, + "grad_norm": 0.1269776959706502, + "learning_rate": 4.983839773694937e-06, + "loss": 0.7752, + "step": 311 + }, + { + "epoch": 0.15489636341069876, + "grad_norm": 0.13735768715592372, + "learning_rate": 4.983728636855438e-06, + "loss": 0.8528, + "step": 312 + }, + { + "epoch": 0.1553928261139382, + "grad_norm": 0.13601796887750978, + "learning_rate": 4.983617120417841e-06, + "loss": 0.7813, + "step": 313 + }, + { + "epoch": 0.15588928881717762, + "grad_norm": 0.1394868538656493, + "learning_rate": 4.983505224399188e-06, + "loss": 0.8121, + "step": 314 + }, + { + "epoch": 0.15638575152041703, + "grad_norm": 0.1393241198294074, + "learning_rate": 4.983392948816582e-06, + "loss": 0.8167, + "step": 315 + }, + { + "epoch": 0.15688221422365645, + "grad_norm": 0.13601273115897386, + "learning_rate": 4.9832802936871815e-06, + "loss": 0.815, + "step": 316 + }, + { + "epoch": 0.15737867692689586, + "grad_norm": 0.13309082027141247, + "learning_rate": 4.983167259028205e-06, + "loss": 0.7845, + "step": 317 + }, + { + "epoch": 0.15787513963013527, + "grad_norm": 0.1328710618339079, + "learning_rate": 4.983053844856928e-06, + "loss": 0.837, + "step": 318 + }, + { + "epoch": 0.15837160233337472, + "grad_norm": 0.1382512821366624, + "learning_rate": 4.982940051190682e-06, + "loss": 0.7642, + "step": 319 + }, + { + "epoch": 0.15886806503661413, + "grad_norm": 0.13347990081206224, + "learning_rate": 4.982825878046862e-06, + "loss": 0.7786, + "step": 320 + }, + { + "epoch": 0.15936452773985355, + "grad_norm": 0.1350212390870091, + "learning_rate": 4.9827113254429144e-06, + "loss": 0.7822, + "step": 321 + }, + { + "epoch": 0.15986099044309296, + "grad_norm": 0.13659963300872746, + "learning_rate": 4.982596393396348e-06, + "loss": 0.7519, + "step": 322 + }, + { + "epoch": 0.16035745314633237, + "grad_norm": 0.14307889455512066, + "learning_rate": 4.982481081924728e-06, + "loss": 0.8533, + "step": 323 + }, + { + "epoch": 0.1608539158495718, + "grad_norm": 0.1330601424580335, + "learning_rate": 4.982365391045679e-06, + "loss": 0.8239, + "step": 324 + }, + { + "epoch": 0.16135037855281123, + "grad_norm": 0.15026890457234857, + "learning_rate": 4.982249320776882e-06, + "loss": 0.8806, + "step": 325 + }, + { + "epoch": 0.16184684125605064, + "grad_norm": 0.12974893713522884, + "learning_rate": 4.982132871136075e-06, + "loss": 0.7378, + "step": 326 + }, + { + "epoch": 0.16234330395929006, + "grad_norm": 0.1444247544686782, + "learning_rate": 4.9820160421410575e-06, + "loss": 0.8243, + "step": 327 + }, + { + "epoch": 0.16283976666252947, + "grad_norm": 0.1323450801185765, + "learning_rate": 4.981898833809684e-06, + "loss": 0.7696, + "step": 328 + }, + { + "epoch": 0.1633362293657689, + "grad_norm": 0.1353207749212687, + "learning_rate": 4.981781246159867e-06, + "loss": 0.7677, + "step": 329 + }, + { + "epoch": 0.16383269206900833, + "grad_norm": 0.12787497196232483, + "learning_rate": 4.98166327920958e-06, + "loss": 0.7655, + "step": 330 + }, + { + "epoch": 0.16432915477224774, + "grad_norm": 0.13359250170701767, + "learning_rate": 4.9815449329768505e-06, + "loss": 0.8392, + "step": 331 + }, + { + "epoch": 0.16482561747548716, + "grad_norm": 0.13162070870865217, + "learning_rate": 4.981426207479767e-06, + "loss": 0.8023, + "step": 332 + }, + { + "epoch": 0.16532208017872657, + "grad_norm": 0.1340847314964679, + "learning_rate": 4.981307102736474e-06, + "loss": 0.8049, + "step": 333 + }, + { + "epoch": 0.16581854288196599, + "grad_norm": 0.13799078318837465, + "learning_rate": 4.981187618765175e-06, + "loss": 0.801, + "step": 334 + }, + { + "epoch": 0.1663150055852054, + "grad_norm": 0.1352495083171962, + "learning_rate": 4.981067755584131e-06, + "loss": 0.8028, + "step": 335 + }, + { + "epoch": 0.16681146828844484, + "grad_norm": 0.13424131175066562, + "learning_rate": 4.9809475132116624e-06, + "loss": 0.8148, + "step": 336 + }, + { + "epoch": 0.16730793099168426, + "grad_norm": 0.13380623256179228, + "learning_rate": 4.980826891666145e-06, + "loss": 0.7906, + "step": 337 + }, + { + "epoch": 0.16780439369492367, + "grad_norm": 0.13711848434016552, + "learning_rate": 4.980705890966014e-06, + "loss": 0.7818, + "step": 338 + }, + { + "epoch": 0.16830085639816308, + "grad_norm": 0.1408936385231929, + "learning_rate": 4.980584511129763e-06, + "loss": 0.8319, + "step": 339 + }, + { + "epoch": 0.1687973191014025, + "grad_norm": 0.1361241205412377, + "learning_rate": 4.980462752175943e-06, + "loss": 0.7816, + "step": 340 + }, + { + "epoch": 0.1692937818046419, + "grad_norm": 0.13336550378881848, + "learning_rate": 4.980340614123162e-06, + "loss": 0.7656, + "step": 341 + }, + { + "epoch": 0.16979024450788135, + "grad_norm": 0.1291163574188604, + "learning_rate": 4.980218096990087e-06, + "loss": 0.7573, + "step": 342 + }, + { + "epoch": 0.17028670721112077, + "grad_norm": 0.1331261115959964, + "learning_rate": 4.980095200795443e-06, + "loss": 0.7838, + "step": 343 + }, + { + "epoch": 0.17078316991436018, + "grad_norm": 0.13606378865652022, + "learning_rate": 4.979971925558014e-06, + "loss": 0.817, + "step": 344 + }, + { + "epoch": 0.1712796326175996, + "grad_norm": 0.12897271331540627, + "learning_rate": 4.979848271296639e-06, + "loss": 0.7615, + "step": 345 + }, + { + "epoch": 0.171776095320839, + "grad_norm": 0.14023828743325562, + "learning_rate": 4.979724238030217e-06, + "loss": 0.7834, + "step": 346 + }, + { + "epoch": 0.17227255802407845, + "grad_norm": 0.13830905999777027, + "learning_rate": 4.979599825777704e-06, + "loss": 0.8302, + "step": 347 + }, + { + "epoch": 0.17276902072731787, + "grad_norm": 0.1428666780776237, + "learning_rate": 4.979475034558115e-06, + "loss": 0.8588, + "step": 348 + }, + { + "epoch": 0.17326548343055728, + "grad_norm": 0.128916884054928, + "learning_rate": 4.979349864390523e-06, + "loss": 0.7776, + "step": 349 + }, + { + "epoch": 0.1737619461337967, + "grad_norm": 0.13925327403441065, + "learning_rate": 4.9792243152940576e-06, + "loss": 0.8802, + "step": 350 + }, + { + "epoch": 0.1742584088370361, + "grad_norm": 0.1363578403401138, + "learning_rate": 4.979098387287907e-06, + "loss": 0.8118, + "step": 351 + }, + { + "epoch": 0.17475487154027552, + "grad_norm": 0.14256811683930512, + "learning_rate": 4.978972080391317e-06, + "loss": 0.7879, + "step": 352 + }, + { + "epoch": 0.17525133424351497, + "grad_norm": 0.13404514324949623, + "learning_rate": 4.978845394623591e-06, + "loss": 0.7738, + "step": 353 + }, + { + "epoch": 0.17574779694675438, + "grad_norm": 0.14672131580692716, + "learning_rate": 4.978718330004093e-06, + "loss": 0.8091, + "step": 354 + }, + { + "epoch": 0.1762442596499938, + "grad_norm": 0.13482860481948047, + "learning_rate": 4.978590886552241e-06, + "loss": 0.7855, + "step": 355 + }, + { + "epoch": 0.1767407223532332, + "grad_norm": 0.1390863998270458, + "learning_rate": 4.978463064287513e-06, + "loss": 0.8136, + "step": 356 + }, + { + "epoch": 0.17723718505647262, + "grad_norm": 0.12766825836109758, + "learning_rate": 4.978334863229445e-06, + "loss": 0.7519, + "step": 357 + }, + { + "epoch": 0.17773364775971204, + "grad_norm": 0.13957492610638372, + "learning_rate": 4.97820628339763e-06, + "loss": 0.8374, + "step": 358 + }, + { + "epoch": 0.17823011046295148, + "grad_norm": 0.13624109938717846, + "learning_rate": 4.97807732481172e-06, + "loss": 0.8468, + "step": 359 + }, + { + "epoch": 0.1787265731661909, + "grad_norm": 0.1371796025283825, + "learning_rate": 4.977947987491424e-06, + "loss": 0.8134, + "step": 360 + }, + { + "epoch": 0.1792230358694303, + "grad_norm": 0.13791288379959807, + "learning_rate": 4.977818271456508e-06, + "loss": 0.8523, + "step": 361 + }, + { + "epoch": 0.17971949857266972, + "grad_norm": 0.13682556966382917, + "learning_rate": 4.977688176726799e-06, + "loss": 0.7998, + "step": 362 + }, + { + "epoch": 0.18021596127590914, + "grad_norm": 0.12914631502678342, + "learning_rate": 4.977557703322178e-06, + "loss": 0.7715, + "step": 363 + }, + { + "epoch": 0.18071242397914858, + "grad_norm": 0.13450502572602166, + "learning_rate": 4.977426851262588e-06, + "loss": 0.7999, + "step": 364 + }, + { + "epoch": 0.181208886682388, + "grad_norm": 0.1324950964120947, + "learning_rate": 4.977295620568025e-06, + "loss": 0.773, + "step": 365 + }, + { + "epoch": 0.1817053493856274, + "grad_norm": 0.13291470130623328, + "learning_rate": 4.977164011258547e-06, + "loss": 0.8047, + "step": 366 + }, + { + "epoch": 0.18220181208886682, + "grad_norm": 0.14303063683318232, + "learning_rate": 4.977032023354269e-06, + "loss": 0.8361, + "step": 367 + }, + { + "epoch": 0.18269827479210624, + "grad_norm": 0.14299058880067156, + "learning_rate": 4.976899656875361e-06, + "loss": 0.8172, + "step": 368 + }, + { + "epoch": 0.18319473749534565, + "grad_norm": 0.13663382187886108, + "learning_rate": 4.976766911842056e-06, + "loss": 0.7529, + "step": 369 + }, + { + "epoch": 0.1836912001985851, + "grad_norm": 0.13831178738187494, + "learning_rate": 4.9766337882746395e-06, + "loss": 0.7863, + "step": 370 + }, + { + "epoch": 0.1841876629018245, + "grad_norm": 0.13456481368960774, + "learning_rate": 4.976500286193458e-06, + "loss": 0.8505, + "step": 371 + }, + { + "epoch": 0.18468412560506392, + "grad_norm": 0.13582472070675122, + "learning_rate": 4.976366405618916e-06, + "loss": 0.8036, + "step": 372 + }, + { + "epoch": 0.18518058830830333, + "grad_norm": 0.13588434233209187, + "learning_rate": 4.976232146571476e-06, + "loss": 0.7814, + "step": 373 + }, + { + "epoch": 0.18567705101154275, + "grad_norm": 0.13408060178137982, + "learning_rate": 4.976097509071654e-06, + "loss": 0.824, + "step": 374 + }, + { + "epoch": 0.1861735137147822, + "grad_norm": 0.13739925696360844, + "learning_rate": 4.975962493140029e-06, + "loss": 0.8233, + "step": 375 + }, + { + "epoch": 0.1866699764180216, + "grad_norm": 0.12982209493306304, + "learning_rate": 4.9758270987972356e-06, + "loss": 0.7537, + "step": 376 + }, + { + "epoch": 0.18716643912126102, + "grad_norm": 0.13310617104329844, + "learning_rate": 4.975691326063968e-06, + "loss": 0.7771, + "step": 377 + }, + { + "epoch": 0.18766290182450043, + "grad_norm": 0.13823931193705333, + "learning_rate": 4.9755551749609755e-06, + "loss": 0.7723, + "step": 378 + }, + { + "epoch": 0.18815936452773985, + "grad_norm": 0.1382970645097716, + "learning_rate": 4.975418645509066e-06, + "loss": 0.8002, + "step": 379 + }, + { + "epoch": 0.18865582723097926, + "grad_norm": 0.13785374560341188, + "learning_rate": 4.975281737729109e-06, + "loss": 0.7799, + "step": 380 + }, + { + "epoch": 0.1891522899342187, + "grad_norm": 0.13630037342754214, + "learning_rate": 4.975144451642024e-06, + "loss": 0.7825, + "step": 381 + }, + { + "epoch": 0.18964875263745812, + "grad_norm": 0.13742010823689682, + "learning_rate": 4.975006787268797e-06, + "loss": 0.7832, + "step": 382 + }, + { + "epoch": 0.19014521534069753, + "grad_norm": 0.1372266789447449, + "learning_rate": 4.974868744630467e-06, + "loss": 0.7805, + "step": 383 + }, + { + "epoch": 0.19064167804393695, + "grad_norm": 0.13335756294028897, + "learning_rate": 4.974730323748129e-06, + "loss": 0.7747, + "step": 384 + }, + { + "epoch": 0.19113814074717636, + "grad_norm": 0.1331325911806612, + "learning_rate": 4.974591524642942e-06, + "loss": 0.7626, + "step": 385 + }, + { + "epoch": 0.19163460345041577, + "grad_norm": 0.13528286793265076, + "learning_rate": 4.974452347336116e-06, + "loss": 0.7906, + "step": 386 + }, + { + "epoch": 0.19213106615365522, + "grad_norm": 0.13710197959682258, + "learning_rate": 4.974312791848925e-06, + "loss": 0.7775, + "step": 387 + }, + { + "epoch": 0.19262752885689463, + "grad_norm": 0.13646907256302476, + "learning_rate": 4.974172858202695e-06, + "loss": 0.8266, + "step": 388 + }, + { + "epoch": 0.19312399156013405, + "grad_norm": 0.14002294190339723, + "learning_rate": 4.974032546418816e-06, + "loss": 0.7807, + "step": 389 + }, + { + "epoch": 0.19362045426337346, + "grad_norm": 0.14404269798290067, + "learning_rate": 4.973891856518728e-06, + "loss": 0.8195, + "step": 390 + }, + { + "epoch": 0.19411691696661287, + "grad_norm": 0.13811976877791707, + "learning_rate": 4.973750788523937e-06, + "loss": 0.8008, + "step": 391 + }, + { + "epoch": 0.19461337966985232, + "grad_norm": 0.14072300476936328, + "learning_rate": 4.9736093424560005e-06, + "loss": 0.7705, + "step": 392 + }, + { + "epoch": 0.19510984237309173, + "grad_norm": 0.14423164849233763, + "learning_rate": 4.973467518336538e-06, + "loss": 0.7988, + "step": 393 + }, + { + "epoch": 0.19560630507633114, + "grad_norm": 0.13418208376903598, + "learning_rate": 4.973325316187225e-06, + "loss": 0.7761, + "step": 394 + }, + { + "epoch": 0.19610276777957056, + "grad_norm": 0.1316889322126393, + "learning_rate": 4.973182736029793e-06, + "loss": 0.847, + "step": 395 + }, + { + "epoch": 0.19659923048280997, + "grad_norm": 0.14201288602040998, + "learning_rate": 4.973039777886035e-06, + "loss": 0.7743, + "step": 396 + }, + { + "epoch": 0.1970956931860494, + "grad_norm": 0.14278977007281732, + "learning_rate": 4.9728964417777986e-06, + "loss": 0.7947, + "step": 397 + }, + { + "epoch": 0.19759215588928883, + "grad_norm": 0.13398922706134572, + "learning_rate": 4.972752727726992e-06, + "loss": 0.75, + "step": 398 + }, + { + "epoch": 0.19808861859252824, + "grad_norm": 0.1375355686898203, + "learning_rate": 4.972608635755577e-06, + "loss": 0.781, + "step": 399 + }, + { + "epoch": 0.19858508129576766, + "grad_norm": 0.1371602165214636, + "learning_rate": 4.972464165885579e-06, + "loss": 0.823, + "step": 400 + }, + { + "epoch": 0.19908154399900707, + "grad_norm": 0.14577906731021556, + "learning_rate": 4.972319318139074e-06, + "loss": 0.8288, + "step": 401 + }, + { + "epoch": 0.19957800670224649, + "grad_norm": 0.15599884531010932, + "learning_rate": 4.972174092538203e-06, + "loss": 0.8007, + "step": 402 + }, + { + "epoch": 0.2000744694054859, + "grad_norm": 0.13716250162699573, + "learning_rate": 4.97202848910516e-06, + "loss": 0.763, + "step": 403 + }, + { + "epoch": 0.20057093210872534, + "grad_norm": 0.13013318815804806, + "learning_rate": 4.9718825078622e-06, + "loss": 0.7654, + "step": 404 + }, + { + "epoch": 0.20106739481196476, + "grad_norm": 0.14489783733703743, + "learning_rate": 4.971736148831631e-06, + "loss": 0.8095, + "step": 405 + }, + { + "epoch": 0.20156385751520417, + "grad_norm": 0.14178082134535308, + "learning_rate": 4.971589412035823e-06, + "loss": 0.7955, + "step": 406 + }, + { + "epoch": 0.20206032021844358, + "grad_norm": 0.14527202694186261, + "learning_rate": 4.971442297497202e-06, + "loss": 0.8, + "step": 407 + }, + { + "epoch": 0.202556782921683, + "grad_norm": 0.13886094340580113, + "learning_rate": 4.971294805238252e-06, + "loss": 0.8482, + "step": 408 + }, + { + "epoch": 0.20305324562492244, + "grad_norm": 0.15143893194787292, + "learning_rate": 4.971146935281517e-06, + "loss": 0.7927, + "step": 409 + }, + { + "epoch": 0.20354970832816185, + "grad_norm": 0.1636495293029831, + "learning_rate": 4.970998687649593e-06, + "loss": 0.8153, + "step": 410 + }, + { + "epoch": 0.20404617103140127, + "grad_norm": 0.13453836074071723, + "learning_rate": 4.97085006236514e-06, + "loss": 0.7805, + "step": 411 + }, + { + "epoch": 0.20454263373464068, + "grad_norm": 0.16194606706618664, + "learning_rate": 4.970701059450872e-06, + "loss": 0.821, + "step": 412 + }, + { + "epoch": 0.2050390964378801, + "grad_norm": 0.13277679173675005, + "learning_rate": 4.970551678929562e-06, + "loss": 0.7262, + "step": 413 + }, + { + "epoch": 0.2055355591411195, + "grad_norm": 0.13842208760542138, + "learning_rate": 4.970401920824039e-06, + "loss": 0.7754, + "step": 414 + }, + { + "epoch": 0.20603202184435895, + "grad_norm": 0.13680895396236756, + "learning_rate": 4.970251785157193e-06, + "loss": 0.7984, + "step": 415 + }, + { + "epoch": 0.20652848454759837, + "grad_norm": 0.1332819441058511, + "learning_rate": 4.9701012719519694e-06, + "loss": 0.7827, + "step": 416 + }, + { + "epoch": 0.20702494725083778, + "grad_norm": 0.13155005386495136, + "learning_rate": 4.969950381231371e-06, + "loss": 0.7598, + "step": 417 + }, + { + "epoch": 0.2075214099540772, + "grad_norm": 0.13812357614760978, + "learning_rate": 4.969799113018459e-06, + "loss": 0.7805, + "step": 418 + }, + { + "epoch": 0.2080178726573166, + "grad_norm": 0.15001010565178097, + "learning_rate": 4.9696474673363536e-06, + "loss": 0.7916, + "step": 419 + }, + { + "epoch": 0.20851433536055602, + "grad_norm": 0.14065673015444682, + "learning_rate": 4.96949544420823e-06, + "loss": 0.7902, + "step": 420 + }, + { + "epoch": 0.20901079806379547, + "grad_norm": 0.13896425621312414, + "learning_rate": 4.969343043657323e-06, + "loss": 0.78, + "step": 421 + }, + { + "epoch": 0.20950726076703488, + "grad_norm": 0.14937608116406495, + "learning_rate": 4.969190265706926e-06, + "loss": 0.8289, + "step": 422 + }, + { + "epoch": 0.2100037234702743, + "grad_norm": 0.13459240584003007, + "learning_rate": 4.969037110380387e-06, + "loss": 0.7906, + "step": 423 + }, + { + "epoch": 0.2105001861735137, + "grad_norm": 0.1484626362599989, + "learning_rate": 4.968883577701112e-06, + "loss": 0.8244, + "step": 424 + }, + { + "epoch": 0.21099664887675312, + "grad_norm": 0.13015898606801415, + "learning_rate": 4.9687296676925686e-06, + "loss": 0.7431, + "step": 425 + }, + { + "epoch": 0.21149311157999257, + "grad_norm": 0.1377331802046174, + "learning_rate": 4.96857538037828e-06, + "loss": 0.8043, + "step": 426 + }, + { + "epoch": 0.21198957428323198, + "grad_norm": 0.1363174078519366, + "learning_rate": 4.968420715781823e-06, + "loss": 0.7584, + "step": 427 + }, + { + "epoch": 0.2124860369864714, + "grad_norm": 0.14109517205489622, + "learning_rate": 4.9682656739268385e-06, + "loss": 0.7868, + "step": 428 + }, + { + "epoch": 0.2129824996897108, + "grad_norm": 0.14476636902871076, + "learning_rate": 4.968110254837022e-06, + "loss": 0.8051, + "step": 429 + }, + { + "epoch": 0.21347896239295022, + "grad_norm": 0.13488134316489944, + "learning_rate": 4.967954458536126e-06, + "loss": 0.7816, + "step": 430 + }, + { + "epoch": 0.21397542509618964, + "grad_norm": 0.1311388707406287, + "learning_rate": 4.967798285047961e-06, + "loss": 0.7856, + "step": 431 + }, + { + "epoch": 0.21447188779942908, + "grad_norm": 0.13733380279723623, + "learning_rate": 4.967641734396397e-06, + "loss": 0.8495, + "step": 432 + }, + { + "epoch": 0.2149683505026685, + "grad_norm": 0.13619991947733362, + "learning_rate": 4.967484806605359e-06, + "loss": 0.7803, + "step": 433 + }, + { + "epoch": 0.2154648132059079, + "grad_norm": 0.13862255789954744, + "learning_rate": 4.967327501698831e-06, + "loss": 0.7749, + "step": 434 + }, + { + "epoch": 0.21596127590914732, + "grad_norm": 0.1451665257035334, + "learning_rate": 4.967169819700856e-06, + "loss": 0.7966, + "step": 435 + }, + { + "epoch": 0.21645773861238674, + "grad_norm": 0.13309881671792553, + "learning_rate": 4.967011760635532e-06, + "loss": 0.749, + "step": 436 + }, + { + "epoch": 0.21695420131562618, + "grad_norm": 0.14123567913488688, + "learning_rate": 4.966853324527015e-06, + "loss": 0.7611, + "step": 437 + }, + { + "epoch": 0.2174506640188656, + "grad_norm": 0.1398898539392954, + "learning_rate": 4.966694511399521e-06, + "loss": 0.8463, + "step": 438 + }, + { + "epoch": 0.217947126722105, + "grad_norm": 0.1372879573733997, + "learning_rate": 4.9665353212773215e-06, + "loss": 0.7858, + "step": 439 + }, + { + "epoch": 0.21844358942534442, + "grad_norm": 0.13654858802550288, + "learning_rate": 4.966375754184746e-06, + "loss": 0.8215, + "step": 440 + }, + { + "epoch": 0.21894005212858383, + "grad_norm": 0.1319581844818004, + "learning_rate": 4.966215810146181e-06, + "loss": 0.8257, + "step": 441 + }, + { + "epoch": 0.21943651483182325, + "grad_norm": 0.13477793443314756, + "learning_rate": 4.966055489186072e-06, + "loss": 0.8037, + "step": 442 + }, + { + "epoch": 0.2199329775350627, + "grad_norm": 0.13368048885447226, + "learning_rate": 4.965894791328924e-06, + "loss": 0.7658, + "step": 443 + }, + { + "epoch": 0.2204294402383021, + "grad_norm": 0.13455924110626613, + "learning_rate": 4.965733716599292e-06, + "loss": 0.8163, + "step": 444 + }, + { + "epoch": 0.22092590294154152, + "grad_norm": 0.1389198409505174, + "learning_rate": 4.965572265021798e-06, + "loss": 0.8417, + "step": 445 + }, + { + "epoch": 0.22142236564478093, + "grad_norm": 0.13388284038750456, + "learning_rate": 4.965410436621115e-06, + "loss": 0.7774, + "step": 446 + }, + { + "epoch": 0.22191882834802035, + "grad_norm": 0.132820811008653, + "learning_rate": 4.965248231421977e-06, + "loss": 0.8015, + "step": 447 + }, + { + "epoch": 0.22241529105125976, + "grad_norm": 0.13543397173414973, + "learning_rate": 4.965085649449175e-06, + "loss": 0.8054, + "step": 448 + }, + { + "epoch": 0.2229117537544992, + "grad_norm": 0.13391298366876886, + "learning_rate": 4.964922690727555e-06, + "loss": 0.8157, + "step": 449 + }, + { + "epoch": 0.22340821645773862, + "grad_norm": 0.13119644805742203, + "learning_rate": 4.964759355282024e-06, + "loss": 0.7737, + "step": 450 + }, + { + "epoch": 0.22390467916097803, + "grad_norm": 0.13789404014972487, + "learning_rate": 4.964595643137544e-06, + "loss": 0.8227, + "step": 451 + }, + { + "epoch": 0.22440114186421745, + "grad_norm": 0.13702173759120256, + "learning_rate": 4.964431554319138e-06, + "loss": 0.7777, + "step": 452 + }, + { + "epoch": 0.22489760456745686, + "grad_norm": 0.167958896728115, + "learning_rate": 4.964267088851883e-06, + "loss": 0.8295, + "step": 453 + }, + { + "epoch": 0.2253940672706963, + "grad_norm": 0.1362052877173801, + "learning_rate": 4.964102246760915e-06, + "loss": 0.8139, + "step": 454 + }, + { + "epoch": 0.22589052997393572, + "grad_norm": 0.13619122221235858, + "learning_rate": 4.963937028071427e-06, + "loss": 0.7996, + "step": 455 + }, + { + "epoch": 0.22638699267717513, + "grad_norm": 0.13307543777024503, + "learning_rate": 4.96377143280867e-06, + "loss": 0.7634, + "step": 456 + }, + { + "epoch": 0.22688345538041454, + "grad_norm": 0.13177673869505957, + "learning_rate": 4.963605460997954e-06, + "loss": 0.7707, + "step": 457 + }, + { + "epoch": 0.22737991808365396, + "grad_norm": 0.13107526362132932, + "learning_rate": 4.963439112664644e-06, + "loss": 0.7883, + "step": 458 + }, + { + "epoch": 0.22787638078689337, + "grad_norm": 0.13905062257304818, + "learning_rate": 4.963272387834163e-06, + "loss": 0.7869, + "step": 459 + }, + { + "epoch": 0.22837284349013282, + "grad_norm": 0.14228112591091902, + "learning_rate": 4.963105286531994e-06, + "loss": 0.7684, + "step": 460 + }, + { + "epoch": 0.22886930619337223, + "grad_norm": 0.1278341075030752, + "learning_rate": 4.962937808783675e-06, + "loss": 0.7381, + "step": 461 + }, + { + "epoch": 0.22936576889661164, + "grad_norm": 0.13836582166759392, + "learning_rate": 4.962769954614802e-06, + "loss": 0.8072, + "step": 462 + }, + { + "epoch": 0.22986223159985106, + "grad_norm": 0.13221136159222038, + "learning_rate": 4.962601724051029e-06, + "loss": 0.7667, + "step": 463 + }, + { + "epoch": 0.23035869430309047, + "grad_norm": 0.1359691390529077, + "learning_rate": 4.962433117118067e-06, + "loss": 0.8016, + "step": 464 + }, + { + "epoch": 0.2308551570063299, + "grad_norm": 0.13838185689016483, + "learning_rate": 4.962264133841686e-06, + "loss": 0.7823, + "step": 465 + }, + { + "epoch": 0.23135161970956933, + "grad_norm": 0.13217829549783422, + "learning_rate": 4.96209477424771e-06, + "loss": 0.7233, + "step": 466 + }, + { + "epoch": 0.23184808241280874, + "grad_norm": 0.12783947356664202, + "learning_rate": 4.9619250383620256e-06, + "loss": 0.7493, + "step": 467 + }, + { + "epoch": 0.23234454511604816, + "grad_norm": 0.13083482079204406, + "learning_rate": 4.961754926210572e-06, + "loss": 0.7389, + "step": 468 + }, + { + "epoch": 0.23284100781928757, + "grad_norm": 0.13464481187124352, + "learning_rate": 4.9615844378193505e-06, + "loss": 0.78, + "step": 469 + }, + { + "epoch": 0.23333747052252699, + "grad_norm": 0.13684478446270137, + "learning_rate": 4.961413573214415e-06, + "loss": 0.8143, + "step": 470 + }, + { + "epoch": 0.23383393322576643, + "grad_norm": 0.14377264892348118, + "learning_rate": 4.9612423324218816e-06, + "loss": 0.8265, + "step": 471 + }, + { + "epoch": 0.23433039592900584, + "grad_norm": 0.13245307939757578, + "learning_rate": 4.961070715467921e-06, + "loss": 0.8093, + "step": 472 + }, + { + "epoch": 0.23482685863224526, + "grad_norm": 0.1373381453121188, + "learning_rate": 4.9608987223787606e-06, + "loss": 0.7547, + "step": 473 + }, + { + "epoch": 0.23532332133548467, + "grad_norm": 0.1370145480054436, + "learning_rate": 4.960726353180688e-06, + "loss": 0.8459, + "step": 474 + }, + { + "epoch": 0.23581978403872408, + "grad_norm": 0.1375438658427002, + "learning_rate": 4.960553607900047e-06, + "loss": 0.7813, + "step": 475 + }, + { + "epoch": 0.2363162467419635, + "grad_norm": 0.13778611231342322, + "learning_rate": 4.96038048656324e-06, + "loss": 0.7739, + "step": 476 + }, + { + "epoch": 0.23681270944520294, + "grad_norm": 0.13166121819124513, + "learning_rate": 4.9602069891967245e-06, + "loss": 0.7708, + "step": 477 + }, + { + "epoch": 0.23730917214844235, + "grad_norm": 0.1396399426393052, + "learning_rate": 4.9600331158270175e-06, + "loss": 0.7725, + "step": 478 + }, + { + "epoch": 0.23780563485168177, + "grad_norm": 0.1392271735139755, + "learning_rate": 4.959858866480691e-06, + "loss": 0.7779, + "step": 479 + }, + { + "epoch": 0.23830209755492118, + "grad_norm": 0.13001276373757875, + "learning_rate": 4.959684241184379e-06, + "loss": 0.7653, + "step": 480 + }, + { + "epoch": 0.2387985602581606, + "grad_norm": 0.13464254447792998, + "learning_rate": 4.959509239964768e-06, + "loss": 0.7932, + "step": 481 + }, + { + "epoch": 0.2392950229614, + "grad_norm": 0.13597319212243217, + "learning_rate": 4.959333862848605e-06, + "loss": 0.7934, + "step": 482 + }, + { + "epoch": 0.23979148566463945, + "grad_norm": 0.14655344478202936, + "learning_rate": 4.959158109862694e-06, + "loss": 0.798, + "step": 483 + }, + { + "epoch": 0.24028794836787887, + "grad_norm": 0.1305172241900655, + "learning_rate": 4.958981981033895e-06, + "loss": 0.7791, + "step": 484 + }, + { + "epoch": 0.24078441107111828, + "grad_norm": 0.13605164916015544, + "learning_rate": 4.958805476389127e-06, + "loss": 0.8128, + "step": 485 + }, + { + "epoch": 0.2412808737743577, + "grad_norm": 0.13833174537228857, + "learning_rate": 4.958628595955366e-06, + "loss": 0.7694, + "step": 486 + }, + { + "epoch": 0.2417773364775971, + "grad_norm": 0.1358842375740616, + "learning_rate": 4.958451339759645e-06, + "loss": 0.7786, + "step": 487 + }, + { + "epoch": 0.24227379918083655, + "grad_norm": 0.13943485857687118, + "learning_rate": 4.9582737078290556e-06, + "loss": 0.8026, + "step": 488 + }, + { + "epoch": 0.24277026188407597, + "grad_norm": 0.1341222818235527, + "learning_rate": 4.958095700190745e-06, + "loss": 0.821, + "step": 489 + }, + { + "epoch": 0.24326672458731538, + "grad_norm": 0.13304092780884888, + "learning_rate": 4.957917316871919e-06, + "loss": 0.8058, + "step": 490 + }, + { + "epoch": 0.2437631872905548, + "grad_norm": 0.13446637676436832, + "learning_rate": 4.957738557899841e-06, + "loss": 0.8165, + "step": 491 + }, + { + "epoch": 0.2442596499937942, + "grad_norm": 0.1355754492629554, + "learning_rate": 4.9575594233018305e-06, + "loss": 0.84, + "step": 492 + }, + { + "epoch": 0.24475611269703362, + "grad_norm": 0.1326880486992906, + "learning_rate": 4.957379913105267e-06, + "loss": 0.7595, + "step": 493 + }, + { + "epoch": 0.24525257540027307, + "grad_norm": 0.13614000382079813, + "learning_rate": 4.957200027337585e-06, + "loss": 0.7988, + "step": 494 + }, + { + "epoch": 0.24574903810351248, + "grad_norm": 0.12773189423591907, + "learning_rate": 4.957019766026277e-06, + "loss": 0.7577, + "step": 495 + }, + { + "epoch": 0.2462455008067519, + "grad_norm": 0.14086258045001926, + "learning_rate": 4.956839129198892e-06, + "loss": 0.8078, + "step": 496 + }, + { + "epoch": 0.2467419635099913, + "grad_norm": 0.13555212844444037, + "learning_rate": 4.95665811688304e-06, + "loss": 0.8094, + "step": 497 + }, + { + "epoch": 0.24723842621323072, + "grad_norm": 0.13674333466288918, + "learning_rate": 4.9564767291063844e-06, + "loss": 0.7807, + "step": 498 + }, + { + "epoch": 0.24773488891647016, + "grad_norm": 0.1304786699944242, + "learning_rate": 4.956294965896647e-06, + "loss": 0.7614, + "step": 499 + }, + { + "epoch": 0.24823135161970958, + "grad_norm": 0.13848355027573125, + "learning_rate": 4.956112827281607e-06, + "loss": 0.8078, + "step": 500 + }, + { + "epoch": 0.248727814322949, + "grad_norm": 0.13212981378633934, + "learning_rate": 4.955930313289102e-06, + "loss": 0.7895, + "step": 501 + }, + { + "epoch": 0.2492242770261884, + "grad_norm": 0.134424879179094, + "learning_rate": 4.955747423947027e-06, + "loss": 0.8235, + "step": 502 + }, + { + "epoch": 0.24972073972942782, + "grad_norm": 0.12855995535715545, + "learning_rate": 4.955564159283334e-06, + "loss": 0.7454, + "step": 503 + }, + { + "epoch": 0.25021720243266726, + "grad_norm": 0.13938742116853411, + "learning_rate": 4.95538051932603e-06, + "loss": 0.7975, + "step": 504 + }, + { + "epoch": 0.25021720243266726, + "eval_loss": 0.7929754853248596, + "eval_runtime": 135.44, + "eval_samples_per_second": 224.107, + "eval_steps_per_second": 28.02, + "step": 504 + }, + { + "epoch": 0.25071366513590665, + "grad_norm": 0.13609034177916032, + "learning_rate": 4.9551965041031835e-06, + "loss": 0.7783, + "step": 505 + }, + { + "epoch": 0.2512101278391461, + "grad_norm": 0.13139635870746627, + "learning_rate": 4.955012113642916e-06, + "loss": 0.7706, + "step": 506 + }, + { + "epoch": 0.2517065905423855, + "grad_norm": 0.13959142837209643, + "learning_rate": 4.954827347973412e-06, + "loss": 0.8491, + "step": 507 + }, + { + "epoch": 0.2522030532456249, + "grad_norm": 0.15436767440806537, + "learning_rate": 4.954642207122907e-06, + "loss": 0.8424, + "step": 508 + }, + { + "epoch": 0.25269951594886436, + "grad_norm": 0.14047650581011453, + "learning_rate": 4.954456691119698e-06, + "loss": 0.7782, + "step": 509 + }, + { + "epoch": 0.25319597865210375, + "grad_norm": 0.13512624112535368, + "learning_rate": 4.954270799992138e-06, + "loss": 0.7552, + "step": 510 + }, + { + "epoch": 0.2536924413553432, + "grad_norm": 0.13165016338907254, + "learning_rate": 4.954084533768637e-06, + "loss": 0.7652, + "step": 511 + }, + { + "epoch": 0.2541889040585826, + "grad_norm": 0.13432352709536072, + "learning_rate": 4.953897892477664e-06, + "loss": 0.7549, + "step": 512 + }, + { + "epoch": 0.254685366761822, + "grad_norm": 0.13825583852964196, + "learning_rate": 4.953710876147743e-06, + "loss": 0.7461, + "step": 513 + }, + { + "epoch": 0.25518182946506146, + "grad_norm": 0.13800340620252008, + "learning_rate": 4.953523484807456e-06, + "loss": 0.8118, + "step": 514 + }, + { + "epoch": 0.25567829216830085, + "grad_norm": 0.1357923592978122, + "learning_rate": 4.9533357184854454e-06, + "loss": 0.8104, + "step": 515 + }, + { + "epoch": 0.2561747548715403, + "grad_norm": 0.13697526329337478, + "learning_rate": 4.953147577210406e-06, + "loss": 0.853, + "step": 516 + }, + { + "epoch": 0.2566712175747797, + "grad_norm": 0.13197113130506147, + "learning_rate": 4.952959061011091e-06, + "loss": 0.7424, + "step": 517 + }, + { + "epoch": 0.2571676802780191, + "grad_norm": 0.13803258269495886, + "learning_rate": 4.952770169916316e-06, + "loss": 0.8018, + "step": 518 + }, + { + "epoch": 0.25766414298125856, + "grad_norm": 0.13462239799861447, + "learning_rate": 4.952580903954946e-06, + "loss": 0.7604, + "step": 519 + }, + { + "epoch": 0.25816060568449795, + "grad_norm": 0.13185857943402207, + "learning_rate": 4.95239126315591e-06, + "loss": 0.7809, + "step": 520 + }, + { + "epoch": 0.2586570683877374, + "grad_norm": 0.13281620469964295, + "learning_rate": 4.95220124754819e-06, + "loss": 0.7726, + "step": 521 + }, + { + "epoch": 0.2591535310909768, + "grad_norm": 0.14105017206726814, + "learning_rate": 4.952010857160828e-06, + "loss": 0.785, + "step": 522 + }, + { + "epoch": 0.2596499937942162, + "grad_norm": 0.1439691078823198, + "learning_rate": 4.951820092022921e-06, + "loss": 0.7934, + "step": 523 + }, + { + "epoch": 0.2601464564974556, + "grad_norm": 0.1382892013792061, + "learning_rate": 4.951628952163625e-06, + "loss": 0.7892, + "step": 524 + }, + { + "epoch": 0.26064291920069504, + "grad_norm": 0.13861843984858735, + "learning_rate": 4.951437437612152e-06, + "loss": 0.8235, + "step": 525 + }, + { + "epoch": 0.2611393819039345, + "grad_norm": 0.14191335563414534, + "learning_rate": 4.951245548397773e-06, + "loss": 0.8068, + "step": 526 + }, + { + "epoch": 0.2616358446071739, + "grad_norm": 0.13893025334242587, + "learning_rate": 4.951053284549815e-06, + "loss": 0.7718, + "step": 527 + }, + { + "epoch": 0.2621323073104133, + "grad_norm": 0.128500724810349, + "learning_rate": 4.950860646097661e-06, + "loss": 0.7775, + "step": 528 + }, + { + "epoch": 0.2626287700136527, + "grad_norm": 0.14778687944277374, + "learning_rate": 4.950667633070755e-06, + "loss": 0.8259, + "step": 529 + }, + { + "epoch": 0.26312523271689214, + "grad_norm": 0.1482567507544276, + "learning_rate": 4.950474245498594e-06, + "loss": 0.751, + "step": 530 + }, + { + "epoch": 0.2636216954201316, + "grad_norm": 0.14168154296888727, + "learning_rate": 4.950280483410735e-06, + "loss": 0.7766, + "step": 531 + }, + { + "epoch": 0.26411815812337097, + "grad_norm": 0.13937007291923947, + "learning_rate": 4.950086346836792e-06, + "loss": 0.8215, + "step": 532 + }, + { + "epoch": 0.2646146208266104, + "grad_norm": 0.13948603039348756, + "learning_rate": 4.949891835806434e-06, + "loss": 0.764, + "step": 533 + }, + { + "epoch": 0.2651110835298498, + "grad_norm": 0.1398432077501712, + "learning_rate": 4.9496969503493905e-06, + "loss": 0.7917, + "step": 534 + }, + { + "epoch": 0.26560754623308924, + "grad_norm": 0.13864095868027326, + "learning_rate": 4.949501690495446e-06, + "loss": 0.7852, + "step": 535 + }, + { + "epoch": 0.2661040089363287, + "grad_norm": 0.137626101201187, + "learning_rate": 4.949306056274443e-06, + "loss": 0.7971, + "step": 536 + }, + { + "epoch": 0.26660047163956807, + "grad_norm": 0.13991644106960857, + "learning_rate": 4.949110047716281e-06, + "loss": 0.8049, + "step": 537 + }, + { + "epoch": 0.2670969343428075, + "grad_norm": 0.13755972862852978, + "learning_rate": 4.948913664850917e-06, + "loss": 0.7761, + "step": 538 + }, + { + "epoch": 0.2675933970460469, + "grad_norm": 0.14095099190685437, + "learning_rate": 4.9487169077083645e-06, + "loss": 0.7995, + "step": 539 + }, + { + "epoch": 0.26808985974928634, + "grad_norm": 0.13772198796480103, + "learning_rate": 4.948519776318694e-06, + "loss": 0.775, + "step": 540 + }, + { + "epoch": 0.2685863224525257, + "grad_norm": 0.1474365567813288, + "learning_rate": 4.948322270712036e-06, + "loss": 0.775, + "step": 541 + }, + { + "epoch": 0.26908278515576517, + "grad_norm": 0.14740248795105346, + "learning_rate": 4.948124390918574e-06, + "loss": 0.8823, + "step": 542 + }, + { + "epoch": 0.2695792478590046, + "grad_norm": 0.13090799079120452, + "learning_rate": 4.947926136968551e-06, + "loss": 0.7722, + "step": 543 + }, + { + "epoch": 0.270075710562244, + "grad_norm": 0.1398513276367715, + "learning_rate": 4.947727508892268e-06, + "loss": 0.7622, + "step": 544 + }, + { + "epoch": 0.27057217326548344, + "grad_norm": 0.1402504056708596, + "learning_rate": 4.947528506720082e-06, + "loss": 0.7855, + "step": 545 + }, + { + "epoch": 0.2710686359687228, + "grad_norm": 0.13124650948584116, + "learning_rate": 4.947329130482407e-06, + "loss": 0.738, + "step": 546 + }, + { + "epoch": 0.27156509867196227, + "grad_norm": 0.13017696798038783, + "learning_rate": 4.947129380209713e-06, + "loss": 0.7761, + "step": 547 + }, + { + "epoch": 0.2720615613752017, + "grad_norm": 0.14381395142272502, + "learning_rate": 4.9469292559325316e-06, + "loss": 0.7887, + "step": 548 + }, + { + "epoch": 0.2725580240784411, + "grad_norm": 0.13338140216734587, + "learning_rate": 4.946728757681446e-06, + "loss": 0.8003, + "step": 549 + }, + { + "epoch": 0.27305448678168054, + "grad_norm": 0.13560694970854092, + "learning_rate": 4.946527885487101e-06, + "loss": 0.7377, + "step": 550 + }, + { + "epoch": 0.2735509494849199, + "grad_norm": 0.13758839128075348, + "learning_rate": 4.946326639380194e-06, + "loss": 0.8017, + "step": 551 + }, + { + "epoch": 0.27404741218815937, + "grad_norm": 0.13753618486867644, + "learning_rate": 4.946125019391486e-06, + "loss": 0.7786, + "step": 552 + }, + { + "epoch": 0.2745438748913988, + "grad_norm": 0.13936503272255338, + "learning_rate": 4.945923025551789e-06, + "loss": 0.7771, + "step": 553 + }, + { + "epoch": 0.2750403375946382, + "grad_norm": 0.13630518143436557, + "learning_rate": 4.945720657891975e-06, + "loss": 0.7896, + "step": 554 + }, + { + "epoch": 0.27553680029787764, + "grad_norm": 0.13452520454118158, + "learning_rate": 4.945517916442971e-06, + "loss": 0.82, + "step": 555 + }, + { + "epoch": 0.276033263001117, + "grad_norm": 0.15078185753975815, + "learning_rate": 4.945314801235766e-06, + "loss": 0.7977, + "step": 556 + }, + { + "epoch": 0.27652972570435647, + "grad_norm": 0.13571286379008968, + "learning_rate": 4.9451113123014e-06, + "loss": 0.8183, + "step": 557 + }, + { + "epoch": 0.27702618840759585, + "grad_norm": 0.14134176469144435, + "learning_rate": 4.9449074496709756e-06, + "loss": 0.8391, + "step": 558 + }, + { + "epoch": 0.2775226511108353, + "grad_norm": 0.13575461049209747, + "learning_rate": 4.944703213375648e-06, + "loss": 0.7913, + "step": 559 + }, + { + "epoch": 0.27801911381407474, + "grad_norm": 0.13627699697738124, + "learning_rate": 4.944498603446633e-06, + "loss": 0.7737, + "step": 560 + }, + { + "epoch": 0.2785155765173141, + "grad_norm": 0.1436068512980607, + "learning_rate": 4.9442936199152e-06, + "loss": 0.7665, + "step": 561 + }, + { + "epoch": 0.27901203922055356, + "grad_norm": 0.1419116978490336, + "learning_rate": 4.944088262812679e-06, + "loss": 0.8012, + "step": 562 + }, + { + "epoch": 0.27950850192379295, + "grad_norm": 0.14345533439447763, + "learning_rate": 4.943882532170454e-06, + "loss": 0.7487, + "step": 563 + }, + { + "epoch": 0.2800049646270324, + "grad_norm": 0.1482619360446857, + "learning_rate": 4.94367642801997e-06, + "loss": 0.7929, + "step": 564 + }, + { + "epoch": 0.28050142733027184, + "grad_norm": 0.1379616939658588, + "learning_rate": 4.943469950392724e-06, + "loss": 0.7792, + "step": 565 + }, + { + "epoch": 0.2809978900335112, + "grad_norm": 0.14304974445218507, + "learning_rate": 4.943263099320275e-06, + "loss": 0.7879, + "step": 566 + }, + { + "epoch": 0.28149435273675066, + "grad_norm": 0.1428969404097643, + "learning_rate": 4.943055874834236e-06, + "loss": 0.7217, + "step": 567 + }, + { + "epoch": 0.28199081543999005, + "grad_norm": 0.13838491143778672, + "learning_rate": 4.942848276966278e-06, + "loss": 0.7751, + "step": 568 + }, + { + "epoch": 0.2824872781432295, + "grad_norm": 0.13637716635230512, + "learning_rate": 4.942640305748128e-06, + "loss": 0.7978, + "step": 569 + }, + { + "epoch": 0.28298374084646893, + "grad_norm": 0.14002084365223352, + "learning_rate": 4.942431961211573e-06, + "loss": 0.8041, + "step": 570 + }, + { + "epoch": 0.2834802035497083, + "grad_norm": 0.13976480972292757, + "learning_rate": 4.942223243388454e-06, + "loss": 0.801, + "step": 571 + }, + { + "epoch": 0.28397666625294776, + "grad_norm": 0.14031361844161488, + "learning_rate": 4.9420141523106705e-06, + "loss": 0.7837, + "step": 572 + }, + { + "epoch": 0.28447312895618715, + "grad_norm": 0.14188180474957363, + "learning_rate": 4.941804688010178e-06, + "loss": 0.8129, + "step": 573 + }, + { + "epoch": 0.2849695916594266, + "grad_norm": 0.13739560633335732, + "learning_rate": 4.941594850518991e-06, + "loss": 0.8599, + "step": 574 + }, + { + "epoch": 0.285466054362666, + "grad_norm": 0.13161274663378267, + "learning_rate": 4.9413846398691775e-06, + "loss": 0.7992, + "step": 575 + }, + { + "epoch": 0.2859625170659054, + "grad_norm": 0.13571724186099784, + "learning_rate": 4.941174056092868e-06, + "loss": 0.7426, + "step": 576 + }, + { + "epoch": 0.28645897976914486, + "grad_norm": 0.14657992209581697, + "learning_rate": 4.940963099222244e-06, + "loss": 0.7474, + "step": 577 + }, + { + "epoch": 0.28695544247238425, + "grad_norm": 0.14054315894814753, + "learning_rate": 4.94075176928955e-06, + "loss": 0.8116, + "step": 578 + }, + { + "epoch": 0.2874519051756237, + "grad_norm": 0.14667038794719667, + "learning_rate": 4.940540066327082e-06, + "loss": 0.839, + "step": 579 + }, + { + "epoch": 0.2879483678788631, + "grad_norm": 0.13180298446137564, + "learning_rate": 4.940327990367196e-06, + "loss": 0.7865, + "step": 580 + }, + { + "epoch": 0.2884448305821025, + "grad_norm": 0.14864936755923036, + "learning_rate": 4.940115541442303e-06, + "loss": 0.7855, + "step": 581 + }, + { + "epoch": 0.28894129328534196, + "grad_norm": 0.13838905353793282, + "learning_rate": 4.939902719584875e-06, + "loss": 0.7736, + "step": 582 + }, + { + "epoch": 0.28943775598858135, + "grad_norm": 0.13826980328271227, + "learning_rate": 4.939689524827436e-06, + "loss": 0.7738, + "step": 583 + }, + { + "epoch": 0.2899342186918208, + "grad_norm": 0.13562760080033687, + "learning_rate": 4.939475957202572e-06, + "loss": 0.7803, + "step": 584 + }, + { + "epoch": 0.2904306813950602, + "grad_norm": 0.1477129197436419, + "learning_rate": 4.939262016742921e-06, + "loss": 0.8892, + "step": 585 + }, + { + "epoch": 0.2909271440982996, + "grad_norm": 0.13608868262673587, + "learning_rate": 4.939047703481182e-06, + "loss": 0.7787, + "step": 586 + }, + { + "epoch": 0.29142360680153906, + "grad_norm": 0.13532164943155106, + "learning_rate": 4.938833017450108e-06, + "loss": 0.7919, + "step": 587 + }, + { + "epoch": 0.29192006950477845, + "grad_norm": 0.1354528176328873, + "learning_rate": 4.938617958682511e-06, + "loss": 0.7187, + "step": 588 + }, + { + "epoch": 0.2924165322080179, + "grad_norm": 0.14287654528543972, + "learning_rate": 4.93840252721126e-06, + "loss": 0.7962, + "step": 589 + }, + { + "epoch": 0.2929129949112573, + "grad_norm": 0.13500520944212716, + "learning_rate": 4.9381867230692795e-06, + "loss": 0.7953, + "step": 590 + }, + { + "epoch": 0.2934094576144967, + "grad_norm": 0.13701507538430724, + "learning_rate": 4.937970546289551e-06, + "loss": 0.8274, + "step": 591 + }, + { + "epoch": 0.2939059203177361, + "grad_norm": 0.13546590818001467, + "learning_rate": 4.937753996905115e-06, + "loss": 0.7731, + "step": 592 + }, + { + "epoch": 0.29440238302097554, + "grad_norm": 0.13811700535991556, + "learning_rate": 4.937537074949067e-06, + "loss": 0.7647, + "step": 593 + }, + { + "epoch": 0.294898845724215, + "grad_norm": 0.12945879834966295, + "learning_rate": 4.937319780454559e-06, + "loss": 0.744, + "step": 594 + }, + { + "epoch": 0.2953953084274544, + "grad_norm": 0.13498731326365385, + "learning_rate": 4.937102113454803e-06, + "loss": 0.7759, + "step": 595 + }, + { + "epoch": 0.2958917711306938, + "grad_norm": 0.1292963266491359, + "learning_rate": 4.936884073983065e-06, + "loss": 0.7647, + "step": 596 + }, + { + "epoch": 0.2963882338339332, + "grad_norm": 0.1364683596899809, + "learning_rate": 4.9366656620726685e-06, + "loss": 0.782, + "step": 597 + }, + { + "epoch": 0.29688469653717264, + "grad_norm": 0.14354575937871145, + "learning_rate": 4.936446877756994e-06, + "loss": 0.7965, + "step": 598 + }, + { + "epoch": 0.2973811592404121, + "grad_norm": 0.14297046663924878, + "learning_rate": 4.936227721069481e-06, + "loss": 0.8565, + "step": 599 + }, + { + "epoch": 0.29787762194365147, + "grad_norm": 0.13515162331475536, + "learning_rate": 4.936008192043621e-06, + "loss": 0.7897, + "step": 600 + }, + { + "epoch": 0.2983740846468909, + "grad_norm": 0.14366874023945117, + "learning_rate": 4.935788290712969e-06, + "loss": 0.8526, + "step": 601 + }, + { + "epoch": 0.2988705473501303, + "grad_norm": 0.13501986218993212, + "learning_rate": 4.935568017111131e-06, + "loss": 0.7843, + "step": 602 + }, + { + "epoch": 0.29936701005336974, + "grad_norm": 0.13501818290266443, + "learning_rate": 4.935347371271772e-06, + "loss": 0.785, + "step": 603 + }, + { + "epoch": 0.2998634727566092, + "grad_norm": 0.1357480400677297, + "learning_rate": 4.9351263532286165e-06, + "loss": 0.809, + "step": 604 + }, + { + "epoch": 0.30035993545984857, + "grad_norm": 0.13963197307606626, + "learning_rate": 4.934904963015442e-06, + "loss": 0.7573, + "step": 605 + }, + { + "epoch": 0.300856398163088, + "grad_norm": 0.13904388544231225, + "learning_rate": 4.934683200666084e-06, + "loss": 0.76, + "step": 606 + }, + { + "epoch": 0.3013528608663274, + "grad_norm": 0.13273586015038175, + "learning_rate": 4.934461066214436e-06, + "loss": 0.777, + "step": 607 + }, + { + "epoch": 0.30184932356956684, + "grad_norm": 0.1410913256356068, + "learning_rate": 4.934238559694448e-06, + "loss": 0.8199, + "step": 608 + }, + { + "epoch": 0.3023457862728063, + "grad_norm": 0.13415630992181152, + "learning_rate": 4.9340156811401265e-06, + "loss": 0.8143, + "step": 609 + }, + { + "epoch": 0.30284224897604567, + "grad_norm": 0.1350060544192735, + "learning_rate": 4.9337924305855335e-06, + "loss": 0.7607, + "step": 610 + }, + { + "epoch": 0.3033387116792851, + "grad_norm": 0.13671649355566554, + "learning_rate": 4.933568808064791e-06, + "loss": 0.8062, + "step": 611 + }, + { + "epoch": 0.3038351743825245, + "grad_norm": 0.13241711189350516, + "learning_rate": 4.933344813612076e-06, + "loss": 0.7463, + "step": 612 + }, + { + "epoch": 0.30433163708576394, + "grad_norm": 0.145086762190584, + "learning_rate": 4.933120447261621e-06, + "loss": 0.8496, + "step": 613 + }, + { + "epoch": 0.3048280997890033, + "grad_norm": 0.13576833005384956, + "learning_rate": 4.932895709047719e-06, + "loss": 0.7619, + "step": 614 + }, + { + "epoch": 0.30532456249224277, + "grad_norm": 0.13080739610003392, + "learning_rate": 4.932670599004715e-06, + "loss": 0.7436, + "step": 615 + }, + { + "epoch": 0.3058210251954822, + "grad_norm": 0.13931345500666475, + "learning_rate": 4.932445117167016e-06, + "loss": 0.7806, + "step": 616 + }, + { + "epoch": 0.3063174878987216, + "grad_norm": 0.1403456162971794, + "learning_rate": 4.932219263569082e-06, + "loss": 0.8203, + "step": 617 + }, + { + "epoch": 0.30681395060196104, + "grad_norm": 0.14545314884277688, + "learning_rate": 4.93199303824543e-06, + "loss": 0.7991, + "step": 618 + }, + { + "epoch": 0.3073104133052004, + "grad_norm": 0.13361188513192082, + "learning_rate": 4.931766441230637e-06, + "loss": 0.7669, + "step": 619 + }, + { + "epoch": 0.30780687600843987, + "grad_norm": 0.14211532439526364, + "learning_rate": 4.931539472559335e-06, + "loss": 0.8197, + "step": 620 + }, + { + "epoch": 0.3083033387116793, + "grad_norm": 0.1384426265495238, + "learning_rate": 4.93131213226621e-06, + "loss": 0.7982, + "step": 621 + }, + { + "epoch": 0.3087998014149187, + "grad_norm": 0.13479886625937776, + "learning_rate": 4.931084420386009e-06, + "loss": 0.7696, + "step": 622 + }, + { + "epoch": 0.30929626411815814, + "grad_norm": 0.13591593582520675, + "learning_rate": 4.9308563369535335e-06, + "loss": 0.8021, + "step": 623 + }, + { + "epoch": 0.3097927268213975, + "grad_norm": 0.13924380730728358, + "learning_rate": 4.930627882003644e-06, + "loss": 0.8394, + "step": 624 + }, + { + "epoch": 0.31028918952463697, + "grad_norm": 0.13692960315786307, + "learning_rate": 4.930399055571253e-06, + "loss": 0.7905, + "step": 625 + }, + { + "epoch": 0.3107856522278764, + "grad_norm": 0.1339979894901406, + "learning_rate": 4.930169857691336e-06, + "loss": 0.7881, + "step": 626 + }, + { + "epoch": 0.3112821149311158, + "grad_norm": 0.1367431893133873, + "learning_rate": 4.929940288398921e-06, + "loss": 0.7887, + "step": 627 + }, + { + "epoch": 0.31177857763435524, + "grad_norm": 0.13841200775795232, + "learning_rate": 4.929710347729094e-06, + "loss": 0.7653, + "step": 628 + }, + { + "epoch": 0.3122750403375946, + "grad_norm": 0.13600960600331674, + "learning_rate": 4.929480035716997e-06, + "loss": 0.7811, + "step": 629 + }, + { + "epoch": 0.31277150304083406, + "grad_norm": 0.13653488024445395, + "learning_rate": 4.9292493523978315e-06, + "loss": 0.7919, + "step": 630 + }, + { + "epoch": 0.31326796574407345, + "grad_norm": 0.13176835024293457, + "learning_rate": 4.929018297806852e-06, + "loss": 0.7634, + "step": 631 + }, + { + "epoch": 0.3137644284473129, + "grad_norm": 0.1469495293000222, + "learning_rate": 4.928786871979372e-06, + "loss": 0.8497, + "step": 632 + }, + { + "epoch": 0.31426089115055234, + "grad_norm": 0.1431964358331162, + "learning_rate": 4.928555074950761e-06, + "loss": 0.8058, + "step": 633 + }, + { + "epoch": 0.3147573538537917, + "grad_norm": 0.13850195286656924, + "learning_rate": 4.928322906756446e-06, + "loss": 0.7735, + "step": 634 + }, + { + "epoch": 0.31525381655703116, + "grad_norm": 0.13463773472502238, + "learning_rate": 4.92809036743191e-06, + "loss": 0.7184, + "step": 635 + }, + { + "epoch": 0.31575027926027055, + "grad_norm": 0.1364605714313478, + "learning_rate": 4.927857457012693e-06, + "loss": 0.7585, + "step": 636 + }, + { + "epoch": 0.31624674196351, + "grad_norm": 0.1366334427972066, + "learning_rate": 4.927624175534391e-06, + "loss": 0.7739, + "step": 637 + }, + { + "epoch": 0.31674320466674943, + "grad_norm": 0.13226666489304473, + "learning_rate": 4.927390523032658e-06, + "loss": 0.7801, + "step": 638 + }, + { + "epoch": 0.3172396673699888, + "grad_norm": 0.1321397495808711, + "learning_rate": 4.927156499543203e-06, + "loss": 0.7691, + "step": 639 + }, + { + "epoch": 0.31773613007322826, + "grad_norm": 0.14417418984304473, + "learning_rate": 4.926922105101795e-06, + "loss": 0.8335, + "step": 640 + }, + { + "epoch": 0.31823259277646765, + "grad_norm": 0.1373833193656603, + "learning_rate": 4.926687339744255e-06, + "loss": 0.8057, + "step": 641 + }, + { + "epoch": 0.3187290554797071, + "grad_norm": 0.13780220647272026, + "learning_rate": 4.926452203506464e-06, + "loss": 0.8188, + "step": 642 + }, + { + "epoch": 0.31922551818294653, + "grad_norm": 0.1304815912554088, + "learning_rate": 4.926216696424359e-06, + "loss": 0.7549, + "step": 643 + }, + { + "epoch": 0.3197219808861859, + "grad_norm": 0.14230394590112047, + "learning_rate": 4.9259808185339344e-06, + "loss": 0.8688, + "step": 644 + }, + { + "epoch": 0.32021844358942536, + "grad_norm": 0.13197144711432499, + "learning_rate": 4.925744569871238e-06, + "loss": 0.7638, + "step": 645 + }, + { + "epoch": 0.32071490629266475, + "grad_norm": 0.1377381360289958, + "learning_rate": 4.925507950472378e-06, + "loss": 0.7865, + "step": 646 + }, + { + "epoch": 0.3212113689959042, + "grad_norm": 0.14064672696348562, + "learning_rate": 4.9252709603735184e-06, + "loss": 0.8655, + "step": 647 + }, + { + "epoch": 0.3217078316991436, + "grad_norm": 0.17099346992203562, + "learning_rate": 4.925033599610879e-06, + "loss": 0.7557, + "step": 648 + }, + { + "epoch": 0.322204294402383, + "grad_norm": 0.13649366857460205, + "learning_rate": 4.9247958682207365e-06, + "loss": 0.8075, + "step": 649 + }, + { + "epoch": 0.32270075710562246, + "grad_norm": 0.13376375859315848, + "learning_rate": 4.924557766239424e-06, + "loss": 0.8125, + "step": 650 + }, + { + "epoch": 0.32319721980886185, + "grad_norm": 0.1395328981536072, + "learning_rate": 4.9243192937033304e-06, + "loss": 0.7883, + "step": 651 + }, + { + "epoch": 0.3236936825121013, + "grad_norm": 0.13819391735551326, + "learning_rate": 4.924080450648905e-06, + "loss": 0.8368, + "step": 652 + }, + { + "epoch": 0.3241901452153407, + "grad_norm": 0.13868224090210812, + "learning_rate": 4.92384123711265e-06, + "loss": 0.7756, + "step": 653 + }, + { + "epoch": 0.3246866079185801, + "grad_norm": 0.13995057823515514, + "learning_rate": 4.923601653131125e-06, + "loss": 0.7953, + "step": 654 + }, + { + "epoch": 0.32518307062181956, + "grad_norm": 0.14384219577990262, + "learning_rate": 4.923361698740946e-06, + "loss": 0.7864, + "step": 655 + }, + { + "epoch": 0.32567953332505895, + "grad_norm": 0.1359832308842288, + "learning_rate": 4.923121373978789e-06, + "loss": 0.7315, + "step": 656 + }, + { + "epoch": 0.3261759960282984, + "grad_norm": 0.1332816844154388, + "learning_rate": 4.92288067888138e-06, + "loss": 0.771, + "step": 657 + }, + { + "epoch": 0.3266724587315378, + "grad_norm": 0.13971086373297162, + "learning_rate": 4.922639613485508e-06, + "loss": 0.7904, + "step": 658 + }, + { + "epoch": 0.3271689214347772, + "grad_norm": 0.13132115501434735, + "learning_rate": 4.922398177828015e-06, + "loss": 0.7416, + "step": 659 + }, + { + "epoch": 0.32766538413801666, + "grad_norm": 0.13992347458321633, + "learning_rate": 4.9221563719458e-06, + "loss": 0.814, + "step": 660 + }, + { + "epoch": 0.32816184684125604, + "grad_norm": 0.13633398687640888, + "learning_rate": 4.921914195875821e-06, + "loss": 0.7611, + "step": 661 + }, + { + "epoch": 0.3286583095444955, + "grad_norm": 0.14882463890187234, + "learning_rate": 4.921671649655088e-06, + "loss": 0.7434, + "step": 662 + }, + { + "epoch": 0.3291547722477349, + "grad_norm": 0.13772949607044616, + "learning_rate": 4.921428733320674e-06, + "loss": 0.7717, + "step": 663 + }, + { + "epoch": 0.3296512349509743, + "grad_norm": 0.1414530607047641, + "learning_rate": 4.921185446909702e-06, + "loss": 0.782, + "step": 664 + }, + { + "epoch": 0.3301476976542137, + "grad_norm": 0.14471648591939731, + "learning_rate": 4.920941790459355e-06, + "loss": 0.7662, + "step": 665 + }, + { + "epoch": 0.33064416035745314, + "grad_norm": 0.14485133167971112, + "learning_rate": 4.920697764006872e-06, + "loss": 0.8213, + "step": 666 + }, + { + "epoch": 0.3311406230606926, + "grad_norm": 0.14192892706466312, + "learning_rate": 4.920453367589548e-06, + "loss": 0.8613, + "step": 667 + }, + { + "epoch": 0.33163708576393197, + "grad_norm": 0.14207950507096373, + "learning_rate": 4.920208601244737e-06, + "loss": 0.826, + "step": 668 + }, + { + "epoch": 0.3321335484671714, + "grad_norm": 0.14408452937358693, + "learning_rate": 4.919963465009846e-06, + "loss": 0.7818, + "step": 669 + }, + { + "epoch": 0.3326300111704108, + "grad_norm": 0.13736006931932224, + "learning_rate": 4.919717958922341e-06, + "loss": 0.7589, + "step": 670 + }, + { + "epoch": 0.33312647387365024, + "grad_norm": 0.14298168063992314, + "learning_rate": 4.919472083019743e-06, + "loss": 0.8034, + "step": 671 + }, + { + "epoch": 0.3336229365768897, + "grad_norm": 0.1376542857034229, + "learning_rate": 4.91922583733963e-06, + "loss": 0.7863, + "step": 672 + }, + { + "epoch": 0.33411939928012907, + "grad_norm": 0.1390010637369658, + "learning_rate": 4.918979221919637e-06, + "loss": 0.842, + "step": 673 + }, + { + "epoch": 0.3346158619833685, + "grad_norm": 0.14484040584209332, + "learning_rate": 4.918732236797456e-06, + "loss": 0.7629, + "step": 674 + }, + { + "epoch": 0.3351123246866079, + "grad_norm": 0.13890387799381465, + "learning_rate": 4.918484882010833e-06, + "loss": 0.778, + "step": 675 + }, + { + "epoch": 0.33560878738984734, + "grad_norm": 0.13863969701935344, + "learning_rate": 4.918237157597574e-06, + "loss": 0.8279, + "step": 676 + }, + { + "epoch": 0.3361052500930868, + "grad_norm": 0.1426640315283534, + "learning_rate": 4.917989063595539e-06, + "loss": 0.8762, + "step": 677 + }, + { + "epoch": 0.33660171279632617, + "grad_norm": 0.13500863262637763, + "learning_rate": 4.917740600042645e-06, + "loss": 0.7536, + "step": 678 + }, + { + "epoch": 0.3370981754995656, + "grad_norm": 0.12795948317244496, + "learning_rate": 4.917491766976865e-06, + "loss": 0.7169, + "step": 679 + }, + { + "epoch": 0.337594638202805, + "grad_norm": 0.13710750398142116, + "learning_rate": 4.917242564436231e-06, + "loss": 0.738, + "step": 680 + }, + { + "epoch": 0.33809110090604444, + "grad_norm": 0.13190657220996582, + "learning_rate": 4.916992992458828e-06, + "loss": 0.7693, + "step": 681 + }, + { + "epoch": 0.3385875636092838, + "grad_norm": 0.13830630412972159, + "learning_rate": 4.9167430510828e-06, + "loss": 0.8153, + "step": 682 + }, + { + "epoch": 0.33908402631252327, + "grad_norm": 0.13304884196362807, + "learning_rate": 4.916492740346346e-06, + "loss": 0.7945, + "step": 683 + }, + { + "epoch": 0.3395804890157627, + "grad_norm": 0.12774428215124484, + "learning_rate": 4.916242060287723e-06, + "loss": 0.7052, + "step": 684 + }, + { + "epoch": 0.3400769517190021, + "grad_norm": 0.1265371082556476, + "learning_rate": 4.9159910109452416e-06, + "loss": 0.7483, + "step": 685 + }, + { + "epoch": 0.34057341442224154, + "grad_norm": 0.13343444419954362, + "learning_rate": 4.9157395923572716e-06, + "loss": 0.7653, + "step": 686 + }, + { + "epoch": 0.3410698771254809, + "grad_norm": 0.13166700692340655, + "learning_rate": 4.9154878045622385e-06, + "loss": 0.7587, + "step": 687 + }, + { + "epoch": 0.34156633982872037, + "grad_norm": 0.14076974443956647, + "learning_rate": 4.915235647598624e-06, + "loss": 0.7912, + "step": 688 + }, + { + "epoch": 0.3420628025319598, + "grad_norm": 0.12977776060064555, + "learning_rate": 4.914983121504966e-06, + "loss": 0.7723, + "step": 689 + }, + { + "epoch": 0.3425592652351992, + "grad_norm": 0.13812993400540244, + "learning_rate": 4.914730226319859e-06, + "loss": 0.7968, + "step": 690 + }, + { + "epoch": 0.34305572793843864, + "grad_norm": 0.14598669976661624, + "learning_rate": 4.914476962081954e-06, + "loss": 0.8009, + "step": 691 + }, + { + "epoch": 0.343552190641678, + "grad_norm": 0.13314085383077542, + "learning_rate": 4.9142233288299595e-06, + "loss": 0.8293, + "step": 692 + }, + { + "epoch": 0.34404865334491747, + "grad_norm": 0.14232577807503202, + "learning_rate": 4.9139693266026375e-06, + "loss": 0.7755, + "step": 693 + }, + { + "epoch": 0.3445451160481569, + "grad_norm": 0.13552034485159273, + "learning_rate": 4.91371495543881e-06, + "loss": 0.8046, + "step": 694 + }, + { + "epoch": 0.3450415787513963, + "grad_norm": 0.13920010636352773, + "learning_rate": 4.913460215377351e-06, + "loss": 0.7898, + "step": 695 + }, + { + "epoch": 0.34553804145463574, + "grad_norm": 0.1365082779225381, + "learning_rate": 4.9132051064571965e-06, + "loss": 0.7949, + "step": 696 + }, + { + "epoch": 0.3460345041578751, + "grad_norm": 0.13329039576470306, + "learning_rate": 4.912949628717334e-06, + "loss": 0.7691, + "step": 697 + }, + { + "epoch": 0.34653096686111456, + "grad_norm": 0.13821152204427162, + "learning_rate": 4.912693782196808e-06, + "loss": 0.782, + "step": 698 + }, + { + "epoch": 0.34702742956435395, + "grad_norm": 0.1351784114090911, + "learning_rate": 4.912437566934724e-06, + "loss": 0.7467, + "step": 699 + }, + { + "epoch": 0.3475238922675934, + "grad_norm": 0.13348674748078376, + "learning_rate": 4.912180982970237e-06, + "loss": 0.7762, + "step": 700 + }, + { + "epoch": 0.34802035497083283, + "grad_norm": 0.12985538825056495, + "learning_rate": 4.911924030342563e-06, + "loss": 0.7902, + "step": 701 + }, + { + "epoch": 0.3485168176740722, + "grad_norm": 0.1368260578507066, + "learning_rate": 4.911666709090974e-06, + "loss": 0.7878, + "step": 702 + }, + { + "epoch": 0.34901328037731166, + "grad_norm": 0.1360969813881036, + "learning_rate": 4.911409019254797e-06, + "loss": 0.7653, + "step": 703 + }, + { + "epoch": 0.34950974308055105, + "grad_norm": 0.13601859820292503, + "learning_rate": 4.911150960873414e-06, + "loss": 0.7416, + "step": 704 + }, + { + "epoch": 0.3500062057837905, + "grad_norm": 0.13523409685451968, + "learning_rate": 4.910892533986268e-06, + "loss": 0.7683, + "step": 705 + }, + { + "epoch": 0.35050266848702993, + "grad_norm": 0.1307299504782071, + "learning_rate": 4.9106337386328524e-06, + "loss": 0.7706, + "step": 706 + }, + { + "epoch": 0.3509991311902693, + "grad_norm": 0.1324922818188745, + "learning_rate": 4.910374574852722e-06, + "loss": 0.7753, + "step": 707 + }, + { + "epoch": 0.35149559389350876, + "grad_norm": 0.13283375926300728, + "learning_rate": 4.910115042685486e-06, + "loss": 0.7988, + "step": 708 + }, + { + "epoch": 0.35199205659674815, + "grad_norm": 0.1364833655942737, + "learning_rate": 4.909855142170809e-06, + "loss": 0.7662, + "step": 709 + }, + { + "epoch": 0.3524885192999876, + "grad_norm": 0.13240082659711724, + "learning_rate": 4.909594873348412e-06, + "loss": 0.7692, + "step": 710 + }, + { + "epoch": 0.35298498200322703, + "grad_norm": 0.13859318485445407, + "learning_rate": 4.909334236258073e-06, + "loss": 0.8092, + "step": 711 + }, + { + "epoch": 0.3534814447064664, + "grad_norm": 0.14669171230394829, + "learning_rate": 4.909073230939628e-06, + "loss": 0.849, + "step": 712 + }, + { + "epoch": 0.35397790740970586, + "grad_norm": 0.1329208698752514, + "learning_rate": 4.908811857432966e-06, + "loss": 0.8112, + "step": 713 + }, + { + "epoch": 0.35447437011294525, + "grad_norm": 0.13420771737338105, + "learning_rate": 4.908550115778032e-06, + "loss": 0.8081, + "step": 714 + }, + { + "epoch": 0.3549708328161847, + "grad_norm": 0.13869844814379267, + "learning_rate": 4.908288006014833e-06, + "loss": 0.7924, + "step": 715 + }, + { + "epoch": 0.3554672955194241, + "grad_norm": 0.13159030247433556, + "learning_rate": 4.9080255281834255e-06, + "loss": 0.7896, + "step": 716 + }, + { + "epoch": 0.3559637582226635, + "grad_norm": 0.12782379383553066, + "learning_rate": 4.907762682323926e-06, + "loss": 0.7492, + "step": 717 + }, + { + "epoch": 0.35646022092590296, + "grad_norm": 0.12984107500991426, + "learning_rate": 4.907499468476506e-06, + "loss": 0.7713, + "step": 718 + }, + { + "epoch": 0.35695668362914235, + "grad_norm": 0.13563855815235779, + "learning_rate": 4.907235886681394e-06, + "loss": 0.7198, + "step": 719 + }, + { + "epoch": 0.3574531463323818, + "grad_norm": 0.1410105313395293, + "learning_rate": 4.906971936978874e-06, + "loss": 0.7584, + "step": 720 + }, + { + "epoch": 0.3579496090356212, + "grad_norm": 0.13491737996113276, + "learning_rate": 4.906707619409285e-06, + "loss": 0.8048, + "step": 721 + }, + { + "epoch": 0.3584460717388606, + "grad_norm": 0.15901372063337899, + "learning_rate": 4.906442934013026e-06, + "loss": 0.8157, + "step": 722 + }, + { + "epoch": 0.35894253444210006, + "grad_norm": 0.1318933579561297, + "learning_rate": 4.906177880830548e-06, + "loss": 0.7624, + "step": 723 + }, + { + "epoch": 0.35943899714533945, + "grad_norm": 0.13581410637368269, + "learning_rate": 4.905912459902362e-06, + "loss": 0.8205, + "step": 724 + }, + { + "epoch": 0.3599354598485789, + "grad_norm": 0.1370308459267444, + "learning_rate": 4.905646671269032e-06, + "loss": 0.8298, + "step": 725 + }, + { + "epoch": 0.3604319225518183, + "grad_norm": 0.1358006289221788, + "learning_rate": 4.90538051497118e-06, + "loss": 0.8233, + "step": 726 + }, + { + "epoch": 0.3609283852550577, + "grad_norm": 0.1316316195921008, + "learning_rate": 4.905113991049484e-06, + "loss": 0.8134, + "step": 727 + }, + { + "epoch": 0.36142484795829716, + "grad_norm": 0.1335957967976374, + "learning_rate": 4.904847099544676e-06, + "loss": 0.7753, + "step": 728 + }, + { + "epoch": 0.36192131066153654, + "grad_norm": 0.13467968539337932, + "learning_rate": 4.904579840497549e-06, + "loss": 0.7678, + "step": 729 + }, + { + "epoch": 0.362417773364776, + "grad_norm": 0.13328087348761786, + "learning_rate": 4.904312213948948e-06, + "loss": 0.7422, + "step": 730 + }, + { + "epoch": 0.3629142360680154, + "grad_norm": 0.13791325862150497, + "learning_rate": 4.904044219939775e-06, + "loss": 0.81, + "step": 731 + }, + { + "epoch": 0.3634106987712548, + "grad_norm": 0.14664569154055215, + "learning_rate": 4.9037758585109886e-06, + "loss": 0.7829, + "step": 732 + }, + { + "epoch": 0.36390716147449426, + "grad_norm": 0.13819662122726, + "learning_rate": 4.9035071297036045e-06, + "loss": 0.7533, + "step": 733 + }, + { + "epoch": 0.36440362417773364, + "grad_norm": 0.14379447561879494, + "learning_rate": 4.903238033558692e-06, + "loss": 0.7812, + "step": 734 + }, + { + "epoch": 0.3649000868809731, + "grad_norm": 0.1348693038250325, + "learning_rate": 4.90296857011738e-06, + "loss": 0.7406, + "step": 735 + }, + { + "epoch": 0.36539654958421247, + "grad_norm": 0.13919538306325066, + "learning_rate": 4.90269873942085e-06, + "loss": 0.7716, + "step": 736 + }, + { + "epoch": 0.3658930122874519, + "grad_norm": 0.13768912705501787, + "learning_rate": 4.902428541510342e-06, + "loss": 0.7797, + "step": 737 + }, + { + "epoch": 0.3663894749906913, + "grad_norm": 0.13652014063796708, + "learning_rate": 4.902157976427152e-06, + "loss": 0.7549, + "step": 738 + }, + { + "epoch": 0.36688593769393074, + "grad_norm": 0.13524592628689325, + "learning_rate": 4.901887044212631e-06, + "loss": 0.7481, + "step": 739 + }, + { + "epoch": 0.3673824003971702, + "grad_norm": 0.1311044839852732, + "learning_rate": 4.9016157449081855e-06, + "loss": 0.8021, + "step": 740 + }, + { + "epoch": 0.36787886310040957, + "grad_norm": 0.13954525535721593, + "learning_rate": 4.901344078555282e-06, + "loss": 0.7548, + "step": 741 + }, + { + "epoch": 0.368375325803649, + "grad_norm": 0.136268642834534, + "learning_rate": 4.901072045195437e-06, + "loss": 0.7602, + "step": 742 + }, + { + "epoch": 0.3688717885068884, + "grad_norm": 0.23363764465279754, + "learning_rate": 4.90079964487023e-06, + "loss": 0.776, + "step": 743 + }, + { + "epoch": 0.36936825121012784, + "grad_norm": 0.13274057394053615, + "learning_rate": 4.90052687762129e-06, + "loss": 0.7351, + "step": 744 + }, + { + "epoch": 0.3698647139133673, + "grad_norm": 0.14127484700099766, + "learning_rate": 4.900253743490307e-06, + "loss": 0.7798, + "step": 745 + }, + { + "epoch": 0.37036117661660667, + "grad_norm": 0.12815346443371342, + "learning_rate": 4.8999802425190235e-06, + "loss": 0.7075, + "step": 746 + }, + { + "epoch": 0.3708576393198461, + "grad_norm": 0.1368536898030887, + "learning_rate": 4.899706374749242e-06, + "loss": 0.757, + "step": 747 + }, + { + "epoch": 0.3713541020230855, + "grad_norm": 0.133763511827003, + "learning_rate": 4.899432140222816e-06, + "loss": 0.807, + "step": 748 + }, + { + "epoch": 0.37185056472632494, + "grad_norm": 0.14086274715481756, + "learning_rate": 4.899157538981661e-06, + "loss": 0.7545, + "step": 749 + }, + { + "epoch": 0.3723470274295644, + "grad_norm": 0.13940663877796536, + "learning_rate": 4.898882571067742e-06, + "loss": 0.7895, + "step": 750 + }, + { + "epoch": 0.37284349013280377, + "grad_norm": 0.14364436254867233, + "learning_rate": 4.898607236523086e-06, + "loss": 0.824, + "step": 751 + }, + { + "epoch": 0.3733399528360432, + "grad_norm": 0.1363424647021028, + "learning_rate": 4.898331535389772e-06, + "loss": 0.7689, + "step": 752 + }, + { + "epoch": 0.3738364155392826, + "grad_norm": 0.134333700914737, + "learning_rate": 4.898055467709938e-06, + "loss": 0.781, + "step": 753 + }, + { + "epoch": 0.37433287824252204, + "grad_norm": 0.1354244860930839, + "learning_rate": 4.897779033525775e-06, + "loss": 0.7425, + "step": 754 + }, + { + "epoch": 0.3748293409457614, + "grad_norm": 0.13641351699049295, + "learning_rate": 4.897502232879533e-06, + "loss": 0.8106, + "step": 755 + }, + { + "epoch": 0.37532580364900087, + "grad_norm": 0.1397668428766841, + "learning_rate": 4.897225065813515e-06, + "loss": 0.7725, + "step": 756 + }, + { + "epoch": 0.3758222663522403, + "grad_norm": 0.1388989852014346, + "learning_rate": 4.896947532370083e-06, + "loss": 0.8095, + "step": 757 + }, + { + "epoch": 0.3763187290554797, + "grad_norm": 0.13452943247011598, + "learning_rate": 4.896669632591652e-06, + "loss": 0.7768, + "step": 758 + }, + { + "epoch": 0.37681519175871914, + "grad_norm": 0.13898005901785496, + "learning_rate": 4.896391366520695e-06, + "loss": 0.8469, + "step": 759 + }, + { + "epoch": 0.3773116544619585, + "grad_norm": 0.1344241818366532, + "learning_rate": 4.8961127341997425e-06, + "loss": 0.7871, + "step": 760 + }, + { + "epoch": 0.37780811716519797, + "grad_norm": 0.13570943105691186, + "learning_rate": 4.895833735671376e-06, + "loss": 0.7702, + "step": 761 + }, + { + "epoch": 0.3783045798684374, + "grad_norm": 0.1341854190460997, + "learning_rate": 4.895554370978238e-06, + "loss": 0.7409, + "step": 762 + }, + { + "epoch": 0.3788010425716768, + "grad_norm": 0.13050283705876012, + "learning_rate": 4.895274640163023e-06, + "loss": 0.7793, + "step": 763 + }, + { + "epoch": 0.37929750527491624, + "grad_norm": 0.1329571983648645, + "learning_rate": 4.894994543268486e-06, + "loss": 0.7539, + "step": 764 + }, + { + "epoch": 0.3797939679781556, + "grad_norm": 0.14453559821774697, + "learning_rate": 4.894714080337433e-06, + "loss": 0.8057, + "step": 765 + }, + { + "epoch": 0.38029043068139506, + "grad_norm": 0.14541141571134905, + "learning_rate": 4.894433251412729e-06, + "loss": 0.8173, + "step": 766 + }, + { + "epoch": 0.3807868933846345, + "grad_norm": 0.1365214842205778, + "learning_rate": 4.894152056537295e-06, + "loss": 0.7515, + "step": 767 + }, + { + "epoch": 0.3812833560878739, + "grad_norm": 0.13104231373031291, + "learning_rate": 4.893870495754106e-06, + "loss": 0.7388, + "step": 768 + }, + { + "epoch": 0.38177981879111333, + "grad_norm": 0.1293760148802387, + "learning_rate": 4.8935885691061955e-06, + "loss": 0.7202, + "step": 769 + }, + { + "epoch": 0.3822762814943527, + "grad_norm": 0.1413055453494607, + "learning_rate": 4.893306276636649e-06, + "loss": 0.7529, + "step": 770 + }, + { + "epoch": 0.38277274419759216, + "grad_norm": 0.13778756966339015, + "learning_rate": 4.893023618388612e-06, + "loss": 0.7854, + "step": 771 + }, + { + "epoch": 0.38326920690083155, + "grad_norm": 0.12932306420294923, + "learning_rate": 4.892740594405285e-06, + "loss": 0.7832, + "step": 772 + }, + { + "epoch": 0.383765669604071, + "grad_norm": 0.13397702650716775, + "learning_rate": 4.892457204729923e-06, + "loss": 0.8013, + "step": 773 + }, + { + "epoch": 0.38426213230731043, + "grad_norm": 0.14230173874593646, + "learning_rate": 4.892173449405837e-06, + "loss": 0.749, + "step": 774 + }, + { + "epoch": 0.3847585950105498, + "grad_norm": 0.13494471486274012, + "learning_rate": 4.891889328476395e-06, + "loss": 0.7644, + "step": 775 + }, + { + "epoch": 0.38525505771378926, + "grad_norm": 0.13174879824024627, + "learning_rate": 4.89160484198502e-06, + "loss": 0.7659, + "step": 776 + }, + { + "epoch": 0.38575152041702865, + "grad_norm": 0.1340875139550934, + "learning_rate": 4.891319989975191e-06, + "loss": 0.8022, + "step": 777 + }, + { + "epoch": 0.3862479831202681, + "grad_norm": 0.1317344878554217, + "learning_rate": 4.891034772490444e-06, + "loss": 0.7821, + "step": 778 + }, + { + "epoch": 0.38674444582350753, + "grad_norm": 0.12863325352249577, + "learning_rate": 4.890749189574369e-06, + "loss": 0.729, + "step": 779 + }, + { + "epoch": 0.3872409085267469, + "grad_norm": 0.13603104401195087, + "learning_rate": 4.8904632412706135e-06, + "loss": 0.7692, + "step": 780 + }, + { + "epoch": 0.38773737122998636, + "grad_norm": 0.1371314105411586, + "learning_rate": 4.890176927622879e-06, + "loss": 0.816, + "step": 781 + }, + { + "epoch": 0.38823383393322575, + "grad_norm": 0.13666348082090357, + "learning_rate": 4.889890248674926e-06, + "loss": 0.7781, + "step": 782 + }, + { + "epoch": 0.3887302966364652, + "grad_norm": 0.12972574048556007, + "learning_rate": 4.889603204470566e-06, + "loss": 0.7418, + "step": 783 + }, + { + "epoch": 0.38922675933970463, + "grad_norm": 0.1380447797265697, + "learning_rate": 4.889315795053671e-06, + "loss": 0.7666, + "step": 784 + }, + { + "epoch": 0.389723222042944, + "grad_norm": 0.1302714912154436, + "learning_rate": 4.889028020468167e-06, + "loss": 0.783, + "step": 785 + }, + { + "epoch": 0.39021968474618346, + "grad_norm": 0.1467555997648217, + "learning_rate": 4.8887398807580345e-06, + "loss": 0.7373, + "step": 786 + }, + { + "epoch": 0.39071614744942285, + "grad_norm": 0.13456233284077948, + "learning_rate": 4.888451375967313e-06, + "loss": 0.7843, + "step": 787 + }, + { + "epoch": 0.3912126101526623, + "grad_norm": 0.13579468578021064, + "learning_rate": 4.888162506140093e-06, + "loss": 0.7926, + "step": 788 + }, + { + "epoch": 0.3917090728559017, + "grad_norm": 0.13486742182263928, + "learning_rate": 4.887873271320526e-06, + "loss": 0.7848, + "step": 789 + }, + { + "epoch": 0.3922055355591411, + "grad_norm": 0.13060505166884362, + "learning_rate": 4.887583671552815e-06, + "loss": 0.7316, + "step": 790 + }, + { + "epoch": 0.39270199826238056, + "grad_norm": 0.13869981464559716, + "learning_rate": 4.887293706881224e-06, + "loss": 0.8121, + "step": 791 + }, + { + "epoch": 0.39319846096561994, + "grad_norm": 0.13166122616659146, + "learning_rate": 4.887003377350066e-06, + "loss": 0.7251, + "step": 792 + }, + { + "epoch": 0.3936949236688594, + "grad_norm": 0.14046352512611365, + "learning_rate": 4.886712683003715e-06, + "loss": 0.758, + "step": 793 + }, + { + "epoch": 0.3941913863720988, + "grad_norm": 0.14464252163844996, + "learning_rate": 4.886421623886598e-06, + "loss": 0.815, + "step": 794 + }, + { + "epoch": 0.3946878490753382, + "grad_norm": 0.13409840602053655, + "learning_rate": 4.886130200043199e-06, + "loss": 0.7752, + "step": 795 + }, + { + "epoch": 0.39518431177857766, + "grad_norm": 0.13116167627946584, + "learning_rate": 4.885838411518058e-06, + "loss": 0.7308, + "step": 796 + }, + { + "epoch": 0.39568077448181704, + "grad_norm": 0.1366342943883782, + "learning_rate": 4.885546258355769e-06, + "loss": 0.8026, + "step": 797 + }, + { + "epoch": 0.3961772371850565, + "grad_norm": 0.13731477100332815, + "learning_rate": 4.885253740600985e-06, + "loss": 0.7204, + "step": 798 + }, + { + "epoch": 0.39667369988829587, + "grad_norm": 0.14004464166577948, + "learning_rate": 4.88496085829841e-06, + "loss": 0.7729, + "step": 799 + }, + { + "epoch": 0.3971701625915353, + "grad_norm": 0.13691778866799595, + "learning_rate": 4.884667611492808e-06, + "loss": 0.7497, + "step": 800 + }, + { + "epoch": 0.39766662529477476, + "grad_norm": 0.13321987767176327, + "learning_rate": 4.884374000228998e-06, + "loss": 0.7415, + "step": 801 + }, + { + "epoch": 0.39816308799801414, + "grad_norm": 0.1398138150913019, + "learning_rate": 4.884080024551851e-06, + "loss": 0.7733, + "step": 802 + }, + { + "epoch": 0.3986595507012536, + "grad_norm": 0.1470854981776522, + "learning_rate": 4.8837856845062994e-06, + "loss": 0.7704, + "step": 803 + }, + { + "epoch": 0.39915601340449297, + "grad_norm": 0.13800002748758905, + "learning_rate": 4.883490980137327e-06, + "loss": 0.781, + "step": 804 + }, + { + "epoch": 0.3996524761077324, + "grad_norm": 0.13129465917522473, + "learning_rate": 4.883195911489974e-06, + "loss": 0.7406, + "step": 805 + }, + { + "epoch": 0.4001489388109718, + "grad_norm": 0.13483055098605387, + "learning_rate": 4.882900478609338e-06, + "loss": 0.7982, + "step": 806 + }, + { + "epoch": 0.40064540151421124, + "grad_norm": 0.13355656690282036, + "learning_rate": 4.8826046815405705e-06, + "loss": 0.8013, + "step": 807 + }, + { + "epoch": 0.4011418642174507, + "grad_norm": 0.13207774630124497, + "learning_rate": 4.882308520328879e-06, + "loss": 0.7445, + "step": 808 + }, + { + "epoch": 0.40163832692069007, + "grad_norm": 0.13638111186259064, + "learning_rate": 4.882011995019529e-06, + "loss": 0.7928, + "step": 809 + }, + { + "epoch": 0.4021347896239295, + "grad_norm": 0.13865080146765882, + "learning_rate": 4.881715105657837e-06, + "loss": 0.7972, + "step": 810 + }, + { + "epoch": 0.4026312523271689, + "grad_norm": 0.14025214556857968, + "learning_rate": 4.88141785228918e-06, + "loss": 0.8052, + "step": 811 + }, + { + "epoch": 0.40312771503040834, + "grad_norm": 0.13674600726600722, + "learning_rate": 4.881120234958986e-06, + "loss": 0.8034, + "step": 812 + }, + { + "epoch": 0.4036241777336478, + "grad_norm": 0.1360386191646674, + "learning_rate": 4.8808222537127436e-06, + "loss": 0.7882, + "step": 813 + }, + { + "epoch": 0.40412064043688717, + "grad_norm": 0.14178158974636348, + "learning_rate": 4.8805239085959936e-06, + "loss": 0.781, + "step": 814 + }, + { + "epoch": 0.4046171031401266, + "grad_norm": 0.1352509016655136, + "learning_rate": 4.880225199654331e-06, + "loss": 0.7834, + "step": 815 + }, + { + "epoch": 0.405113565843366, + "grad_norm": 0.13284596151325054, + "learning_rate": 4.879926126933412e-06, + "loss": 0.7846, + "step": 816 + }, + { + "epoch": 0.40561002854660544, + "grad_norm": 0.13744074794376318, + "learning_rate": 4.8796266904789445e-06, + "loss": 0.7567, + "step": 817 + }, + { + "epoch": 0.4061064912498449, + "grad_norm": 0.13859156903074973, + "learning_rate": 4.8793268903366905e-06, + "loss": 0.793, + "step": 818 + }, + { + "epoch": 0.40660295395308427, + "grad_norm": 0.14032883681460276, + "learning_rate": 4.879026726552471e-06, + "loss": 0.8204, + "step": 819 + }, + { + "epoch": 0.4070994166563237, + "grad_norm": 0.13375088164936702, + "learning_rate": 4.878726199172162e-06, + "loss": 0.7397, + "step": 820 + }, + { + "epoch": 0.4075958793595631, + "grad_norm": 0.13828272771245217, + "learning_rate": 4.878425308241693e-06, + "loss": 0.7657, + "step": 821 + }, + { + "epoch": 0.40809234206280254, + "grad_norm": 0.14138136285113104, + "learning_rate": 4.87812405380705e-06, + "loss": 0.7708, + "step": 822 + }, + { + "epoch": 0.4085888047660419, + "grad_norm": 0.13690922786942494, + "learning_rate": 4.8778224359142775e-06, + "loss": 0.7559, + "step": 823 + }, + { + "epoch": 0.40908526746928137, + "grad_norm": 0.1428920733606056, + "learning_rate": 4.87752045460947e-06, + "loss": 0.7543, + "step": 824 + }, + { + "epoch": 0.4095817301725208, + "grad_norm": 0.13259517168171056, + "learning_rate": 4.877218109938782e-06, + "loss": 0.7468, + "step": 825 + }, + { + "epoch": 0.4100781928757602, + "grad_norm": 0.13286837812829952, + "learning_rate": 4.876915401948421e-06, + "loss": 0.7498, + "step": 826 + }, + { + "epoch": 0.41057465557899964, + "grad_norm": 0.1505604133945408, + "learning_rate": 4.876612330684652e-06, + "loss": 0.8083, + "step": 827 + }, + { + "epoch": 0.411071118282239, + "grad_norm": 0.14013330639827198, + "learning_rate": 4.876308896193795e-06, + "loss": 0.7805, + "step": 828 + }, + { + "epoch": 0.41156758098547847, + "grad_norm": 0.141751989941827, + "learning_rate": 4.876005098522224e-06, + "loss": 0.789, + "step": 829 + }, + { + "epoch": 0.4120640436887179, + "grad_norm": 0.1439633625106176, + "learning_rate": 4.87570093771637e-06, + "loss": 0.7825, + "step": 830 + }, + { + "epoch": 0.4125605063919573, + "grad_norm": 0.13578927525549853, + "learning_rate": 4.875396413822719e-06, + "loss": 0.7434, + "step": 831 + }, + { + "epoch": 0.41305696909519674, + "grad_norm": 0.14301935596353957, + "learning_rate": 4.875091526887813e-06, + "loss": 0.7866, + "step": 832 + }, + { + "epoch": 0.4135534317984361, + "grad_norm": 0.13962888610646845, + "learning_rate": 4.8747862769582485e-06, + "loss": 0.797, + "step": 833 + }, + { + "epoch": 0.41404989450167556, + "grad_norm": 0.13762036122185398, + "learning_rate": 4.874480664080679e-06, + "loss": 0.7647, + "step": 834 + }, + { + "epoch": 0.414546357204915, + "grad_norm": 0.14874331327029203, + "learning_rate": 4.874174688301811e-06, + "loss": 0.7742, + "step": 835 + }, + { + "epoch": 0.4150428199081544, + "grad_norm": 0.14145690460721833, + "learning_rate": 4.873868349668409e-06, + "loss": 0.7794, + "step": 836 + }, + { + "epoch": 0.41553928261139383, + "grad_norm": 0.13516936437887206, + "learning_rate": 4.873561648227292e-06, + "loss": 0.772, + "step": 837 + }, + { + "epoch": 0.4160357453146332, + "grad_norm": 0.13997107273973253, + "learning_rate": 4.873254584025335e-06, + "loss": 0.7802, + "step": 838 + }, + { + "epoch": 0.41653220801787266, + "grad_norm": 0.15576247682108174, + "learning_rate": 4.8729471571094665e-06, + "loss": 0.8451, + "step": 839 + }, + { + "epoch": 0.41702867072111205, + "grad_norm": 0.1391550838396009, + "learning_rate": 4.872639367526672e-06, + "loss": 0.74, + "step": 840 + }, + { + "epoch": 0.4175251334243515, + "grad_norm": 0.13906947337724887, + "learning_rate": 4.872331215323993e-06, + "loss": 0.7883, + "step": 841 + }, + { + "epoch": 0.41802159612759093, + "grad_norm": 0.1369675811107513, + "learning_rate": 4.872022700548525e-06, + "loss": 0.7384, + "step": 842 + }, + { + "epoch": 0.4185180588308303, + "grad_norm": 0.1443749383496745, + "learning_rate": 4.87171382324742e-06, + "loss": 0.773, + "step": 843 + }, + { + "epoch": 0.41901452153406976, + "grad_norm": 0.13524935011822023, + "learning_rate": 4.871404583467884e-06, + "loss": 0.8338, + "step": 844 + }, + { + "epoch": 0.41951098423730915, + "grad_norm": 0.13600833744156696, + "learning_rate": 4.8710949812571805e-06, + "loss": 0.7715, + "step": 845 + }, + { + "epoch": 0.4200074469405486, + "grad_norm": 0.13910800980465646, + "learning_rate": 4.870785016662627e-06, + "loss": 0.7653, + "step": 846 + }, + { + "epoch": 0.42050390964378803, + "grad_norm": 0.14716154207375018, + "learning_rate": 4.870474689731596e-06, + "loss": 0.7753, + "step": 847 + }, + { + "epoch": 0.4210003723470274, + "grad_norm": 0.13651844849366374, + "learning_rate": 4.870164000511516e-06, + "loss": 0.7252, + "step": 848 + }, + { + "epoch": 0.42149683505026686, + "grad_norm": 0.13107938750960044, + "learning_rate": 4.869852949049872e-06, + "loss": 0.7403, + "step": 849 + }, + { + "epoch": 0.42199329775350625, + "grad_norm": 0.1405132580685121, + "learning_rate": 4.8695415353942025e-06, + "loss": 0.7816, + "step": 850 + }, + { + "epoch": 0.4224897604567457, + "grad_norm": 0.16019940582846015, + "learning_rate": 4.869229759592101e-06, + "loss": 0.7977, + "step": 851 + }, + { + "epoch": 0.42298622315998513, + "grad_norm": 0.1386861474814064, + "learning_rate": 4.868917621691219e-06, + "loss": 0.8859, + "step": 852 + }, + { + "epoch": 0.4234826858632245, + "grad_norm": 0.1438580035371236, + "learning_rate": 4.868605121739261e-06, + "loss": 0.7708, + "step": 853 + }, + { + "epoch": 0.42397914856646396, + "grad_norm": 0.13851415036446404, + "learning_rate": 4.868292259783988e-06, + "loss": 0.7379, + "step": 854 + }, + { + "epoch": 0.42447561126970335, + "grad_norm": 0.13928629569080855, + "learning_rate": 4.867979035873216e-06, + "loss": 0.7663, + "step": 855 + }, + { + "epoch": 0.4249720739729428, + "grad_norm": 0.1326009455578934, + "learning_rate": 4.8676654500548156e-06, + "loss": 0.7432, + "step": 856 + }, + { + "epoch": 0.4254685366761822, + "grad_norm": 0.13141434025922472, + "learning_rate": 4.867351502376714e-06, + "loss": 0.7558, + "step": 857 + }, + { + "epoch": 0.4259649993794216, + "grad_norm": 0.1301537847703651, + "learning_rate": 4.867037192886893e-06, + "loss": 0.7003, + "step": 858 + }, + { + "epoch": 0.42646146208266106, + "grad_norm": 0.13958306777392412, + "learning_rate": 4.866722521633389e-06, + "loss": 0.8252, + "step": 859 + }, + { + "epoch": 0.42695792478590044, + "grad_norm": 0.1371175213676672, + "learning_rate": 4.866407488664296e-06, + "loss": 0.7807, + "step": 860 + }, + { + "epoch": 0.4274543874891399, + "grad_norm": 0.1326784777780711, + "learning_rate": 4.866092094027761e-06, + "loss": 0.7701, + "step": 861 + }, + { + "epoch": 0.4279508501923793, + "grad_norm": 0.13589123146778564, + "learning_rate": 4.865776337771986e-06, + "loss": 0.7374, + "step": 862 + }, + { + "epoch": 0.4284473128956187, + "grad_norm": 0.13307442584248397, + "learning_rate": 4.86546021994523e-06, + "loss": 0.7581, + "step": 863 + }, + { + "epoch": 0.42894377559885816, + "grad_norm": 0.12935156456607097, + "learning_rate": 4.865143740595807e-06, + "loss": 0.7259, + "step": 864 + }, + { + "epoch": 0.42944023830209754, + "grad_norm": 0.13856977776450893, + "learning_rate": 4.864826899772086e-06, + "loss": 0.7921, + "step": 865 + }, + { + "epoch": 0.429936701005337, + "grad_norm": 0.13098700749992728, + "learning_rate": 4.864509697522489e-06, + "loss": 0.7581, + "step": 866 + }, + { + "epoch": 0.43043316370857637, + "grad_norm": 0.1277394417952265, + "learning_rate": 4.864192133895499e-06, + "loss": 0.7424, + "step": 867 + }, + { + "epoch": 0.4309296264118158, + "grad_norm": 0.1297413964613426, + "learning_rate": 4.8638742089396464e-06, + "loss": 0.7869, + "step": 868 + }, + { + "epoch": 0.43142608911505526, + "grad_norm": 0.1395569604985153, + "learning_rate": 4.863555922703523e-06, + "loss": 0.7837, + "step": 869 + }, + { + "epoch": 0.43192255181829464, + "grad_norm": 0.14789922672713987, + "learning_rate": 4.863237275235774e-06, + "loss": 0.8032, + "step": 870 + }, + { + "epoch": 0.4324190145215341, + "grad_norm": 0.148230222225613, + "learning_rate": 4.8629182665850995e-06, + "loss": 0.7581, + "step": 871 + }, + { + "epoch": 0.43291547722477347, + "grad_norm": 0.13520814483911175, + "learning_rate": 4.862598896800254e-06, + "loss": 0.7961, + "step": 872 + }, + { + "epoch": 0.4334119399280129, + "grad_norm": 0.1400523186641088, + "learning_rate": 4.862279165930049e-06, + "loss": 0.7443, + "step": 873 + }, + { + "epoch": 0.43390840263125235, + "grad_norm": 0.14000148595084386, + "learning_rate": 4.861959074023348e-06, + "loss": 0.7092, + "step": 874 + }, + { + "epoch": 0.43440486533449174, + "grad_norm": 0.13363969656101354, + "learning_rate": 4.8616386211290755e-06, + "loss": 0.7532, + "step": 875 + }, + { + "epoch": 0.4349013280377312, + "grad_norm": 0.1338415278751024, + "learning_rate": 4.861317807296205e-06, + "loss": 0.7598, + "step": 876 + }, + { + "epoch": 0.43539779074097057, + "grad_norm": 0.13445264018650427, + "learning_rate": 4.860996632573769e-06, + "loss": 0.7517, + "step": 877 + }, + { + "epoch": 0.43589425344421, + "grad_norm": 0.13560845548955244, + "learning_rate": 4.860675097010853e-06, + "loss": 0.7565, + "step": 878 + }, + { + "epoch": 0.4363907161474494, + "grad_norm": 0.14803665445404562, + "learning_rate": 4.860353200656599e-06, + "loss": 0.8122, + "step": 879 + }, + { + "epoch": 0.43688717885068884, + "grad_norm": 0.1373213861484435, + "learning_rate": 4.860030943560204e-06, + "loss": 0.7541, + "step": 880 + }, + { + "epoch": 0.4373836415539283, + "grad_norm": 0.12955793875119787, + "learning_rate": 4.859708325770919e-06, + "loss": 0.7502, + "step": 881 + }, + { + "epoch": 0.43788010425716767, + "grad_norm": 0.1441160209192175, + "learning_rate": 4.859385347338052e-06, + "loss": 0.7734, + "step": 882 + }, + { + "epoch": 0.4383765669604071, + "grad_norm": 0.13942948203768102, + "learning_rate": 4.8590620083109645e-06, + "loss": 0.7528, + "step": 883 + }, + { + "epoch": 0.4388730296636465, + "grad_norm": 0.14399699173850394, + "learning_rate": 4.858738308739073e-06, + "loss": 0.7628, + "step": 884 + }, + { + "epoch": 0.43936949236688594, + "grad_norm": 0.137394867443982, + "learning_rate": 4.858414248671851e-06, + "loss": 0.8637, + "step": 885 + }, + { + "epoch": 0.4398659550701254, + "grad_norm": 0.14448436375711424, + "learning_rate": 4.8580898281588255e-06, + "loss": 0.8147, + "step": 886 + }, + { + "epoch": 0.44036241777336477, + "grad_norm": 0.1341227002790187, + "learning_rate": 4.8577650472495785e-06, + "loss": 0.7602, + "step": 887 + }, + { + "epoch": 0.4408588804766042, + "grad_norm": 0.14377468127267348, + "learning_rate": 4.857439905993748e-06, + "loss": 0.7913, + "step": 888 + }, + { + "epoch": 0.4413553431798436, + "grad_norm": 0.13340052111413453, + "learning_rate": 4.857114404441027e-06, + "loss": 0.7482, + "step": 889 + }, + { + "epoch": 0.44185180588308304, + "grad_norm": 0.13574880212233634, + "learning_rate": 4.856788542641162e-06, + "loss": 0.7771, + "step": 890 + }, + { + "epoch": 0.4423482685863225, + "grad_norm": 0.1394549058341552, + "learning_rate": 4.856462320643957e-06, + "loss": 0.727, + "step": 891 + }, + { + "epoch": 0.44284473128956187, + "grad_norm": 0.16206005047704644, + "learning_rate": 4.856135738499269e-06, + "loss": 0.7901, + "step": 892 + }, + { + "epoch": 0.4433411939928013, + "grad_norm": 0.14512700342806611, + "learning_rate": 4.855808796257012e-06, + "loss": 0.8094, + "step": 893 + }, + { + "epoch": 0.4438376566960407, + "grad_norm": 0.1531915103698981, + "learning_rate": 4.855481493967152e-06, + "loss": 0.8711, + "step": 894 + }, + { + "epoch": 0.44433411939928014, + "grad_norm": 0.14141629017985097, + "learning_rate": 4.855153831679713e-06, + "loss": 0.7529, + "step": 895 + }, + { + "epoch": 0.4448305821025195, + "grad_norm": 0.14793605224512887, + "learning_rate": 4.854825809444773e-06, + "loss": 0.7606, + "step": 896 + }, + { + "epoch": 0.44532704480575896, + "grad_norm": 0.14449987137890363, + "learning_rate": 4.854497427312465e-06, + "loss": 0.8164, + "step": 897 + }, + { + "epoch": 0.4458235075089984, + "grad_norm": 0.14487270020396967, + "learning_rate": 4.854168685332977e-06, + "loss": 0.7833, + "step": 898 + }, + { + "epoch": 0.4463199702122378, + "grad_norm": 0.14100131641479818, + "learning_rate": 4.853839583556551e-06, + "loss": 0.7785, + "step": 899 + }, + { + "epoch": 0.44681643291547724, + "grad_norm": 0.1402338208402533, + "learning_rate": 4.853510122033486e-06, + "loss": 0.7922, + "step": 900 + }, + { + "epoch": 0.4473128956187166, + "grad_norm": 0.15070020553438185, + "learning_rate": 4.853180300814135e-06, + "loss": 0.7818, + "step": 901 + }, + { + "epoch": 0.44780935832195606, + "grad_norm": 0.14340004090339828, + "learning_rate": 4.8528501199489045e-06, + "loss": 0.7831, + "step": 902 + }, + { + "epoch": 0.4483058210251955, + "grad_norm": 0.12886555141368564, + "learning_rate": 4.852519579488258e-06, + "loss": 0.7358, + "step": 903 + }, + { + "epoch": 0.4488022837284349, + "grad_norm": 0.13659666264950565, + "learning_rate": 4.852188679482715e-06, + "loss": 0.797, + "step": 904 + }, + { + "epoch": 0.44929874643167433, + "grad_norm": 0.1344704194549763, + "learning_rate": 4.851857419982845e-06, + "loss": 0.7746, + "step": 905 + }, + { + "epoch": 0.4497952091349137, + "grad_norm": 0.14038386138367254, + "learning_rate": 4.8515258010392786e-06, + "loss": 0.7897, + "step": 906 + }, + { + "epoch": 0.45029167183815316, + "grad_norm": 0.1310362323726338, + "learning_rate": 4.851193822702698e-06, + "loss": 0.7891, + "step": 907 + }, + { + "epoch": 0.4507881345413926, + "grad_norm": 0.13456562483222215, + "learning_rate": 4.850861485023839e-06, + "loss": 0.8137, + "step": 908 + }, + { + "epoch": 0.451284597244632, + "grad_norm": 0.13867498023241415, + "learning_rate": 4.8505287880534954e-06, + "loss": 0.755, + "step": 909 + }, + { + "epoch": 0.45178105994787143, + "grad_norm": 0.13709552350038648, + "learning_rate": 4.8501957318425145e-06, + "loss": 0.7789, + "step": 910 + }, + { + "epoch": 0.4522775226511108, + "grad_norm": 0.20256799536792502, + "learning_rate": 4.849862316441799e-06, + "loss": 0.7916, + "step": 911 + }, + { + "epoch": 0.45277398535435026, + "grad_norm": 0.13805282049183487, + "learning_rate": 4.849528541902304e-06, + "loss": 0.7578, + "step": 912 + }, + { + "epoch": 0.45327044805758965, + "grad_norm": 0.13846494457743488, + "learning_rate": 4.849194408275045e-06, + "loss": 0.7993, + "step": 913 + }, + { + "epoch": 0.4537669107608291, + "grad_norm": 0.13624434622740592, + "learning_rate": 4.848859915611087e-06, + "loss": 0.7218, + "step": 914 + }, + { + "epoch": 0.45426337346406853, + "grad_norm": 0.13735589522446118, + "learning_rate": 4.848525063961551e-06, + "loss": 0.7927, + "step": 915 + }, + { + "epoch": 0.4547598361673079, + "grad_norm": 0.1435760229715099, + "learning_rate": 4.848189853377615e-06, + "loss": 0.7959, + "step": 916 + }, + { + "epoch": 0.45525629887054736, + "grad_norm": 0.1374310227347919, + "learning_rate": 4.8478542839105105e-06, + "loss": 0.8102, + "step": 917 + }, + { + "epoch": 0.45575276157378675, + "grad_norm": 0.13784054445459668, + "learning_rate": 4.847518355611524e-06, + "loss": 0.8705, + "step": 918 + }, + { + "epoch": 0.4562492242770262, + "grad_norm": 0.1429652700234789, + "learning_rate": 4.8471820685319965e-06, + "loss": 0.8273, + "step": 919 + }, + { + "epoch": 0.45674568698026563, + "grad_norm": 0.13776620485826613, + "learning_rate": 4.8468454227233235e-06, + "loss": 0.7659, + "step": 920 + }, + { + "epoch": 0.457242149683505, + "grad_norm": 0.13044770694847052, + "learning_rate": 4.8465084182369566e-06, + "loss": 0.7336, + "step": 921 + }, + { + "epoch": 0.45773861238674446, + "grad_norm": 0.13757280161779586, + "learning_rate": 4.846171055124401e-06, + "loss": 0.7948, + "step": 922 + }, + { + "epoch": 0.45823507508998385, + "grad_norm": 0.1391821959776347, + "learning_rate": 4.845833333437219e-06, + "loss": 0.7553, + "step": 923 + }, + { + "epoch": 0.4587315377932233, + "grad_norm": 0.1412266279982853, + "learning_rate": 4.845495253227023e-06, + "loss": 0.8104, + "step": 924 + }, + { + "epoch": 0.45922800049646273, + "grad_norm": 0.13037017997721642, + "learning_rate": 4.845156814545485e-06, + "loss": 0.755, + "step": 925 + }, + { + "epoch": 0.4597244631997021, + "grad_norm": 0.13938907235913728, + "learning_rate": 4.844818017444331e-06, + "loss": 0.7582, + "step": 926 + }, + { + "epoch": 0.46022092590294156, + "grad_norm": 0.13642206078365174, + "learning_rate": 4.8444788619753375e-06, + "loss": 0.7709, + "step": 927 + }, + { + "epoch": 0.46071738860618094, + "grad_norm": 0.13281685905144847, + "learning_rate": 4.844139348190342e-06, + "loss": 0.7613, + "step": 928 + }, + { + "epoch": 0.4612138513094204, + "grad_norm": 0.1376296253823425, + "learning_rate": 4.843799476141231e-06, + "loss": 0.8199, + "step": 929 + }, + { + "epoch": 0.4617103140126598, + "grad_norm": 0.1430226666279928, + "learning_rate": 4.843459245879952e-06, + "loss": 0.7879, + "step": 930 + }, + { + "epoch": 0.4622067767158992, + "grad_norm": 0.13853491478085572, + "learning_rate": 4.8431186574585e-06, + "loss": 0.799, + "step": 931 + }, + { + "epoch": 0.46270323941913866, + "grad_norm": 0.14106561846325905, + "learning_rate": 4.842777710928932e-06, + "loss": 0.799, + "step": 932 + }, + { + "epoch": 0.46319970212237804, + "grad_norm": 0.13514878380767298, + "learning_rate": 4.842436406343353e-06, + "loss": 0.7928, + "step": 933 + }, + { + "epoch": 0.4636961648256175, + "grad_norm": 0.12994346068694584, + "learning_rate": 4.842094743753929e-06, + "loss": 0.7338, + "step": 934 + }, + { + "epoch": 0.46419262752885687, + "grad_norm": 0.13540353265751, + "learning_rate": 4.841752723212874e-06, + "loss": 0.7365, + "step": 935 + }, + { + "epoch": 0.4646890902320963, + "grad_norm": 0.1383233625428097, + "learning_rate": 4.841410344772464e-06, + "loss": 0.7931, + "step": 936 + }, + { + "epoch": 0.46518555293533576, + "grad_norm": 0.13984478325484884, + "learning_rate": 4.841067608485024e-06, + "loss": 0.8022, + "step": 937 + }, + { + "epoch": 0.46568201563857514, + "grad_norm": 0.13182848523521054, + "learning_rate": 4.840724514402936e-06, + "loss": 0.7207, + "step": 938 + }, + { + "epoch": 0.4661784783418146, + "grad_norm": 0.13720103035170697, + "learning_rate": 4.8403810625786366e-06, + "loss": 0.7699, + "step": 939 + }, + { + "epoch": 0.46667494104505397, + "grad_norm": 0.13805399872377858, + "learning_rate": 4.840037253064617e-06, + "loss": 0.7676, + "step": 940 + }, + { + "epoch": 0.4671714037482934, + "grad_norm": 0.133084203123112, + "learning_rate": 4.839693085913423e-06, + "loss": 0.7366, + "step": 941 + }, + { + "epoch": 0.46766786645153285, + "grad_norm": 0.1338565121410207, + "learning_rate": 4.839348561177656e-06, + "loss": 0.7738, + "step": 942 + }, + { + "epoch": 0.46816432915477224, + "grad_norm": 0.14005655784202467, + "learning_rate": 4.839003678909968e-06, + "loss": 0.8011, + "step": 943 + }, + { + "epoch": 0.4686607918580117, + "grad_norm": 0.13866968197048227, + "learning_rate": 4.838658439163072e-06, + "loss": 0.7869, + "step": 944 + }, + { + "epoch": 0.46915725456125107, + "grad_norm": 0.14490477371225619, + "learning_rate": 4.838312841989731e-06, + "loss": 0.8122, + "step": 945 + }, + { + "epoch": 0.4696537172644905, + "grad_norm": 0.13501188505213316, + "learning_rate": 4.837966887442764e-06, + "loss": 0.8048, + "step": 946 + }, + { + "epoch": 0.4701501799677299, + "grad_norm": 0.13061796475992138, + "learning_rate": 4.837620575575045e-06, + "loss": 0.7505, + "step": 947 + }, + { + "epoch": 0.47064664267096934, + "grad_norm": 0.13123329580779844, + "learning_rate": 4.837273906439501e-06, + "loss": 0.7514, + "step": 948 + }, + { + "epoch": 0.4711431053742088, + "grad_norm": 0.13931347467033603, + "learning_rate": 4.836926880089117e-06, + "loss": 0.7545, + "step": 949 + }, + { + "epoch": 0.47163956807744817, + "grad_norm": 0.13794961687851362, + "learning_rate": 4.836579496576928e-06, + "loss": 0.8052, + "step": 950 + }, + { + "epoch": 0.4721360307806876, + "grad_norm": 0.13580616802094855, + "learning_rate": 4.836231755956028e-06, + "loss": 0.755, + "step": 951 + }, + { + "epoch": 0.472632493483927, + "grad_norm": 0.1375813327214774, + "learning_rate": 4.835883658279562e-06, + "loss": 0.7844, + "step": 952 + }, + { + "epoch": 0.47312895618716644, + "grad_norm": 0.1334419288379686, + "learning_rate": 4.835535203600732e-06, + "loss": 0.7886, + "step": 953 + }, + { + "epoch": 0.4736254188904059, + "grad_norm": 0.13849333665474736, + "learning_rate": 4.835186391972795e-06, + "loss": 0.7719, + "step": 954 + }, + { + "epoch": 0.47412188159364527, + "grad_norm": 0.13814260935997866, + "learning_rate": 4.834837223449058e-06, + "loss": 0.7593, + "step": 955 + }, + { + "epoch": 0.4746183442968847, + "grad_norm": 0.12665160870314987, + "learning_rate": 4.834487698082888e-06, + "loss": 0.6989, + "step": 956 + }, + { + "epoch": 0.4751148070001241, + "grad_norm": 0.13161442511625795, + "learning_rate": 4.834137815927705e-06, + "loss": 0.7502, + "step": 957 + }, + { + "epoch": 0.47561126970336354, + "grad_norm": 0.13088405996355415, + "learning_rate": 4.833787577036981e-06, + "loss": 0.7672, + "step": 958 + }, + { + "epoch": 0.476107732406603, + "grad_norm": 0.14037857434636752, + "learning_rate": 4.833436981464246e-06, + "loss": 0.7766, + "step": 959 + }, + { + "epoch": 0.47660419510984237, + "grad_norm": 0.14031622164392288, + "learning_rate": 4.833086029263081e-06, + "loss": 0.7815, + "step": 960 + }, + { + "epoch": 0.4771006578130818, + "grad_norm": 0.13531078299513247, + "learning_rate": 4.832734720487125e-06, + "loss": 0.7547, + "step": 961 + }, + { + "epoch": 0.4775971205163212, + "grad_norm": 0.13337784164706815, + "learning_rate": 4.8323830551900705e-06, + "loss": 0.7955, + "step": 962 + }, + { + "epoch": 0.47809358321956064, + "grad_norm": 0.1347558468252713, + "learning_rate": 4.832031033425663e-06, + "loss": 0.7486, + "step": 963 + }, + { + "epoch": 0.4785900459228, + "grad_norm": 0.12872155796573553, + "learning_rate": 4.831678655247702e-06, + "loss": 0.7768, + "step": 964 + }, + { + "epoch": 0.47908650862603946, + "grad_norm": 0.1348281268570624, + "learning_rate": 4.831325920710045e-06, + "loss": 0.7895, + "step": 965 + }, + { + "epoch": 0.4795829713292789, + "grad_norm": 0.13513666322029744, + "learning_rate": 4.830972829866601e-06, + "loss": 0.7915, + "step": 966 + }, + { + "epoch": 0.4800794340325183, + "grad_norm": 0.13491478767782572, + "learning_rate": 4.830619382771334e-06, + "loss": 0.7665, + "step": 967 + }, + { + "epoch": 0.48057589673575773, + "grad_norm": 0.13770336867680258, + "learning_rate": 4.830265579478263e-06, + "loss": 0.8023, + "step": 968 + }, + { + "epoch": 0.4810723594389971, + "grad_norm": 0.13544943102350623, + "learning_rate": 4.829911420041461e-06, + "loss": 0.7393, + "step": 969 + }, + { + "epoch": 0.48156882214223656, + "grad_norm": 0.13435327527712077, + "learning_rate": 4.829556904515056e-06, + "loss": 0.7506, + "step": 970 + }, + { + "epoch": 0.482065284845476, + "grad_norm": 0.13966597982630186, + "learning_rate": 4.82920203295323e-06, + "loss": 0.786, + "step": 971 + }, + { + "epoch": 0.4825617475487154, + "grad_norm": 0.1424347215239213, + "learning_rate": 4.828846805410219e-06, + "loss": 0.7934, + "step": 972 + }, + { + "epoch": 0.48305821025195483, + "grad_norm": 0.13234179930319845, + "learning_rate": 4.828491221940313e-06, + "loss": 0.7655, + "step": 973 + }, + { + "epoch": 0.4835546729551942, + "grad_norm": 0.14012843827614116, + "learning_rate": 4.82813528259786e-06, + "loss": 0.7496, + "step": 974 + }, + { + "epoch": 0.48405113565843366, + "grad_norm": 0.13715136273058942, + "learning_rate": 4.827778987437256e-06, + "loss": 0.8375, + "step": 975 + }, + { + "epoch": 0.4845475983616731, + "grad_norm": 0.1947144750999217, + "learning_rate": 4.827422336512958e-06, + "loss": 0.8667, + "step": 976 + }, + { + "epoch": 0.4850440610649125, + "grad_norm": 0.13741283362319834, + "learning_rate": 4.827065329879473e-06, + "loss": 0.8308, + "step": 977 + }, + { + "epoch": 0.48554052376815193, + "grad_norm": 0.13386724756044246, + "learning_rate": 4.826707967591364e-06, + "loss": 0.7704, + "step": 978 + }, + { + "epoch": 0.4860369864713913, + "grad_norm": 0.14441549526350253, + "learning_rate": 4.826350249703249e-06, + "loss": 0.7697, + "step": 979 + }, + { + "epoch": 0.48653344917463076, + "grad_norm": 0.14033203475651918, + "learning_rate": 4.825992176269797e-06, + "loss": 0.8019, + "step": 980 + }, + { + "epoch": 0.48702991187787015, + "grad_norm": 0.13714343033359203, + "learning_rate": 4.825633747345736e-06, + "loss": 0.7529, + "step": 981 + }, + { + "epoch": 0.4875263745811096, + "grad_norm": 0.12941529603226035, + "learning_rate": 4.825274962985845e-06, + "loss": 0.7562, + "step": 982 + }, + { + "epoch": 0.48802283728434903, + "grad_norm": 0.13974390480489013, + "learning_rate": 4.82491582324496e-06, + "loss": 0.8017, + "step": 983 + }, + { + "epoch": 0.4885192999875884, + "grad_norm": 0.1366449796259129, + "learning_rate": 4.824556328177968e-06, + "loss": 0.7515, + "step": 984 + }, + { + "epoch": 0.48901576269082786, + "grad_norm": 0.14711958138650388, + "learning_rate": 4.824196477839812e-06, + "loss": 0.795, + "step": 985 + }, + { + "epoch": 0.48951222539406725, + "grad_norm": 0.14257192599317975, + "learning_rate": 4.8238362722854905e-06, + "loss": 0.7937, + "step": 986 + }, + { + "epoch": 0.4900086880973067, + "grad_norm": 0.1347130511121721, + "learning_rate": 4.823475711570055e-06, + "loss": 0.7963, + "step": 987 + }, + { + "epoch": 0.49050515080054613, + "grad_norm": 0.13038662073702864, + "learning_rate": 4.823114795748611e-06, + "loss": 0.7671, + "step": 988 + }, + { + "epoch": 0.4910016135037855, + "grad_norm": 0.14016752297712926, + "learning_rate": 4.8227535248763185e-06, + "loss": 0.7237, + "step": 989 + }, + { + "epoch": 0.49149807620702496, + "grad_norm": 0.14127823957650343, + "learning_rate": 4.8223918990083925e-06, + "loss": 0.8157, + "step": 990 + }, + { + "epoch": 0.49199453891026435, + "grad_norm": 0.15260187773192319, + "learning_rate": 4.8220299182001014e-06, + "loss": 0.7994, + "step": 991 + }, + { + "epoch": 0.4924910016135038, + "grad_norm": 0.13943785694471836, + "learning_rate": 4.821667582506768e-06, + "loss": 0.7663, + "step": 992 + }, + { + "epoch": 0.49298746431674323, + "grad_norm": 0.13275215903917895, + "learning_rate": 4.8213048919837694e-06, + "loss": 0.7614, + "step": 993 + }, + { + "epoch": 0.4934839270199826, + "grad_norm": 0.14101255565793344, + "learning_rate": 4.820941846686538e-06, + "loss": 0.7854, + "step": 994 + }, + { + "epoch": 0.49398038972322206, + "grad_norm": 0.13656223313177246, + "learning_rate": 4.820578446670559e-06, + "loss": 0.7944, + "step": 995 + }, + { + "epoch": 0.49447685242646144, + "grad_norm": 0.13690268510235684, + "learning_rate": 4.820214691991372e-06, + "loss": 0.7417, + "step": 996 + }, + { + "epoch": 0.4949733151297009, + "grad_norm": 0.14131395637674096, + "learning_rate": 4.81985058270457e-06, + "loss": 0.798, + "step": 997 + }, + { + "epoch": 0.49546977783294033, + "grad_norm": 0.13522955049735227, + "learning_rate": 4.819486118865804e-06, + "loss": 0.7744, + "step": 998 + }, + { + "epoch": 0.4959662405361797, + "grad_norm": 0.13340560525311815, + "learning_rate": 4.819121300530774e-06, + "loss": 0.7248, + "step": 999 + }, + { + "epoch": 0.49646270323941916, + "grad_norm": 0.14852492729160868, + "learning_rate": 4.8187561277552376e-06, + "loss": 0.8108, + "step": 1000 + }, + { + "epoch": 0.49695916594265854, + "grad_norm": 0.14319486748419052, + "learning_rate": 4.818390600595005e-06, + "loss": 0.7618, + "step": 1001 + }, + { + "epoch": 0.497455628645898, + "grad_norm": 0.13608599816764147, + "learning_rate": 4.818024719105942e-06, + "loss": 0.7732, + "step": 1002 + }, + { + "epoch": 0.49795209134913737, + "grad_norm": 0.13154303448443375, + "learning_rate": 4.817658483343967e-06, + "loss": 0.7478, + "step": 1003 + }, + { + "epoch": 0.4984485540523768, + "grad_norm": 0.13240308227842207, + "learning_rate": 4.817291893365055e-06, + "loss": 0.7901, + "step": 1004 + }, + { + "epoch": 0.49894501675561626, + "grad_norm": 0.14667939413445374, + "learning_rate": 4.816924949225231e-06, + "loss": 0.7964, + "step": 1005 + }, + { + "epoch": 0.49944147945885564, + "grad_norm": 0.13188227123187843, + "learning_rate": 4.816557650980578e-06, + "loss": 0.6981, + "step": 1006 + }, + { + "epoch": 0.4999379421620951, + "grad_norm": 0.12344883892983644, + "learning_rate": 4.816189998687231e-06, + "loss": 0.717, + "step": 1007 + }, + { + "epoch": 0.5004344048653345, + "grad_norm": 0.14130934547471188, + "learning_rate": 4.81582199240138e-06, + "loss": 0.7951, + "step": 1008 + }, + { + "epoch": 0.5004344048653345, + "eval_loss": 0.7746918201446533, + "eval_runtime": 135.3234, + "eval_samples_per_second": 224.3, + "eval_steps_per_second": 28.044, + "step": 1008 + }, + { + "epoch": 0.5009308675685739, + "grad_norm": 0.134048946932314, + "learning_rate": 4.815453632179269e-06, + "loss": 0.7824, + "step": 1009 + }, + { + "epoch": 0.5014273302718133, + "grad_norm": 0.13698117562535206, + "learning_rate": 4.815084918077196e-06, + "loss": 0.8098, + "step": 1010 + }, + { + "epoch": 0.5019237929750527, + "grad_norm": 0.14425364984140046, + "learning_rate": 4.8147158501515125e-06, + "loss": 0.7688, + "step": 1011 + }, + { + "epoch": 0.5024202556782922, + "grad_norm": 0.1332199024954632, + "learning_rate": 4.814346428458624e-06, + "loss": 0.7414, + "step": 1012 + }, + { + "epoch": 0.5029167183815316, + "grad_norm": 0.1325832205983266, + "learning_rate": 4.813976653054993e-06, + "loss": 0.8369, + "step": 1013 + }, + { + "epoch": 0.503413181084771, + "grad_norm": 0.13394042417772226, + "learning_rate": 4.813606523997132e-06, + "loss": 0.7683, + "step": 1014 + }, + { + "epoch": 0.5039096437880104, + "grad_norm": 0.15382272131461908, + "learning_rate": 4.81323604134161e-06, + "loss": 0.7601, + "step": 1015 + }, + { + "epoch": 0.5044061064912498, + "grad_norm": 0.14301524433648288, + "learning_rate": 4.8128652051450485e-06, + "loss": 0.7507, + "step": 1016 + }, + { + "epoch": 0.5049025691944893, + "grad_norm": 0.13250662485949544, + "learning_rate": 4.812494015464124e-06, + "loss": 0.7627, + "step": 1017 + }, + { + "epoch": 0.5053990318977287, + "grad_norm": 0.136342520846831, + "learning_rate": 4.812122472355569e-06, + "loss": 0.7467, + "step": 1018 + }, + { + "epoch": 0.505895494600968, + "grad_norm": 0.1487816899919886, + "learning_rate": 4.811750575876164e-06, + "loss": 0.8047, + "step": 1019 + }, + { + "epoch": 0.5063919573042075, + "grad_norm": 0.13428718634545492, + "learning_rate": 4.811378326082751e-06, + "loss": 0.7221, + "step": 1020 + }, + { + "epoch": 0.5068884200074469, + "grad_norm": 0.1439447750897544, + "learning_rate": 4.811005723032219e-06, + "loss": 0.7775, + "step": 1021 + }, + { + "epoch": 0.5073848827106864, + "grad_norm": 0.13831804614048945, + "learning_rate": 4.810632766781519e-06, + "loss": 0.7779, + "step": 1022 + }, + { + "epoch": 0.5078813454139258, + "grad_norm": 0.1322159420042874, + "learning_rate": 4.810259457387647e-06, + "loss": 0.7703, + "step": 1023 + }, + { + "epoch": 0.5083778081171652, + "grad_norm": 0.13665579834505667, + "learning_rate": 4.80988579490766e-06, + "loss": 0.773, + "step": 1024 + }, + { + "epoch": 0.5088742708204046, + "grad_norm": 0.13881136535201452, + "learning_rate": 4.809511779398665e-06, + "loss": 0.7572, + "step": 1025 + }, + { + "epoch": 0.509370733523644, + "grad_norm": 0.12477421280336831, + "learning_rate": 4.809137410917825e-06, + "loss": 0.6892, + "step": 1026 + }, + { + "epoch": 0.5098671962268835, + "grad_norm": 0.1333532065223667, + "learning_rate": 4.808762689522356e-06, + "loss": 0.7785, + "step": 1027 + }, + { + "epoch": 0.5103636589301229, + "grad_norm": 0.139072803813187, + "learning_rate": 4.8083876152695285e-06, + "loss": 0.7882, + "step": 1028 + }, + { + "epoch": 0.5108601216333623, + "grad_norm": 0.13361520481016406, + "learning_rate": 4.808012188216665e-06, + "loss": 0.7488, + "step": 1029 + }, + { + "epoch": 0.5113565843366017, + "grad_norm": 0.13878508065904147, + "learning_rate": 4.807636408421146e-06, + "loss": 0.8114, + "step": 1030 + }, + { + "epoch": 0.5118530470398411, + "grad_norm": 0.13851247233308997, + "learning_rate": 4.807260275940401e-06, + "loss": 0.8061, + "step": 1031 + }, + { + "epoch": 0.5123495097430806, + "grad_norm": 0.1373268203823735, + "learning_rate": 4.806883790831918e-06, + "loss": 0.7767, + "step": 1032 + }, + { + "epoch": 0.51284597244632, + "grad_norm": 0.13370723442504853, + "learning_rate": 4.806506953153235e-06, + "loss": 0.7912, + "step": 1033 + }, + { + "epoch": 0.5133424351495594, + "grad_norm": 0.1339473130089365, + "learning_rate": 4.806129762961946e-06, + "loss": 0.7391, + "step": 1034 + }, + { + "epoch": 0.5138388978527988, + "grad_norm": 0.15400773010538693, + "learning_rate": 4.805752220315699e-06, + "loss": 0.7607, + "step": 1035 + }, + { + "epoch": 0.5143353605560382, + "grad_norm": 0.13291573497843698, + "learning_rate": 4.8053743252721954e-06, + "loss": 0.7504, + "step": 1036 + }, + { + "epoch": 0.5148318232592777, + "grad_norm": 0.14283555501521533, + "learning_rate": 4.804996077889189e-06, + "loss": 0.8104, + "step": 1037 + }, + { + "epoch": 0.5153282859625171, + "grad_norm": 0.13534285097280963, + "learning_rate": 4.8046174782244915e-06, + "loss": 0.7844, + "step": 1038 + }, + { + "epoch": 0.5158247486657564, + "grad_norm": 0.14292129810329937, + "learning_rate": 4.804238526335963e-06, + "loss": 0.7838, + "step": 1039 + }, + { + "epoch": 0.5163212113689959, + "grad_norm": 0.13392734203551412, + "learning_rate": 4.803859222281522e-06, + "loss": 0.7902, + "step": 1040 + }, + { + "epoch": 0.5168176740722353, + "grad_norm": 0.19974575949479675, + "learning_rate": 4.803479566119138e-06, + "loss": 0.7953, + "step": 1041 + }, + { + "epoch": 0.5173141367754748, + "grad_norm": 0.13159644138104465, + "learning_rate": 4.803099557906836e-06, + "loss": 0.769, + "step": 1042 + }, + { + "epoch": 0.5178105994787141, + "grad_norm": 0.1388552332573298, + "learning_rate": 4.802719197702694e-06, + "loss": 0.8074, + "step": 1043 + }, + { + "epoch": 0.5183070621819535, + "grad_norm": 0.13679177955633, + "learning_rate": 4.802338485564843e-06, + "loss": 0.7777, + "step": 1044 + }, + { + "epoch": 0.518803524885193, + "grad_norm": 0.13979496307917497, + "learning_rate": 4.8019574215514705e-06, + "loss": 0.8268, + "step": 1045 + }, + { + "epoch": 0.5192999875884324, + "grad_norm": 0.12800445973569957, + "learning_rate": 4.801576005720816e-06, + "loss": 0.7296, + "step": 1046 + }, + { + "epoch": 0.5197964502916719, + "grad_norm": 0.13332186476475139, + "learning_rate": 4.801194238131171e-06, + "loss": 0.7315, + "step": 1047 + }, + { + "epoch": 0.5202929129949112, + "grad_norm": 0.13956881166358395, + "learning_rate": 4.800812118840884e-06, + "loss": 0.7848, + "step": 1048 + }, + { + "epoch": 0.5207893756981506, + "grad_norm": 0.14486219818800664, + "learning_rate": 4.800429647908354e-06, + "loss": 0.7346, + "step": 1049 + }, + { + "epoch": 0.5212858384013901, + "grad_norm": 0.13700325136931357, + "learning_rate": 4.800046825392039e-06, + "loss": 0.7827, + "step": 1050 + }, + { + "epoch": 0.5217823011046295, + "grad_norm": 0.13238624342923797, + "learning_rate": 4.799663651350444e-06, + "loss": 0.737, + "step": 1051 + }, + { + "epoch": 0.522278763807869, + "grad_norm": 0.1409117185449216, + "learning_rate": 4.799280125842133e-06, + "loss": 0.7865, + "step": 1052 + }, + { + "epoch": 0.5227752265111083, + "grad_norm": 0.16712731559406424, + "learning_rate": 4.79889624892572e-06, + "loss": 0.7599, + "step": 1053 + }, + { + "epoch": 0.5232716892143477, + "grad_norm": 0.14064208506068193, + "learning_rate": 4.798512020659876e-06, + "loss": 0.7839, + "step": 1054 + }, + { + "epoch": 0.5237681519175872, + "grad_norm": 0.13461779032052776, + "learning_rate": 4.7981274411033225e-06, + "loss": 0.7552, + "step": 1055 + }, + { + "epoch": 0.5242646146208266, + "grad_norm": 0.13913729985952666, + "learning_rate": 4.797742510314838e-06, + "loss": 0.7936, + "step": 1056 + }, + { + "epoch": 0.5247610773240661, + "grad_norm": 0.136526668795062, + "learning_rate": 4.797357228353252e-06, + "loss": 0.7271, + "step": 1057 + }, + { + "epoch": 0.5252575400273054, + "grad_norm": 0.13602099516938532, + "learning_rate": 4.796971595277449e-06, + "loss": 0.7623, + "step": 1058 + }, + { + "epoch": 0.5257540027305448, + "grad_norm": 0.13692035663882426, + "learning_rate": 4.796585611146367e-06, + "loss": 0.8187, + "step": 1059 + }, + { + "epoch": 0.5262504654337843, + "grad_norm": 0.13511454024474553, + "learning_rate": 4.7961992760189975e-06, + "loss": 0.7717, + "step": 1060 + }, + { + "epoch": 0.5267469281370237, + "grad_norm": 0.13653868560447394, + "learning_rate": 4.795812589954385e-06, + "loss": 0.7493, + "step": 1061 + }, + { + "epoch": 0.5272433908402632, + "grad_norm": 0.14284294774022338, + "learning_rate": 4.795425553011629e-06, + "loss": 0.7828, + "step": 1062 + }, + { + "epoch": 0.5277398535435025, + "grad_norm": 0.13641287616715173, + "learning_rate": 4.795038165249882e-06, + "loss": 0.7575, + "step": 1063 + }, + { + "epoch": 0.5282363162467419, + "grad_norm": 0.13399693271167426, + "learning_rate": 4.794650426728349e-06, + "loss": 0.7907, + "step": 1064 + }, + { + "epoch": 0.5287327789499814, + "grad_norm": 0.13237777235979964, + "learning_rate": 4.79426233750629e-06, + "loss": 0.7839, + "step": 1065 + }, + { + "epoch": 0.5292292416532208, + "grad_norm": 0.1390129774707514, + "learning_rate": 4.793873897643019e-06, + "loss": 0.7499, + "step": 1066 + }, + { + "epoch": 0.5297257043564603, + "grad_norm": 0.15109478098320664, + "learning_rate": 4.793485107197902e-06, + "loss": 0.7653, + "step": 1067 + }, + { + "epoch": 0.5302221670596996, + "grad_norm": 0.14002670175893508, + "learning_rate": 4.793095966230359e-06, + "loss": 0.7565, + "step": 1068 + }, + { + "epoch": 0.530718629762939, + "grad_norm": 0.13739224347859277, + "learning_rate": 4.792706474799865e-06, + "loss": 0.698, + "step": 1069 + }, + { + "epoch": 0.5312150924661785, + "grad_norm": 0.14191602937165326, + "learning_rate": 4.792316632965947e-06, + "loss": 0.7827, + "step": 1070 + }, + { + "epoch": 0.5317115551694179, + "grad_norm": 0.13560549121655127, + "learning_rate": 4.791926440788186e-06, + "loss": 0.7651, + "step": 1071 + }, + { + "epoch": 0.5322080178726574, + "grad_norm": 0.1476565893427292, + "learning_rate": 4.791535898326217e-06, + "loss": 0.7883, + "step": 1072 + }, + { + "epoch": 0.5327044805758967, + "grad_norm": 0.13008136272495874, + "learning_rate": 4.791145005639729e-06, + "loss": 0.78, + "step": 1073 + }, + { + "epoch": 0.5332009432791361, + "grad_norm": 0.1327752708528825, + "learning_rate": 4.790753762788461e-06, + "loss": 0.7236, + "step": 1074 + }, + { + "epoch": 0.5336974059823756, + "grad_norm": 0.14247698450976287, + "learning_rate": 4.79036216983221e-06, + "loss": 0.7687, + "step": 1075 + }, + { + "epoch": 0.534193868685615, + "grad_norm": 0.13898239220917705, + "learning_rate": 4.789970226830825e-06, + "loss": 0.7758, + "step": 1076 + }, + { + "epoch": 0.5346903313888545, + "grad_norm": 0.13280694141514973, + "learning_rate": 4.789577933844207e-06, + "loss": 0.7538, + "step": 1077 + }, + { + "epoch": 0.5351867940920938, + "grad_norm": 0.14265951347289108, + "learning_rate": 4.7891852909323135e-06, + "loss": 0.7814, + "step": 1078 + }, + { + "epoch": 0.5356832567953332, + "grad_norm": 0.13154992991885675, + "learning_rate": 4.788792298155152e-06, + "loss": 0.7445, + "step": 1079 + }, + { + "epoch": 0.5361797194985727, + "grad_norm": 0.13519381517124315, + "learning_rate": 4.788398955572786e-06, + "loss": 0.7487, + "step": 1080 + }, + { + "epoch": 0.5366761822018121, + "grad_norm": 0.13044574075599727, + "learning_rate": 4.788005263245331e-06, + "loss": 0.7455, + "step": 1081 + }, + { + "epoch": 0.5371726449050515, + "grad_norm": 0.13266919082399994, + "learning_rate": 4.787611221232957e-06, + "loss": 0.7757, + "step": 1082 + }, + { + "epoch": 0.5376691076082909, + "grad_norm": 0.13376136036691982, + "learning_rate": 4.787216829595887e-06, + "loss": 0.7519, + "step": 1083 + }, + { + "epoch": 0.5381655703115303, + "grad_norm": 0.13418604670303574, + "learning_rate": 4.786822088394397e-06, + "loss": 0.7926, + "step": 1084 + }, + { + "epoch": 0.5386620330147698, + "grad_norm": 0.13482820857673963, + "learning_rate": 4.786426997688817e-06, + "loss": 0.7779, + "step": 1085 + }, + { + "epoch": 0.5391584957180092, + "grad_norm": 0.130173504550406, + "learning_rate": 4.786031557539532e-06, + "loss": 0.717, + "step": 1086 + }, + { + "epoch": 0.5396549584212486, + "grad_norm": 0.14790797249373314, + "learning_rate": 4.785635768006975e-06, + "loss": 0.8366, + "step": 1087 + }, + { + "epoch": 0.540151421124488, + "grad_norm": 0.13839226189444587, + "learning_rate": 4.78523962915164e-06, + "loss": 0.7947, + "step": 1088 + }, + { + "epoch": 0.5406478838277274, + "grad_norm": 0.13079738699092058, + "learning_rate": 4.784843141034068e-06, + "loss": 0.7289, + "step": 1089 + }, + { + "epoch": 0.5411443465309669, + "grad_norm": 0.13393354009715142, + "learning_rate": 4.784446303714856e-06, + "loss": 0.7756, + "step": 1090 + }, + { + "epoch": 0.5416408092342063, + "grad_norm": 0.14144453676659285, + "learning_rate": 4.784049117254656e-06, + "loss": 0.793, + "step": 1091 + }, + { + "epoch": 0.5421372719374457, + "grad_norm": 0.1300964867769227, + "learning_rate": 4.783651581714169e-06, + "loss": 0.7597, + "step": 1092 + }, + { + "epoch": 0.5426337346406851, + "grad_norm": 0.1357913073042723, + "learning_rate": 4.7832536971541546e-06, + "loss": 0.788, + "step": 1093 + }, + { + "epoch": 0.5431301973439245, + "grad_norm": 0.13222437636600876, + "learning_rate": 4.7828554636354216e-06, + "loss": 0.7809, + "step": 1094 + }, + { + "epoch": 0.543626660047164, + "grad_norm": 0.13258977130957175, + "learning_rate": 4.782456881218834e-06, + "loss": 0.7448, + "step": 1095 + }, + { + "epoch": 0.5441231227504034, + "grad_norm": 0.13730375474418763, + "learning_rate": 4.782057949965307e-06, + "loss": 0.7784, + "step": 1096 + }, + { + "epoch": 0.5446195854536428, + "grad_norm": 0.13642288559333154, + "learning_rate": 4.781658669935813e-06, + "loss": 0.781, + "step": 1097 + }, + { + "epoch": 0.5451160481568822, + "grad_norm": 0.1401061103174583, + "learning_rate": 4.7812590411913755e-06, + "loss": 0.7655, + "step": 1098 + }, + { + "epoch": 0.5456125108601216, + "grad_norm": 0.13382843638430478, + "learning_rate": 4.780859063793071e-06, + "loss": 0.761, + "step": 1099 + }, + { + "epoch": 0.5461089735633611, + "grad_norm": 0.13467015132419832, + "learning_rate": 4.780458737802028e-06, + "loss": 0.8425, + "step": 1100 + }, + { + "epoch": 0.5466054362666005, + "grad_norm": 0.13538805042285004, + "learning_rate": 4.780058063279432e-06, + "loss": 0.8023, + "step": 1101 + }, + { + "epoch": 0.5471018989698399, + "grad_norm": 0.13332827992090326, + "learning_rate": 4.779657040286519e-06, + "loss": 0.7454, + "step": 1102 + }, + { + "epoch": 0.5475983616730793, + "grad_norm": 0.13714991044262492, + "learning_rate": 4.779255668884579e-06, + "loss": 0.7781, + "step": 1103 + }, + { + "epoch": 0.5480948243763187, + "grad_norm": 0.13230879275054402, + "learning_rate": 4.778853949134956e-06, + "loss": 0.7512, + "step": 1104 + }, + { + "epoch": 0.5485912870795582, + "grad_norm": 0.13456603648064147, + "learning_rate": 4.778451881099045e-06, + "loss": 0.8209, + "step": 1105 + }, + { + "epoch": 0.5490877497827976, + "grad_norm": 0.1308167779233721, + "learning_rate": 4.7780494648382955e-06, + "loss": 0.764, + "step": 1106 + }, + { + "epoch": 0.549584212486037, + "grad_norm": 0.16101308796343475, + "learning_rate": 4.777646700414211e-06, + "loss": 0.7786, + "step": 1107 + }, + { + "epoch": 0.5500806751892764, + "grad_norm": 0.1459790001808385, + "learning_rate": 4.777243587888348e-06, + "loss": 0.7868, + "step": 1108 + }, + { + "epoch": 0.5505771378925158, + "grad_norm": 0.13657235299783674, + "learning_rate": 4.776840127322316e-06, + "loss": 0.8113, + "step": 1109 + }, + { + "epoch": 0.5510736005957553, + "grad_norm": 0.14294798523667074, + "learning_rate": 4.7764363187777765e-06, + "loss": 0.7649, + "step": 1110 + }, + { + "epoch": 0.5515700632989947, + "grad_norm": 0.13846612184838433, + "learning_rate": 4.776032162316445e-06, + "loss": 0.7307, + "step": 1111 + }, + { + "epoch": 0.552066526002234, + "grad_norm": 0.14690601721943153, + "learning_rate": 4.775627658000091e-06, + "loss": 0.7695, + "step": 1112 + }, + { + "epoch": 0.5525629887054735, + "grad_norm": 0.14581794719191593, + "learning_rate": 4.775222805890537e-06, + "loss": 0.8154, + "step": 1113 + }, + { + "epoch": 0.5530594514087129, + "grad_norm": 0.14444888719097665, + "learning_rate": 4.7748176060496574e-06, + "loss": 0.8383, + "step": 1114 + }, + { + "epoch": 0.5535559141119524, + "grad_norm": 0.13254092300237755, + "learning_rate": 4.77441205853938e-06, + "loss": 0.8386, + "step": 1115 + }, + { + "epoch": 0.5540523768151917, + "grad_norm": 0.1347220636449837, + "learning_rate": 4.774006163421687e-06, + "loss": 0.752, + "step": 1116 + }, + { + "epoch": 0.5545488395184311, + "grad_norm": 0.13548044148354343, + "learning_rate": 4.773599920758614e-06, + "loss": 0.7409, + "step": 1117 + }, + { + "epoch": 0.5550453022216706, + "grad_norm": 0.14102047024443465, + "learning_rate": 4.773193330612246e-06, + "loss": 0.7561, + "step": 1118 + }, + { + "epoch": 0.55554176492491, + "grad_norm": 0.13374789735214362, + "learning_rate": 4.772786393044726e-06, + "loss": 0.7837, + "step": 1119 + }, + { + "epoch": 0.5560382276281495, + "grad_norm": 0.13483434548142853, + "learning_rate": 4.772379108118247e-06, + "loss": 0.7457, + "step": 1120 + }, + { + "epoch": 0.5565346903313888, + "grad_norm": 0.13493609143971075, + "learning_rate": 4.7719714758950565e-06, + "loss": 0.7414, + "step": 1121 + }, + { + "epoch": 0.5570311530346282, + "grad_norm": 0.14080955544969456, + "learning_rate": 4.771563496437454e-06, + "loss": 0.7954, + "step": 1122 + }, + { + "epoch": 0.5575276157378677, + "grad_norm": 0.15315063885453276, + "learning_rate": 4.771155169807793e-06, + "loss": 0.787, + "step": 1123 + }, + { + "epoch": 0.5580240784411071, + "grad_norm": 0.1343038161486333, + "learning_rate": 4.770746496068479e-06, + "loss": 0.7636, + "step": 1124 + }, + { + "epoch": 0.5585205411443466, + "grad_norm": 0.1323279813259488, + "learning_rate": 4.770337475281972e-06, + "loss": 0.7246, + "step": 1125 + }, + { + "epoch": 0.5590170038475859, + "grad_norm": 0.12987821538488584, + "learning_rate": 4.769928107510784e-06, + "loss": 0.753, + "step": 1126 + }, + { + "epoch": 0.5595134665508253, + "grad_norm": 0.14121741134888693, + "learning_rate": 4.7695183928174804e-06, + "loss": 0.7721, + "step": 1127 + }, + { + "epoch": 0.5600099292540648, + "grad_norm": 0.13095267808109395, + "learning_rate": 4.76910833126468e-06, + "loss": 0.7727, + "step": 1128 + }, + { + "epoch": 0.5605063919573042, + "grad_norm": 0.13160852813062984, + "learning_rate": 4.768697922915053e-06, + "loss": 0.7499, + "step": 1129 + }, + { + "epoch": 0.5610028546605437, + "grad_norm": 0.13131472336248304, + "learning_rate": 4.768287167831323e-06, + "loss": 0.7271, + "step": 1130 + }, + { + "epoch": 0.561499317363783, + "grad_norm": 0.14344145034960104, + "learning_rate": 4.767876066076271e-06, + "loss": 0.8335, + "step": 1131 + }, + { + "epoch": 0.5619957800670224, + "grad_norm": 0.13562056297798028, + "learning_rate": 4.7674646177127236e-06, + "loss": 0.8303, + "step": 1132 + }, + { + "epoch": 0.5624922427702619, + "grad_norm": 0.1397414253999263, + "learning_rate": 4.767052822803565e-06, + "loss": 0.7954, + "step": 1133 + }, + { + "epoch": 0.5629887054735013, + "grad_norm": 0.13474425151028716, + "learning_rate": 4.7666406814117324e-06, + "loss": 0.7554, + "step": 1134 + }, + { + "epoch": 0.5634851681767408, + "grad_norm": 0.13073363088146278, + "learning_rate": 4.7662281936002155e-06, + "loss": 0.7659, + "step": 1135 + }, + { + "epoch": 0.5639816308799801, + "grad_norm": 0.13266775105459727, + "learning_rate": 4.765815359432054e-06, + "loss": 0.8103, + "step": 1136 + }, + { + "epoch": 0.5644780935832195, + "grad_norm": 0.14135466000280938, + "learning_rate": 4.765402178970345e-06, + "loss": 0.7477, + "step": 1137 + }, + { + "epoch": 0.564974556286459, + "grad_norm": 0.1364663264168024, + "learning_rate": 4.764988652278235e-06, + "loss": 0.7801, + "step": 1138 + }, + { + "epoch": 0.5654710189896984, + "grad_norm": 0.133069946323726, + "learning_rate": 4.764574779418927e-06, + "loss": 0.7814, + "step": 1139 + }, + { + "epoch": 0.5659674816929379, + "grad_norm": 0.13667566517665522, + "learning_rate": 4.7641605604556725e-06, + "loss": 0.7193, + "step": 1140 + }, + { + "epoch": 0.5664639443961772, + "grad_norm": 0.14023909922018574, + "learning_rate": 4.763745995451781e-06, + "loss": 0.7794, + "step": 1141 + }, + { + "epoch": 0.5669604070994166, + "grad_norm": 0.1343364576457988, + "learning_rate": 4.763331084470609e-06, + "loss": 0.7651, + "step": 1142 + }, + { + "epoch": 0.5674568698026561, + "grad_norm": 0.13304454158801782, + "learning_rate": 4.762915827575571e-06, + "loss": 0.741, + "step": 1143 + }, + { + "epoch": 0.5679533325058955, + "grad_norm": 0.13140270388519174, + "learning_rate": 4.762500224830132e-06, + "loss": 0.7353, + "step": 1144 + }, + { + "epoch": 0.568449795209135, + "grad_norm": 0.13074327953386966, + "learning_rate": 4.762084276297811e-06, + "loss": 0.765, + "step": 1145 + }, + { + "epoch": 0.5689462579123743, + "grad_norm": 0.13281986443064125, + "learning_rate": 4.761667982042176e-06, + "loss": 0.7372, + "step": 1146 + }, + { + "epoch": 0.5694427206156137, + "grad_norm": 0.13419581835055136, + "learning_rate": 4.7612513421268546e-06, + "loss": 0.7767, + "step": 1147 + }, + { + "epoch": 0.5699391833188532, + "grad_norm": 0.13612919993671027, + "learning_rate": 4.760834356615521e-06, + "loss": 0.7834, + "step": 1148 + }, + { + "epoch": 0.5704356460220926, + "grad_norm": 0.13693184772625464, + "learning_rate": 4.760417025571907e-06, + "loss": 0.7544, + "step": 1149 + }, + { + "epoch": 0.570932108725332, + "grad_norm": 0.13395312495200518, + "learning_rate": 4.759999349059793e-06, + "loss": 0.7411, + "step": 1150 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.1481697018642554, + "learning_rate": 4.759581327143015e-06, + "loss": 0.7968, + "step": 1151 + }, + { + "epoch": 0.5719250341318108, + "grad_norm": 0.1371234108294726, + "learning_rate": 4.7591629598854595e-06, + "loss": 0.8194, + "step": 1152 + }, + { + "epoch": 0.5724214968350503, + "grad_norm": 0.13604199584938104, + "learning_rate": 4.7587442473510705e-06, + "loss": 0.7528, + "step": 1153 + }, + { + "epoch": 0.5729179595382897, + "grad_norm": 0.1346379831014206, + "learning_rate": 4.758325189603838e-06, + "loss": 0.7171, + "step": 1154 + }, + { + "epoch": 0.573414422241529, + "grad_norm": 0.14237506813976503, + "learning_rate": 4.757905786707811e-06, + "loss": 0.7842, + "step": 1155 + }, + { + "epoch": 0.5739108849447685, + "grad_norm": 0.13403550334730185, + "learning_rate": 4.757486038727086e-06, + "loss": 0.7931, + "step": 1156 + }, + { + "epoch": 0.5744073476480079, + "grad_norm": 0.135231376685361, + "learning_rate": 4.757065945725816e-06, + "loss": 0.7154, + "step": 1157 + }, + { + "epoch": 0.5749038103512474, + "grad_norm": 0.13684138959608425, + "learning_rate": 4.756645507768207e-06, + "loss": 0.7383, + "step": 1158 + }, + { + "epoch": 0.5754002730544868, + "grad_norm": 0.1460330953509727, + "learning_rate": 4.756224724918513e-06, + "loss": 0.7872, + "step": 1159 + }, + { + "epoch": 0.5758967357577262, + "grad_norm": 0.1318765034689836, + "learning_rate": 4.755803597241047e-06, + "loss": 0.768, + "step": 1160 + }, + { + "epoch": 0.5763931984609656, + "grad_norm": 0.13719986250467792, + "learning_rate": 4.755382124800169e-06, + "loss": 0.7453, + "step": 1161 + }, + { + "epoch": 0.576889661164205, + "grad_norm": 0.14235485272473405, + "learning_rate": 4.754960307660296e-06, + "loss": 0.7821, + "step": 1162 + }, + { + "epoch": 0.5773861238674445, + "grad_norm": 0.1356872066401077, + "learning_rate": 4.754538145885896e-06, + "loss": 0.7501, + "step": 1163 + }, + { + "epoch": 0.5778825865706839, + "grad_norm": 0.13978700563928276, + "learning_rate": 4.754115639541489e-06, + "loss": 0.7891, + "step": 1164 + }, + { + "epoch": 0.5783790492739233, + "grad_norm": 0.1386484989822081, + "learning_rate": 4.7536927886916486e-06, + "loss": 0.8111, + "step": 1165 + }, + { + "epoch": 0.5788755119771627, + "grad_norm": 0.1352803729379626, + "learning_rate": 4.753269593401e-06, + "loss": 0.7986, + "step": 1166 + }, + { + "epoch": 0.5793719746804021, + "grad_norm": 0.14338323635053893, + "learning_rate": 4.752846053734223e-06, + "loss": 0.8083, + "step": 1167 + }, + { + "epoch": 0.5798684373836416, + "grad_norm": 0.14290830558032838, + "learning_rate": 4.752422169756048e-06, + "loss": 0.7245, + "step": 1168 + }, + { + "epoch": 0.580364900086881, + "grad_norm": 0.14435662586254663, + "learning_rate": 4.7519979415312595e-06, + "loss": 0.7937, + "step": 1169 + }, + { + "epoch": 0.5808613627901204, + "grad_norm": 0.1301445915669164, + "learning_rate": 4.751573369124693e-06, + "loss": 0.7629, + "step": 1170 + }, + { + "epoch": 0.5813578254933598, + "grad_norm": 0.14135646579813632, + "learning_rate": 4.751148452601239e-06, + "loss": 0.8281, + "step": 1171 + }, + { + "epoch": 0.5818542881965992, + "grad_norm": 0.14137941871842752, + "learning_rate": 4.750723192025839e-06, + "loss": 0.751, + "step": 1172 + }, + { + "epoch": 0.5823507508998387, + "grad_norm": 0.13846670183254522, + "learning_rate": 4.750297587463486e-06, + "loss": 0.8034, + "step": 1173 + }, + { + "epoch": 0.5828472136030781, + "grad_norm": 0.13325672340874672, + "learning_rate": 4.749871638979227e-06, + "loss": 0.6927, + "step": 1174 + }, + { + "epoch": 0.5833436763063174, + "grad_norm": 0.125743329419156, + "learning_rate": 4.749445346638163e-06, + "loss": 0.7256, + "step": 1175 + }, + { + "epoch": 0.5838401390095569, + "grad_norm": 0.1417089701135048, + "learning_rate": 4.749018710505444e-06, + "loss": 0.7831, + "step": 1176 + }, + { + "epoch": 0.5843366017127963, + "grad_norm": 0.13605194542345714, + "learning_rate": 4.748591730646276e-06, + "loss": 0.8015, + "step": 1177 + }, + { + "epoch": 0.5848330644160358, + "grad_norm": 0.13481999719592982, + "learning_rate": 4.748164407125915e-06, + "loss": 0.7615, + "step": 1178 + }, + { + "epoch": 0.5853295271192752, + "grad_norm": 0.14186151175015785, + "learning_rate": 4.747736740009671e-06, + "loss": 0.764, + "step": 1179 + }, + { + "epoch": 0.5858259898225145, + "grad_norm": 0.14184400720811477, + "learning_rate": 4.747308729362906e-06, + "loss": 0.7587, + "step": 1180 + }, + { + "epoch": 0.586322452525754, + "grad_norm": 0.1385901962211324, + "learning_rate": 4.746880375251034e-06, + "loss": 0.7336, + "step": 1181 + }, + { + "epoch": 0.5868189152289934, + "grad_norm": 0.13451407782163222, + "learning_rate": 4.7464516777395234e-06, + "loss": 0.7429, + "step": 1182 + }, + { + "epoch": 0.5873153779322329, + "grad_norm": 0.13022215035798407, + "learning_rate": 4.746022636893894e-06, + "loss": 0.7643, + "step": 1183 + }, + { + "epoch": 0.5878118406354722, + "grad_norm": 0.13690715708703163, + "learning_rate": 4.745593252779715e-06, + "loss": 0.8217, + "step": 1184 + }, + { + "epoch": 0.5883083033387116, + "grad_norm": 0.13748468118288174, + "learning_rate": 4.745163525462613e-06, + "loss": 0.7703, + "step": 1185 + }, + { + "epoch": 0.5888047660419511, + "grad_norm": 0.1405973365721223, + "learning_rate": 4.744733455008265e-06, + "loss": 0.7383, + "step": 1186 + }, + { + "epoch": 0.5893012287451905, + "grad_norm": 0.13717194937849955, + "learning_rate": 4.7443030414824e-06, + "loss": 0.7589, + "step": 1187 + }, + { + "epoch": 0.58979769144843, + "grad_norm": 0.13528229246468063, + "learning_rate": 4.743872284950799e-06, + "loss": 0.75, + "step": 1188 + }, + { + "epoch": 0.5902941541516693, + "grad_norm": 0.133425574497451, + "learning_rate": 4.743441185479298e-06, + "loss": 0.7411, + "step": 1189 + }, + { + "epoch": 0.5907906168549087, + "grad_norm": 0.1324925987986178, + "learning_rate": 4.743009743133782e-06, + "loss": 0.7685, + "step": 1190 + }, + { + "epoch": 0.5912870795581482, + "grad_norm": 0.1369066542388308, + "learning_rate": 4.742577957980191e-06, + "loss": 0.7832, + "step": 1191 + }, + { + "epoch": 0.5917835422613876, + "grad_norm": 0.13896137808156944, + "learning_rate": 4.7421458300845156e-06, + "loss": 0.7763, + "step": 1192 + }, + { + "epoch": 0.5922800049646271, + "grad_norm": 0.13714450419418717, + "learning_rate": 4.7417133595128e-06, + "loss": 0.7755, + "step": 1193 + }, + { + "epoch": 0.5927764676678664, + "grad_norm": 0.13866795158105188, + "learning_rate": 4.741280546331142e-06, + "loss": 0.7525, + "step": 1194 + }, + { + "epoch": 0.5932729303711058, + "grad_norm": 0.13821511176421178, + "learning_rate": 4.740847390605688e-06, + "loss": 0.7808, + "step": 1195 + }, + { + "epoch": 0.5937693930743453, + "grad_norm": 0.1380634473432959, + "learning_rate": 4.740413892402639e-06, + "loss": 0.7547, + "step": 1196 + }, + { + "epoch": 0.5942658557775847, + "grad_norm": 0.138297189072512, + "learning_rate": 4.73998005178825e-06, + "loss": 0.7983, + "step": 1197 + }, + { + "epoch": 0.5947623184808242, + "grad_norm": 0.13758909195503097, + "learning_rate": 4.739545868828824e-06, + "loss": 0.7437, + "step": 1198 + }, + { + "epoch": 0.5952587811840635, + "grad_norm": 0.13643426919261928, + "learning_rate": 4.739111343590722e-06, + "loss": 0.7617, + "step": 1199 + }, + { + "epoch": 0.5957552438873029, + "grad_norm": 0.13419264066031278, + "learning_rate": 4.7386764761403515e-06, + "loss": 0.7577, + "step": 1200 + }, + { + "epoch": 0.5962517065905424, + "grad_norm": 0.1458962881146663, + "learning_rate": 4.738241266544176e-06, + "loss": 0.7542, + "step": 1201 + }, + { + "epoch": 0.5967481692937818, + "grad_norm": 0.13326276596473177, + "learning_rate": 4.737805714868711e-06, + "loss": 0.7639, + "step": 1202 + }, + { + "epoch": 0.5972446319970213, + "grad_norm": 0.1327575012050733, + "learning_rate": 4.737369821180522e-06, + "loss": 0.7815, + "step": 1203 + }, + { + "epoch": 0.5977410947002606, + "grad_norm": 0.13976077843344958, + "learning_rate": 4.736933585546229e-06, + "loss": 0.771, + "step": 1204 + }, + { + "epoch": 0.5982375574035, + "grad_norm": 0.1448357997256506, + "learning_rate": 4.736497008032505e-06, + "loss": 0.797, + "step": 1205 + }, + { + "epoch": 0.5987340201067395, + "grad_norm": 0.14152292832314567, + "learning_rate": 4.7360600887060735e-06, + "loss": 0.8046, + "step": 1206 + }, + { + "epoch": 0.5992304828099789, + "grad_norm": 0.13983648625283432, + "learning_rate": 4.735622827633709e-06, + "loss": 0.7767, + "step": 1207 + }, + { + "epoch": 0.5997269455132184, + "grad_norm": 0.13723445239630844, + "learning_rate": 4.7351852248822405e-06, + "loss": 0.7898, + "step": 1208 + }, + { + "epoch": 0.6002234082164577, + "grad_norm": 0.14040576284527934, + "learning_rate": 4.734747280518549e-06, + "loss": 0.7507, + "step": 1209 + }, + { + "epoch": 0.6007198709196971, + "grad_norm": 0.13281149215833118, + "learning_rate": 4.734308994609568e-06, + "loss": 0.7344, + "step": 1210 + }, + { + "epoch": 0.6012163336229366, + "grad_norm": 0.14696680910440754, + "learning_rate": 4.73387036722228e-06, + "loss": 0.8057, + "step": 1211 + }, + { + "epoch": 0.601712796326176, + "grad_norm": 0.14656238080300293, + "learning_rate": 4.733431398423725e-06, + "loss": 0.7715, + "step": 1212 + }, + { + "epoch": 0.6022092590294155, + "grad_norm": 0.13640813199119411, + "learning_rate": 4.732992088280991e-06, + "loss": 0.7887, + "step": 1213 + }, + { + "epoch": 0.6027057217326548, + "grad_norm": 0.14416087368058042, + "learning_rate": 4.73255243686122e-06, + "loss": 0.7845, + "step": 1214 + }, + { + "epoch": 0.6032021844358942, + "grad_norm": 0.1448460344463032, + "learning_rate": 4.732112444231604e-06, + "loss": 0.7858, + "step": 1215 + }, + { + "epoch": 0.6036986471391337, + "grad_norm": 0.14284369655429804, + "learning_rate": 4.731672110459391e-06, + "loss": 0.7595, + "step": 1216 + }, + { + "epoch": 0.6041951098423731, + "grad_norm": 0.13333186060643346, + "learning_rate": 4.7312314356118774e-06, + "loss": 0.7787, + "step": 1217 + }, + { + "epoch": 0.6046915725456126, + "grad_norm": 0.13448997562242113, + "learning_rate": 4.7307904197564146e-06, + "loss": 0.7907, + "step": 1218 + }, + { + "epoch": 0.6051880352488519, + "grad_norm": 0.13856754170442995, + "learning_rate": 4.730349062960405e-06, + "loss": 0.703, + "step": 1219 + }, + { + "epoch": 0.6056844979520913, + "grad_norm": 0.13197537382149402, + "learning_rate": 4.7299073652912995e-06, + "loss": 0.7698, + "step": 1220 + }, + { + "epoch": 0.6061809606553308, + "grad_norm": 0.1308991610627273, + "learning_rate": 4.729465326816609e-06, + "loss": 0.7331, + "step": 1221 + }, + { + "epoch": 0.6066774233585702, + "grad_norm": 0.13827465076211878, + "learning_rate": 4.72902294760389e-06, + "loss": 0.8255, + "step": 1222 + }, + { + "epoch": 0.6071738860618096, + "grad_norm": 0.14152077690894366, + "learning_rate": 4.7285802277207525e-06, + "loss": 0.7757, + "step": 1223 + }, + { + "epoch": 0.607670348765049, + "grad_norm": 0.13327976935901228, + "learning_rate": 4.72813716723486e-06, + "loss": 0.7834, + "step": 1224 + }, + { + "epoch": 0.6081668114682884, + "grad_norm": 0.14560592025153601, + "learning_rate": 4.727693766213927e-06, + "loss": 0.8553, + "step": 1225 + }, + { + "epoch": 0.6086632741715279, + "grad_norm": 0.139357875506195, + "learning_rate": 4.72725002472572e-06, + "loss": 0.7334, + "step": 1226 + }, + { + "epoch": 0.6091597368747673, + "grad_norm": 0.1414789031418474, + "learning_rate": 4.726805942838058e-06, + "loss": 0.7975, + "step": 1227 + }, + { + "epoch": 0.6096561995780067, + "grad_norm": 0.13771948814756751, + "learning_rate": 4.726361520618812e-06, + "loss": 0.738, + "step": 1228 + }, + { + "epoch": 0.6101526622812461, + "grad_norm": 0.12954626047367315, + "learning_rate": 4.725916758135905e-06, + "loss": 0.7446, + "step": 1229 + }, + { + "epoch": 0.6106491249844855, + "grad_norm": 0.1321303236295978, + "learning_rate": 4.72547165545731e-06, + "loss": 0.7225, + "step": 1230 + }, + { + "epoch": 0.611145587687725, + "grad_norm": 0.13934783137333412, + "learning_rate": 4.725026212651056e-06, + "loss": 0.7504, + "step": 1231 + }, + { + "epoch": 0.6116420503909644, + "grad_norm": 0.1387209464213516, + "learning_rate": 4.72458042978522e-06, + "loss": 0.779, + "step": 1232 + }, + { + "epoch": 0.6121385130942038, + "grad_norm": 0.1417563704806412, + "learning_rate": 4.7241343069279355e-06, + "loss": 0.7727, + "step": 1233 + }, + { + "epoch": 0.6126349757974432, + "grad_norm": 0.1420634383737698, + "learning_rate": 4.723687844147383e-06, + "loss": 0.8045, + "step": 1234 + }, + { + "epoch": 0.6131314385006826, + "grad_norm": 0.142217432211696, + "learning_rate": 4.723241041511797e-06, + "loss": 0.7966, + "step": 1235 + }, + { + "epoch": 0.6136279012039221, + "grad_norm": 0.13955237043987861, + "learning_rate": 4.722793899089465e-06, + "loss": 0.763, + "step": 1236 + }, + { + "epoch": 0.6141243639071615, + "grad_norm": 0.1383129773036087, + "learning_rate": 4.7223464169487255e-06, + "loss": 0.7865, + "step": 1237 + }, + { + "epoch": 0.6146208266104008, + "grad_norm": 0.1318995281229908, + "learning_rate": 4.721898595157969e-06, + "loss": 0.7592, + "step": 1238 + }, + { + "epoch": 0.6151172893136403, + "grad_norm": 0.13409755823637834, + "learning_rate": 4.721450433785637e-06, + "loss": 0.7349, + "step": 1239 + }, + { + "epoch": 0.6156137520168797, + "grad_norm": 0.13477979687672653, + "learning_rate": 4.721001932900224e-06, + "loss": 0.7168, + "step": 1240 + }, + { + "epoch": 0.6161102147201192, + "grad_norm": 0.1339375435013451, + "learning_rate": 4.720553092570278e-06, + "loss": 0.7679, + "step": 1241 + }, + { + "epoch": 0.6166066774233586, + "grad_norm": 0.13714226350401248, + "learning_rate": 4.720103912864395e-06, + "loss": 0.8179, + "step": 1242 + }, + { + "epoch": 0.617103140126598, + "grad_norm": 0.1495165323744844, + "learning_rate": 4.719654393851225e-06, + "loss": 0.8059, + "step": 1243 + }, + { + "epoch": 0.6175996028298374, + "grad_norm": 0.13210522151583382, + "learning_rate": 4.719204535599472e-06, + "loss": 0.7327, + "step": 1244 + }, + { + "epoch": 0.6180960655330768, + "grad_norm": 0.135327563201833, + "learning_rate": 4.718754338177887e-06, + "loss": 0.7884, + "step": 1245 + }, + { + "epoch": 0.6185925282363163, + "grad_norm": 0.1493290671655753, + "learning_rate": 4.7183038016552765e-06, + "loss": 0.8058, + "step": 1246 + }, + { + "epoch": 0.6190889909395557, + "grad_norm": 0.13731834920199315, + "learning_rate": 4.717852926100497e-06, + "loss": 0.7525, + "step": 1247 + }, + { + "epoch": 0.619585453642795, + "grad_norm": 0.13121903632834106, + "learning_rate": 4.717401711582459e-06, + "loss": 0.748, + "step": 1248 + }, + { + "epoch": 0.6200819163460345, + "grad_norm": 0.13357953738588796, + "learning_rate": 4.716950158170123e-06, + "loss": 0.7191, + "step": 1249 + }, + { + "epoch": 0.6205783790492739, + "grad_norm": 0.1333298023226559, + "learning_rate": 4.716498265932501e-06, + "loss": 0.7536, + "step": 1250 + }, + { + "epoch": 0.6210748417525134, + "grad_norm": 0.13600946084924598, + "learning_rate": 4.7160460349386575e-06, + "loss": 0.7327, + "step": 1251 + }, + { + "epoch": 0.6215713044557528, + "grad_norm": 0.15789147253069744, + "learning_rate": 4.7155934652577095e-06, + "loss": 0.7542, + "step": 1252 + }, + { + "epoch": 0.6220677671589921, + "grad_norm": 0.14117884230963118, + "learning_rate": 4.7151405569588245e-06, + "loss": 0.7566, + "step": 1253 + }, + { + "epoch": 0.6225642298622316, + "grad_norm": 0.1434269696587236, + "learning_rate": 4.714687310111224e-06, + "loss": 0.7661, + "step": 1254 + }, + { + "epoch": 0.623060692565471, + "grad_norm": 0.13233837304493712, + "learning_rate": 4.714233724784176e-06, + "loss": 0.7387, + "step": 1255 + }, + { + "epoch": 0.6235571552687105, + "grad_norm": 0.13451446763955208, + "learning_rate": 4.713779801047006e-06, + "loss": 0.7593, + "step": 1256 + }, + { + "epoch": 0.6240536179719498, + "grad_norm": 0.14045733263477267, + "learning_rate": 4.713325538969089e-06, + "loss": 0.7588, + "step": 1257 + }, + { + "epoch": 0.6245500806751892, + "grad_norm": 0.13811994049159718, + "learning_rate": 4.7128709386198516e-06, + "loss": 0.7849, + "step": 1258 + }, + { + "epoch": 0.6250465433784287, + "grad_norm": 0.13661650047767832, + "learning_rate": 4.712416000068771e-06, + "loss": 0.7871, + "step": 1259 + }, + { + "epoch": 0.6255430060816681, + "grad_norm": 0.13356087506731615, + "learning_rate": 4.7119607233853795e-06, + "loss": 0.6997, + "step": 1260 + }, + { + "epoch": 0.6260394687849076, + "grad_norm": 0.14326475290125062, + "learning_rate": 4.7115051086392575e-06, + "loss": 0.7913, + "step": 1261 + }, + { + "epoch": 0.6265359314881469, + "grad_norm": 0.14111338297951223, + "learning_rate": 4.711049155900037e-06, + "loss": 0.775, + "step": 1262 + }, + { + "epoch": 0.6270323941913863, + "grad_norm": 0.1384448111488101, + "learning_rate": 4.710592865237406e-06, + "loss": 0.7453, + "step": 1263 + }, + { + "epoch": 0.6275288568946258, + "grad_norm": 0.13688056535276805, + "learning_rate": 4.710136236721099e-06, + "loss": 0.7835, + "step": 1264 + }, + { + "epoch": 0.6280253195978652, + "grad_norm": 0.1369014193152322, + "learning_rate": 4.709679270420905e-06, + "loss": 0.7708, + "step": 1265 + }, + { + "epoch": 0.6285217823011047, + "grad_norm": 0.13619728251726942, + "learning_rate": 4.709221966406664e-06, + "loss": 0.7593, + "step": 1266 + }, + { + "epoch": 0.629018245004344, + "grad_norm": 0.13341268255654185, + "learning_rate": 4.7087643247482675e-06, + "loss": 0.7857, + "step": 1267 + }, + { + "epoch": 0.6295147077075834, + "grad_norm": 0.1396936626784199, + "learning_rate": 4.70830634551566e-06, + "loss": 0.7934, + "step": 1268 + }, + { + "epoch": 0.6300111704108229, + "grad_norm": 0.13784257345636344, + "learning_rate": 4.7078480287788335e-06, + "loss": 0.8105, + "step": 1269 + }, + { + "epoch": 0.6305076331140623, + "grad_norm": 0.14024550205782157, + "learning_rate": 4.707389374607837e-06, + "loss": 0.7744, + "step": 1270 + }, + { + "epoch": 0.6310040958173018, + "grad_norm": 0.1331894256857835, + "learning_rate": 4.7069303830727665e-06, + "loss": 0.7454, + "step": 1271 + }, + { + "epoch": 0.6315005585205411, + "grad_norm": 0.13774853065612155, + "learning_rate": 4.706471054243773e-06, + "loss": 0.7572, + "step": 1272 + }, + { + "epoch": 0.6319970212237805, + "grad_norm": 0.13274604497668133, + "learning_rate": 4.7060113881910565e-06, + "loss": 0.8029, + "step": 1273 + }, + { + "epoch": 0.63249348392702, + "grad_norm": 0.13007067612592235, + "learning_rate": 4.705551384984871e-06, + "loss": 0.7329, + "step": 1274 + }, + { + "epoch": 0.6329899466302594, + "grad_norm": 0.1348605302563776, + "learning_rate": 4.705091044695519e-06, + "loss": 0.7854, + "step": 1275 + }, + { + "epoch": 0.6334864093334989, + "grad_norm": 0.13943981892113053, + "learning_rate": 4.7046303673933566e-06, + "loss": 0.7621, + "step": 1276 + }, + { + "epoch": 0.6339828720367382, + "grad_norm": 0.13080686345224168, + "learning_rate": 4.7041693531487905e-06, + "loss": 0.7511, + "step": 1277 + }, + { + "epoch": 0.6344793347399776, + "grad_norm": 0.13367213440770978, + "learning_rate": 4.703708002032281e-06, + "loss": 0.8208, + "step": 1278 + }, + { + "epoch": 0.6349757974432171, + "grad_norm": 0.1338922082718588, + "learning_rate": 4.703246314114337e-06, + "loss": 0.7583, + "step": 1279 + }, + { + "epoch": 0.6354722601464565, + "grad_norm": 0.13384133583383315, + "learning_rate": 4.702784289465521e-06, + "loss": 0.7247, + "step": 1280 + }, + { + "epoch": 0.635968722849696, + "grad_norm": 0.14316701050668065, + "learning_rate": 4.702321928156443e-06, + "loss": 0.7769, + "step": 1281 + }, + { + "epoch": 0.6364651855529353, + "grad_norm": 0.1363830874610169, + "learning_rate": 4.701859230257772e-06, + "loss": 0.7508, + "step": 1282 + }, + { + "epoch": 0.6369616482561747, + "grad_norm": 0.13044310663712724, + "learning_rate": 4.701396195840221e-06, + "loss": 0.6918, + "step": 1283 + }, + { + "epoch": 0.6374581109594142, + "grad_norm": 0.14799662097345562, + "learning_rate": 4.700932824974558e-06, + "loss": 0.8122, + "step": 1284 + }, + { + "epoch": 0.6379545736626536, + "grad_norm": 0.13719379503307672, + "learning_rate": 4.700469117731602e-06, + "loss": 0.7589, + "step": 1285 + }, + { + "epoch": 0.6384510363658931, + "grad_norm": 0.15005736439386755, + "learning_rate": 4.700005074182223e-06, + "loss": 0.7942, + "step": 1286 + }, + { + "epoch": 0.6389474990691324, + "grad_norm": 0.13361765649422278, + "learning_rate": 4.699540694397344e-06, + "loss": 0.7418, + "step": 1287 + }, + { + "epoch": 0.6394439617723718, + "grad_norm": 0.12927575638530414, + "learning_rate": 4.699075978447936e-06, + "loss": 0.7314, + "step": 1288 + }, + { + "epoch": 0.6399404244756113, + "grad_norm": 0.13136950718329457, + "learning_rate": 4.698610926405024e-06, + "loss": 0.7591, + "step": 1289 + }, + { + "epoch": 0.6404368871788507, + "grad_norm": 0.1404280522487197, + "learning_rate": 4.6981455383396845e-06, + "loss": 0.8238, + "step": 1290 + }, + { + "epoch": 0.64093334988209, + "grad_norm": 0.1303989293917867, + "learning_rate": 4.697679814323044e-06, + "loss": 0.7489, + "step": 1291 + }, + { + "epoch": 0.6414298125853295, + "grad_norm": 0.13338423096237687, + "learning_rate": 4.69721375442628e-06, + "loss": 0.7306, + "step": 1292 + }, + { + "epoch": 0.6419262752885689, + "grad_norm": 0.13305102973746538, + "learning_rate": 4.696747358720624e-06, + "loss": 0.7886, + "step": 1293 + }, + { + "epoch": 0.6424227379918084, + "grad_norm": 0.13820820437935433, + "learning_rate": 4.696280627277356e-06, + "loss": 0.7081, + "step": 1294 + }, + { + "epoch": 0.6429192006950478, + "grad_norm": 0.13801203340059798, + "learning_rate": 4.695813560167809e-06, + "loss": 0.7402, + "step": 1295 + }, + { + "epoch": 0.6434156633982872, + "grad_norm": 0.13519623515883733, + "learning_rate": 4.695346157463367e-06, + "loss": 0.7847, + "step": 1296 + }, + { + "epoch": 0.6439121261015266, + "grad_norm": 0.13214986252403507, + "learning_rate": 4.6948784192354645e-06, + "loss": 0.7734, + "step": 1297 + }, + { + "epoch": 0.644408588804766, + "grad_norm": 0.1378462366811087, + "learning_rate": 4.694410345555588e-06, + "loss": 0.6981, + "step": 1298 + }, + { + "epoch": 0.6449050515080055, + "grad_norm": 0.1366789326165814, + "learning_rate": 4.6939419364952734e-06, + "loss": 0.7557, + "step": 1299 + }, + { + "epoch": 0.6454015142112449, + "grad_norm": 0.14607757273947794, + "learning_rate": 4.693473192126112e-06, + "loss": 0.7782, + "step": 1300 + }, + { + "epoch": 0.6458979769144843, + "grad_norm": 0.13866618330491343, + "learning_rate": 4.693004112519743e-06, + "loss": 0.7827, + "step": 1301 + }, + { + "epoch": 0.6463944396177237, + "grad_norm": 0.13908697250264154, + "learning_rate": 4.692534697747858e-06, + "loss": 0.7945, + "step": 1302 + }, + { + "epoch": 0.6468909023209631, + "grad_norm": 0.13233872664417434, + "learning_rate": 4.692064947882198e-06, + "loss": 0.786, + "step": 1303 + }, + { + "epoch": 0.6473873650242026, + "grad_norm": 0.13187870756426126, + "learning_rate": 4.6915948629945585e-06, + "loss": 0.6941, + "step": 1304 + }, + { + "epoch": 0.647883827727442, + "grad_norm": 0.13132502035875618, + "learning_rate": 4.691124443156784e-06, + "loss": 0.7943, + "step": 1305 + }, + { + "epoch": 0.6483802904306813, + "grad_norm": 0.1473223707369921, + "learning_rate": 4.690653688440769e-06, + "loss": 0.758, + "step": 1306 + }, + { + "epoch": 0.6488767531339208, + "grad_norm": 0.1374434003262379, + "learning_rate": 4.6901825989184634e-06, + "loss": 0.7477, + "step": 1307 + }, + { + "epoch": 0.6493732158371602, + "grad_norm": 0.13383271780435643, + "learning_rate": 4.689711174661864e-06, + "loss": 0.7669, + "step": 1308 + }, + { + "epoch": 0.6498696785403997, + "grad_norm": 0.13444066067651259, + "learning_rate": 4.689239415743021e-06, + "loss": 0.7757, + "step": 1309 + }, + { + "epoch": 0.6503661412436391, + "grad_norm": 0.12857578926705554, + "learning_rate": 4.688767322234035e-06, + "loss": 0.7442, + "step": 1310 + }, + { + "epoch": 0.6508626039468784, + "grad_norm": 0.13406381840115994, + "learning_rate": 4.688294894207058e-06, + "loss": 0.7598, + "step": 1311 + }, + { + "epoch": 0.6513590666501179, + "grad_norm": 0.14039516549020117, + "learning_rate": 4.687822131734293e-06, + "loss": 0.7842, + "step": 1312 + }, + { + "epoch": 0.6518555293533573, + "grad_norm": 0.1374250686491604, + "learning_rate": 4.687349034887994e-06, + "loss": 0.7636, + "step": 1313 + }, + { + "epoch": 0.6523519920565968, + "grad_norm": 0.13288394681388876, + "learning_rate": 4.686875603740467e-06, + "loss": 0.7648, + "step": 1314 + }, + { + "epoch": 0.6528484547598362, + "grad_norm": 0.15493904800962308, + "learning_rate": 4.686401838364069e-06, + "loss": 0.7857, + "step": 1315 + }, + { + "epoch": 0.6533449174630755, + "grad_norm": 0.1592114663190787, + "learning_rate": 4.685927738831204e-06, + "loss": 0.7933, + "step": 1316 + }, + { + "epoch": 0.653841380166315, + "grad_norm": 0.138465643640585, + "learning_rate": 4.685453305214335e-06, + "loss": 0.7698, + "step": 1317 + }, + { + "epoch": 0.6543378428695544, + "grad_norm": 0.1379981991943177, + "learning_rate": 4.684978537585968e-06, + "loss": 0.7459, + "step": 1318 + }, + { + "epoch": 0.6548343055727939, + "grad_norm": 0.14481568247936927, + "learning_rate": 4.684503436018664e-06, + "loss": 0.7588, + "step": 1319 + }, + { + "epoch": 0.6553307682760333, + "grad_norm": 0.14210563691826714, + "learning_rate": 4.684028000585038e-06, + "loss": 0.7329, + "step": 1320 + }, + { + "epoch": 0.6558272309792726, + "grad_norm": 0.12867626236605553, + "learning_rate": 4.683552231357749e-06, + "loss": 0.7337, + "step": 1321 + }, + { + "epoch": 0.6563236936825121, + "grad_norm": 0.1376861133072269, + "learning_rate": 4.683076128409512e-06, + "loss": 0.7972, + "step": 1322 + }, + { + "epoch": 0.6568201563857515, + "grad_norm": 0.14125134793520855, + "learning_rate": 4.682599691813092e-06, + "loss": 0.7393, + "step": 1323 + }, + { + "epoch": 0.657316619088991, + "grad_norm": 0.14276487162583956, + "learning_rate": 4.682122921641305e-06, + "loss": 0.7879, + "step": 1324 + }, + { + "epoch": 0.6578130817922304, + "grad_norm": 0.13468493472362367, + "learning_rate": 4.681645817967017e-06, + "loss": 0.7335, + "step": 1325 + }, + { + "epoch": 0.6583095444954697, + "grad_norm": 0.13943793893639297, + "learning_rate": 4.681168380863145e-06, + "loss": 0.758, + "step": 1326 + }, + { + "epoch": 0.6588060071987092, + "grad_norm": 0.13951539561696433, + "learning_rate": 4.680690610402659e-06, + "loss": 0.8151, + "step": 1327 + }, + { + "epoch": 0.6593024699019486, + "grad_norm": 0.13406494700163638, + "learning_rate": 4.6802125066585765e-06, + "loss": 0.7013, + "step": 1328 + }, + { + "epoch": 0.6597989326051881, + "grad_norm": 0.14160309961844755, + "learning_rate": 4.679734069703971e-06, + "loss": 0.7574, + "step": 1329 + }, + { + "epoch": 0.6602953953084274, + "grad_norm": 0.13446373370464942, + "learning_rate": 4.679255299611961e-06, + "loss": 0.7507, + "step": 1330 + }, + { + "epoch": 0.6607918580116668, + "grad_norm": 0.12942976744487117, + "learning_rate": 4.678776196455722e-06, + "loss": 0.7311, + "step": 1331 + }, + { + "epoch": 0.6612883207149063, + "grad_norm": 0.1391321723122554, + "learning_rate": 4.678296760308474e-06, + "loss": 0.7981, + "step": 1332 + }, + { + "epoch": 0.6617847834181457, + "grad_norm": 0.14428783420102687, + "learning_rate": 4.677816991243493e-06, + "loss": 0.8215, + "step": 1333 + }, + { + "epoch": 0.6622812461213852, + "grad_norm": 0.1394878539696028, + "learning_rate": 4.677336889334103e-06, + "loss": 0.7721, + "step": 1334 + }, + { + "epoch": 0.6627777088246245, + "grad_norm": 0.15266983844158868, + "learning_rate": 4.676856454653681e-06, + "loss": 0.7336, + "step": 1335 + }, + { + "epoch": 0.6632741715278639, + "grad_norm": 0.14455346704491276, + "learning_rate": 4.676375687275653e-06, + "loss": 0.7278, + "step": 1336 + }, + { + "epoch": 0.6637706342311034, + "grad_norm": 0.1316229836144885, + "learning_rate": 4.675894587273496e-06, + "loss": 0.7647, + "step": 1337 + }, + { + "epoch": 0.6642670969343428, + "grad_norm": 0.12941666247842312, + "learning_rate": 4.6754131547207406e-06, + "loss": 0.7125, + "step": 1338 + }, + { + "epoch": 0.6647635596375823, + "grad_norm": 0.14046373964383968, + "learning_rate": 4.674931389690963e-06, + "loss": 0.7341, + "step": 1339 + }, + { + "epoch": 0.6652600223408216, + "grad_norm": 0.13645194981403033, + "learning_rate": 4.674449292257796e-06, + "loss": 0.7676, + "step": 1340 + }, + { + "epoch": 0.665756485044061, + "grad_norm": 0.14519132705834006, + "learning_rate": 4.6739668624949196e-06, + "loss": 0.7952, + "step": 1341 + }, + { + "epoch": 0.6662529477473005, + "grad_norm": 0.13696033048575837, + "learning_rate": 4.6734841004760644e-06, + "loss": 0.7269, + "step": 1342 + }, + { + "epoch": 0.6667494104505399, + "grad_norm": 0.134676078421444, + "learning_rate": 4.673001006275013e-06, + "loss": 0.7453, + "step": 1343 + }, + { + "epoch": 0.6672458731537794, + "grad_norm": 0.14578953183512341, + "learning_rate": 4.672517579965601e-06, + "loss": 0.8015, + "step": 1344 + }, + { + "epoch": 0.6677423358570187, + "grad_norm": 0.13466978301237612, + "learning_rate": 4.6720338216217096e-06, + "loss": 0.7375, + "step": 1345 + }, + { + "epoch": 0.6682387985602581, + "grad_norm": 0.13431133998664943, + "learning_rate": 4.671549731317274e-06, + "loss": 0.7931, + "step": 1346 + }, + { + "epoch": 0.6687352612634976, + "grad_norm": 0.14300934227788512, + "learning_rate": 4.67106530912628e-06, + "loss": 0.8223, + "step": 1347 + }, + { + "epoch": 0.669231723966737, + "grad_norm": 0.1335079638925206, + "learning_rate": 4.670580555122765e-06, + "loss": 0.7081, + "step": 1348 + }, + { + "epoch": 0.6697281866699765, + "grad_norm": 0.13925505067882116, + "learning_rate": 4.670095469380814e-06, + "loss": 0.7556, + "step": 1349 + }, + { + "epoch": 0.6702246493732158, + "grad_norm": 0.1385402073066566, + "learning_rate": 4.669610051974566e-06, + "loss": 0.7862, + "step": 1350 + }, + { + "epoch": 0.6707211120764552, + "grad_norm": 0.14168635102257782, + "learning_rate": 4.669124302978208e-06, + "loss": 0.7532, + "step": 1351 + }, + { + "epoch": 0.6712175747796947, + "grad_norm": 0.1319645107321289, + "learning_rate": 4.6686382224659795e-06, + "loss": 0.721, + "step": 1352 + }, + { + "epoch": 0.6717140374829341, + "grad_norm": 0.14159994069821713, + "learning_rate": 4.66815181051217e-06, + "loss": 0.7324, + "step": 1353 + }, + { + "epoch": 0.6722105001861736, + "grad_norm": 0.13302384063587522, + "learning_rate": 4.667665067191121e-06, + "loss": 0.7914, + "step": 1354 + }, + { + "epoch": 0.6727069628894129, + "grad_norm": 0.14520590994816845, + "learning_rate": 4.667177992577222e-06, + "loss": 0.803, + "step": 1355 + }, + { + "epoch": 0.6732034255926523, + "grad_norm": 0.13258982595212945, + "learning_rate": 4.666690586744914e-06, + "loss": 0.7802, + "step": 1356 + }, + { + "epoch": 0.6736998882958918, + "grad_norm": 0.14243373706488904, + "learning_rate": 4.666202849768691e-06, + "loss": 0.7915, + "step": 1357 + }, + { + "epoch": 0.6741963509991312, + "grad_norm": 0.14197795066279506, + "learning_rate": 4.6657147817230945e-06, + "loss": 0.7584, + "step": 1358 + }, + { + "epoch": 0.6746928137023707, + "grad_norm": 0.13496558186041357, + "learning_rate": 4.665226382682718e-06, + "loss": 0.735, + "step": 1359 + }, + { + "epoch": 0.67518927640561, + "grad_norm": 0.13833577662827512, + "learning_rate": 4.664737652722205e-06, + "loss": 0.7653, + "step": 1360 + }, + { + "epoch": 0.6756857391088494, + "grad_norm": 0.13481054586272334, + "learning_rate": 4.664248591916252e-06, + "loss": 0.7518, + "step": 1361 + }, + { + "epoch": 0.6761822018120889, + "grad_norm": 0.13380123421346032, + "learning_rate": 4.663759200339603e-06, + "loss": 0.742, + "step": 1362 + }, + { + "epoch": 0.6766786645153283, + "grad_norm": 0.13210724903338833, + "learning_rate": 4.663269478067053e-06, + "loss": 0.7546, + "step": 1363 + }, + { + "epoch": 0.6771751272185677, + "grad_norm": 0.13236042360384812, + "learning_rate": 4.6627794251734485e-06, + "loss": 0.7404, + "step": 1364 + }, + { + "epoch": 0.6776715899218071, + "grad_norm": 0.13755480780239387, + "learning_rate": 4.662289041733686e-06, + "loss": 0.7639, + "step": 1365 + }, + { + "epoch": 0.6781680526250465, + "grad_norm": 0.13277407792473037, + "learning_rate": 4.661798327822713e-06, + "loss": 0.78, + "step": 1366 + }, + { + "epoch": 0.678664515328286, + "grad_norm": 0.13349951689576497, + "learning_rate": 4.661307283515528e-06, + "loss": 0.7362, + "step": 1367 + }, + { + "epoch": 0.6791609780315254, + "grad_norm": 0.1454330089242611, + "learning_rate": 4.660815908887179e-06, + "loss": 0.786, + "step": 1368 + }, + { + "epoch": 0.6796574407347648, + "grad_norm": 0.13805559122664302, + "learning_rate": 4.660324204012764e-06, + "loss": 0.7168, + "step": 1369 + }, + { + "epoch": 0.6801539034380042, + "grad_norm": 0.1390252829777148, + "learning_rate": 4.659832168967432e-06, + "loss": 0.736, + "step": 1370 + }, + { + "epoch": 0.6806503661412436, + "grad_norm": 0.13618470081629033, + "learning_rate": 4.659339803826384e-06, + "loss": 0.7786, + "step": 1371 + }, + { + "epoch": 0.6811468288444831, + "grad_norm": 0.13932005962814825, + "learning_rate": 4.658847108664869e-06, + "loss": 0.7333, + "step": 1372 + }, + { + "epoch": 0.6816432915477225, + "grad_norm": 0.1357691607502142, + "learning_rate": 4.6583540835581885e-06, + "loss": 0.7899, + "step": 1373 + }, + { + "epoch": 0.6821397542509618, + "grad_norm": 0.1464322972528331, + "learning_rate": 4.657860728581692e-06, + "loss": 0.8177, + "step": 1374 + }, + { + "epoch": 0.6826362169542013, + "grad_norm": 0.13218140620942265, + "learning_rate": 4.657367043810783e-06, + "loss": 0.7308, + "step": 1375 + }, + { + "epoch": 0.6831326796574407, + "grad_norm": 0.1461276762453906, + "learning_rate": 4.656873029320911e-06, + "loss": 0.7628, + "step": 1376 + }, + { + "epoch": 0.6836291423606802, + "grad_norm": 0.13682220089172875, + "learning_rate": 4.656378685187579e-06, + "loss": 0.8118, + "step": 1377 + }, + { + "epoch": 0.6841256050639196, + "grad_norm": 0.13808113028870317, + "learning_rate": 4.655884011486341e-06, + "loss": 0.778, + "step": 1378 + }, + { + "epoch": 0.684622067767159, + "grad_norm": 0.13141352149483548, + "learning_rate": 4.655389008292798e-06, + "loss": 0.7884, + "step": 1379 + }, + { + "epoch": 0.6851185304703984, + "grad_norm": 0.13773832084927776, + "learning_rate": 4.654893675682605e-06, + "loss": 0.7103, + "step": 1380 + }, + { + "epoch": 0.6856149931736378, + "grad_norm": 0.139138670836966, + "learning_rate": 4.654398013731464e-06, + "loss": 0.7643, + "step": 1381 + }, + { + "epoch": 0.6861114558768773, + "grad_norm": 0.13544047173805013, + "learning_rate": 4.65390202251513e-06, + "loss": 0.7806, + "step": 1382 + }, + { + "epoch": 0.6866079185801167, + "grad_norm": 0.13728555567097042, + "learning_rate": 4.653405702109407e-06, + "loss": 0.7961, + "step": 1383 + }, + { + "epoch": 0.687104381283356, + "grad_norm": 0.13134136227546692, + "learning_rate": 4.65290905259015e-06, + "loss": 0.7708, + "step": 1384 + }, + { + "epoch": 0.6876008439865955, + "grad_norm": 0.14072716822510162, + "learning_rate": 4.652412074033263e-06, + "loss": 0.7598, + "step": 1385 + }, + { + "epoch": 0.6880973066898349, + "grad_norm": 0.18508181188821365, + "learning_rate": 4.651914766514703e-06, + "loss": 0.7481, + "step": 1386 + }, + { + "epoch": 0.6885937693930744, + "grad_norm": 0.13447285883304055, + "learning_rate": 4.651417130110473e-06, + "loss": 0.7048, + "step": 1387 + }, + { + "epoch": 0.6890902320963138, + "grad_norm": 0.13797950483454202, + "learning_rate": 4.65091916489663e-06, + "loss": 0.7958, + "step": 1388 + }, + { + "epoch": 0.6895866947995531, + "grad_norm": 0.14969109022454788, + "learning_rate": 4.65042087094928e-06, + "loss": 0.8282, + "step": 1389 + }, + { + "epoch": 0.6900831575027926, + "grad_norm": 0.13393294593783833, + "learning_rate": 4.64992224834458e-06, + "loss": 0.6999, + "step": 1390 + }, + { + "epoch": 0.690579620206032, + "grad_norm": 0.13957414144835012, + "learning_rate": 4.649423297158736e-06, + "loss": 0.8215, + "step": 1391 + }, + { + "epoch": 0.6910760829092715, + "grad_norm": 0.13264027359412842, + "learning_rate": 4.648924017468003e-06, + "loss": 0.7282, + "step": 1392 + }, + { + "epoch": 0.6915725456125109, + "grad_norm": 0.13318683074105853, + "learning_rate": 4.648424409348691e-06, + "loss": 0.7237, + "step": 1393 + }, + { + "epoch": 0.6920690083157502, + "grad_norm": 0.14079961861355597, + "learning_rate": 4.647924472877154e-06, + "loss": 0.7589, + "step": 1394 + }, + { + "epoch": 0.6925654710189897, + "grad_norm": 0.13173357758319654, + "learning_rate": 4.647424208129801e-06, + "loss": 0.7327, + "step": 1395 + }, + { + "epoch": 0.6930619337222291, + "grad_norm": 0.13990361296789605, + "learning_rate": 4.646923615183089e-06, + "loss": 0.7952, + "step": 1396 + }, + { + "epoch": 0.6935583964254686, + "grad_norm": 0.13139251687364273, + "learning_rate": 4.646422694113526e-06, + "loss": 0.7317, + "step": 1397 + }, + { + "epoch": 0.6940548591287079, + "grad_norm": 0.14426243953892712, + "learning_rate": 4.645921444997669e-06, + "loss": 0.807, + "step": 1398 + }, + { + "epoch": 0.6945513218319473, + "grad_norm": 0.13812024833123682, + "learning_rate": 4.645419867912127e-06, + "loss": 0.7351, + "step": 1399 + }, + { + "epoch": 0.6950477845351868, + "grad_norm": 0.1382819086009079, + "learning_rate": 4.644917962933558e-06, + "loss": 0.7554, + "step": 1400 + }, + { + "epoch": 0.6955442472384262, + "grad_norm": 0.13000958747168673, + "learning_rate": 4.644415730138669e-06, + "loss": 0.7277, + "step": 1401 + }, + { + "epoch": 0.6960407099416657, + "grad_norm": 0.14153919488215605, + "learning_rate": 4.643913169604218e-06, + "loss": 0.8176, + "step": 1402 + }, + { + "epoch": 0.696537172644905, + "grad_norm": 0.13514290656512232, + "learning_rate": 4.643410281407014e-06, + "loss": 0.7458, + "step": 1403 + }, + { + "epoch": 0.6970336353481444, + "grad_norm": 0.1323956462686169, + "learning_rate": 4.642907065623916e-06, + "loss": 0.7801, + "step": 1404 + }, + { + "epoch": 0.6975300980513839, + "grad_norm": 0.13517969708499833, + "learning_rate": 4.642403522331832e-06, + "loss": 0.7493, + "step": 1405 + }, + { + "epoch": 0.6980265607546233, + "grad_norm": 0.14025387703072958, + "learning_rate": 4.641899651607721e-06, + "loss": 0.7722, + "step": 1406 + }, + { + "epoch": 0.6985230234578628, + "grad_norm": 0.13173070675094867, + "learning_rate": 4.64139545352859e-06, + "loss": 0.7185, + "step": 1407 + }, + { + "epoch": 0.6990194861611021, + "grad_norm": 0.13136540290209375, + "learning_rate": 4.640890928171499e-06, + "loss": 0.7754, + "step": 1408 + }, + { + "epoch": 0.6995159488643415, + "grad_norm": 0.13827112688996815, + "learning_rate": 4.640386075613556e-06, + "loss": 0.792, + "step": 1409 + }, + { + "epoch": 0.700012411567581, + "grad_norm": 0.13975488734948466, + "learning_rate": 4.639880895931919e-06, + "loss": 0.7988, + "step": 1410 + }, + { + "epoch": 0.7005088742708204, + "grad_norm": 0.1336382331212796, + "learning_rate": 4.6393753892038e-06, + "loss": 0.7215, + "step": 1411 + }, + { + "epoch": 0.7010053369740599, + "grad_norm": 0.13724481212117196, + "learning_rate": 4.638869555506452e-06, + "loss": 0.7844, + "step": 1412 + }, + { + "epoch": 0.7015017996772992, + "grad_norm": 0.13404812402428848, + "learning_rate": 4.638363394917189e-06, + "loss": 0.7224, + "step": 1413 + }, + { + "epoch": 0.7019982623805386, + "grad_norm": 0.1407590590379482, + "learning_rate": 4.637856907513366e-06, + "loss": 0.7599, + "step": 1414 + }, + { + "epoch": 0.7024947250837781, + "grad_norm": 0.14010062720805114, + "learning_rate": 4.637350093372393e-06, + "loss": 0.7773, + "step": 1415 + }, + { + "epoch": 0.7029911877870175, + "grad_norm": 0.13348329573150353, + "learning_rate": 4.636842952571727e-06, + "loss": 0.761, + "step": 1416 + }, + { + "epoch": 0.703487650490257, + "grad_norm": 0.14369061438866643, + "learning_rate": 4.636335485188879e-06, + "loss": 0.7699, + "step": 1417 + }, + { + "epoch": 0.7039841131934963, + "grad_norm": 0.13433232422419997, + "learning_rate": 4.635827691301404e-06, + "loss": 0.7661, + "step": 1418 + }, + { + "epoch": 0.7044805758967357, + "grad_norm": 0.14032937963835693, + "learning_rate": 4.635319570986913e-06, + "loss": 0.7695, + "step": 1419 + }, + { + "epoch": 0.7049770385999752, + "grad_norm": 0.13443137094696417, + "learning_rate": 4.634811124323062e-06, + "loss": 0.7334, + "step": 1420 + }, + { + "epoch": 0.7054735013032146, + "grad_norm": 0.1342398609304696, + "learning_rate": 4.63430235138756e-06, + "loss": 0.7465, + "step": 1421 + }, + { + "epoch": 0.7059699640064541, + "grad_norm": 0.13174620950915217, + "learning_rate": 4.6337932522581656e-06, + "loss": 0.7551, + "step": 1422 + }, + { + "epoch": 0.7064664267096934, + "grad_norm": 0.1439021802463562, + "learning_rate": 4.633283827012684e-06, + "loss": 0.7224, + "step": 1423 + }, + { + "epoch": 0.7069628894129328, + "grad_norm": 0.1431482149278234, + "learning_rate": 4.632774075728974e-06, + "loss": 0.8265, + "step": 1424 + }, + { + "epoch": 0.7074593521161723, + "grad_norm": 0.13651431312093584, + "learning_rate": 4.632263998484944e-06, + "loss": 0.7448, + "step": 1425 + }, + { + "epoch": 0.7079558148194117, + "grad_norm": 0.13562176642096896, + "learning_rate": 4.63175359535855e-06, + "loss": 0.731, + "step": 1426 + }, + { + "epoch": 0.7084522775226512, + "grad_norm": 0.1331841089582722, + "learning_rate": 4.631242866427798e-06, + "loss": 0.7785, + "step": 1427 + }, + { + "epoch": 0.7089487402258905, + "grad_norm": 0.13572962352737827, + "learning_rate": 4.6307318117707465e-06, + "loss": 0.7472, + "step": 1428 + }, + { + "epoch": 0.7094452029291299, + "grad_norm": 0.1337025002321428, + "learning_rate": 4.630220431465501e-06, + "loss": 0.7177, + "step": 1429 + }, + { + "epoch": 0.7099416656323694, + "grad_norm": 0.13714690654454087, + "learning_rate": 4.629708725590219e-06, + "loss": 0.7595, + "step": 1430 + }, + { + "epoch": 0.7104381283356088, + "grad_norm": 0.13237056679093584, + "learning_rate": 4.629196694223104e-06, + "loss": 0.7719, + "step": 1431 + }, + { + "epoch": 0.7109345910388482, + "grad_norm": 0.1347783497383712, + "learning_rate": 4.628684337442414e-06, + "loss": 0.7559, + "step": 1432 + }, + { + "epoch": 0.7114310537420876, + "grad_norm": 0.14212353203573067, + "learning_rate": 4.6281716553264535e-06, + "loss": 0.7647, + "step": 1433 + }, + { + "epoch": 0.711927516445327, + "grad_norm": 0.1391353138825361, + "learning_rate": 4.627658647953579e-06, + "loss": 0.7869, + "step": 1434 + }, + { + "epoch": 0.7124239791485665, + "grad_norm": 0.1367605845380993, + "learning_rate": 4.6271453154021936e-06, + "loss": 0.7652, + "step": 1435 + }, + { + "epoch": 0.7129204418518059, + "grad_norm": 0.13776225189007474, + "learning_rate": 4.626631657750754e-06, + "loss": 0.7352, + "step": 1436 + }, + { + "epoch": 0.7134169045550453, + "grad_norm": 0.15623278977035873, + "learning_rate": 4.626117675077762e-06, + "loss": 0.772, + "step": 1437 + }, + { + "epoch": 0.7139133672582847, + "grad_norm": 0.13542364624954573, + "learning_rate": 4.625603367461775e-06, + "loss": 0.755, + "step": 1438 + }, + { + "epoch": 0.7144098299615241, + "grad_norm": 0.1363585075610717, + "learning_rate": 4.6250887349813935e-06, + "loss": 0.7923, + "step": 1439 + }, + { + "epoch": 0.7149062926647636, + "grad_norm": 0.1349317484055486, + "learning_rate": 4.6245737777152725e-06, + "loss": 0.737, + "step": 1440 + }, + { + "epoch": 0.715402755368003, + "grad_norm": 0.12840691438132237, + "learning_rate": 4.624058495742115e-06, + "loss": 0.7329, + "step": 1441 + }, + { + "epoch": 0.7158992180712423, + "grad_norm": 0.14115995132355305, + "learning_rate": 4.623542889140671e-06, + "loss": 0.7799, + "step": 1442 + }, + { + "epoch": 0.7163956807744818, + "grad_norm": 0.1357743633796934, + "learning_rate": 4.623026957989746e-06, + "loss": 0.7492, + "step": 1443 + }, + { + "epoch": 0.7168921434777212, + "grad_norm": 0.140747920382236, + "learning_rate": 4.622510702368191e-06, + "loss": 0.7822, + "step": 1444 + }, + { + "epoch": 0.7173886061809607, + "grad_norm": 0.13586217424188604, + "learning_rate": 4.621994122354907e-06, + "loss": 0.737, + "step": 1445 + }, + { + "epoch": 0.7178850688842001, + "grad_norm": 0.1319421489214161, + "learning_rate": 4.621477218028845e-06, + "loss": 0.7439, + "step": 1446 + }, + { + "epoch": 0.7183815315874394, + "grad_norm": 0.1467289582504143, + "learning_rate": 4.620959989469005e-06, + "loss": 0.7969, + "step": 1447 + }, + { + "epoch": 0.7188779942906789, + "grad_norm": 0.13593369108799191, + "learning_rate": 4.620442436754438e-06, + "loss": 0.7792, + "step": 1448 + }, + { + "epoch": 0.7193744569939183, + "grad_norm": 0.13667469147949424, + "learning_rate": 4.619924559964243e-06, + "loss": 0.7416, + "step": 1449 + }, + { + "epoch": 0.7198709196971578, + "grad_norm": 0.13941735212726125, + "learning_rate": 4.61940635917757e-06, + "loss": 0.7836, + "step": 1450 + }, + { + "epoch": 0.7203673824003972, + "grad_norm": 0.14522609940766362, + "learning_rate": 4.618887834473616e-06, + "loss": 0.8191, + "step": 1451 + }, + { + "epoch": 0.7208638451036365, + "grad_norm": 0.1347623385534868, + "learning_rate": 4.618368985931631e-06, + "loss": 0.72, + "step": 1452 + }, + { + "epoch": 0.721360307806876, + "grad_norm": 0.13229916497360666, + "learning_rate": 4.617849813630913e-06, + "loss": 0.7019, + "step": 1453 + }, + { + "epoch": 0.7218567705101154, + "grad_norm": 0.13890901285175117, + "learning_rate": 4.617330317650806e-06, + "loss": 0.7371, + "step": 1454 + }, + { + "epoch": 0.7223532332133549, + "grad_norm": 0.13269598761541726, + "learning_rate": 4.6168104980707105e-06, + "loss": 0.7291, + "step": 1455 + }, + { + "epoch": 0.7228496959165943, + "grad_norm": 0.1348717930791648, + "learning_rate": 4.61629035497007e-06, + "loss": 0.7328, + "step": 1456 + }, + { + "epoch": 0.7233461586198336, + "grad_norm": 0.13798017679082805, + "learning_rate": 4.615769888428382e-06, + "loss": 0.7664, + "step": 1457 + }, + { + "epoch": 0.7238426213230731, + "grad_norm": 0.13280778392582876, + "learning_rate": 4.615249098525189e-06, + "loss": 0.73, + "step": 1458 + }, + { + "epoch": 0.7243390840263125, + "grad_norm": 0.13189279813229704, + "learning_rate": 4.614727985340087e-06, + "loss": 0.7065, + "step": 1459 + }, + { + "epoch": 0.724835546729552, + "grad_norm": 0.1378715254587552, + "learning_rate": 4.61420654895272e-06, + "loss": 0.7774, + "step": 1460 + }, + { + "epoch": 0.7253320094327914, + "grad_norm": 0.1311464168604436, + "learning_rate": 4.613684789442781e-06, + "loss": 0.7273, + "step": 1461 + }, + { + "epoch": 0.7258284721360307, + "grad_norm": 0.14345074089926546, + "learning_rate": 4.613162706890011e-06, + "loss": 0.7582, + "step": 1462 + }, + { + "epoch": 0.7263249348392702, + "grad_norm": 0.13588405827285918, + "learning_rate": 4.612640301374204e-06, + "loss": 0.7507, + "step": 1463 + }, + { + "epoch": 0.7268213975425096, + "grad_norm": 0.13152235282939082, + "learning_rate": 4.6121175729752e-06, + "loss": 0.7425, + "step": 1464 + }, + { + "epoch": 0.7273178602457491, + "grad_norm": 0.1387161072572674, + "learning_rate": 4.611594521772891e-06, + "loss": 0.7184, + "step": 1465 + }, + { + "epoch": 0.7278143229489885, + "grad_norm": 0.13801900811363219, + "learning_rate": 4.611071147847216e-06, + "loss": 0.7633, + "step": 1466 + }, + { + "epoch": 0.7283107856522278, + "grad_norm": 0.2567131236080573, + "learning_rate": 4.610547451278164e-06, + "loss": 0.7701, + "step": 1467 + }, + { + "epoch": 0.7288072483554673, + "grad_norm": 0.14345292271439838, + "learning_rate": 4.6100234321457746e-06, + "loss": 0.749, + "step": 1468 + }, + { + "epoch": 0.7293037110587067, + "grad_norm": 0.13240897000188626, + "learning_rate": 4.6094990905301354e-06, + "loss": 0.7646, + "step": 1469 + }, + { + "epoch": 0.7298001737619462, + "grad_norm": 0.13348526628448198, + "learning_rate": 4.608974426511383e-06, + "loss": 0.7496, + "step": 1470 + }, + { + "epoch": 0.7302966364651855, + "grad_norm": 0.14970955112386783, + "learning_rate": 4.608449440169705e-06, + "loss": 0.7601, + "step": 1471 + }, + { + "epoch": 0.7307930991684249, + "grad_norm": 0.1407703027493505, + "learning_rate": 4.607924131585336e-06, + "loss": 0.7591, + "step": 1472 + }, + { + "epoch": 0.7312895618716644, + "grad_norm": 0.1318082753134835, + "learning_rate": 4.607398500838561e-06, + "loss": 0.7051, + "step": 1473 + }, + { + "epoch": 0.7317860245749038, + "grad_norm": 0.34847062208945645, + "learning_rate": 4.606872548009716e-06, + "loss": 0.7823, + "step": 1474 + }, + { + "epoch": 0.7322824872781433, + "grad_norm": 0.13028506291705924, + "learning_rate": 4.606346273179182e-06, + "loss": 0.7068, + "step": 1475 + }, + { + "epoch": 0.7327789499813826, + "grad_norm": 0.1452834099987651, + "learning_rate": 4.605819676427393e-06, + "loss": 0.7924, + "step": 1476 + }, + { + "epoch": 0.733275412684622, + "grad_norm": 0.13237347766328517, + "learning_rate": 4.605292757834832e-06, + "loss": 0.7094, + "step": 1477 + }, + { + "epoch": 0.7337718753878615, + "grad_norm": 0.14517976496072124, + "learning_rate": 4.6047655174820275e-06, + "loss": 0.7906, + "step": 1478 + }, + { + "epoch": 0.7342683380911009, + "grad_norm": 0.14057990814296611, + "learning_rate": 4.604237955449561e-06, + "loss": 0.7671, + "step": 1479 + }, + { + "epoch": 0.7347648007943404, + "grad_norm": 0.13415285581754496, + "learning_rate": 4.603710071818062e-06, + "loss": 0.6983, + "step": 1480 + }, + { + "epoch": 0.7352612634975797, + "grad_norm": 0.13508721935027235, + "learning_rate": 4.603181866668209e-06, + "loss": 0.7999, + "step": 1481 + }, + { + "epoch": 0.7357577262008191, + "grad_norm": 0.19074847415713053, + "learning_rate": 4.60265334008073e-06, + "loss": 0.7689, + "step": 1482 + }, + { + "epoch": 0.7362541889040586, + "grad_norm": 0.1359795575688059, + "learning_rate": 4.602124492136401e-06, + "loss": 0.7653, + "step": 1483 + }, + { + "epoch": 0.736750651607298, + "grad_norm": 0.15392110510905885, + "learning_rate": 4.601595322916049e-06, + "loss": 0.7726, + "step": 1484 + }, + { + "epoch": 0.7372471143105375, + "grad_norm": 0.1400383089128481, + "learning_rate": 4.601065832500548e-06, + "loss": 0.7744, + "step": 1485 + }, + { + "epoch": 0.7377435770137768, + "grad_norm": 0.14190989377886226, + "learning_rate": 4.600536020970822e-06, + "loss": 0.779, + "step": 1486 + }, + { + "epoch": 0.7382400397170162, + "grad_norm": 0.14143141374332557, + "learning_rate": 4.600005888407846e-06, + "loss": 0.8004, + "step": 1487 + }, + { + "epoch": 0.7387365024202557, + "grad_norm": 0.1448247491292055, + "learning_rate": 4.59947543489264e-06, + "loss": 0.7369, + "step": 1488 + }, + { + "epoch": 0.7392329651234951, + "grad_norm": 0.14353791020576437, + "learning_rate": 4.598944660506276e-06, + "loss": 0.7517, + "step": 1489 + }, + { + "epoch": 0.7397294278267346, + "grad_norm": 0.1397411403947714, + "learning_rate": 4.598413565329876e-06, + "loss": 0.7872, + "step": 1490 + }, + { + "epoch": 0.7402258905299739, + "grad_norm": 0.14686545557180947, + "learning_rate": 4.597882149444607e-06, + "loss": 0.7655, + "step": 1491 + }, + { + "epoch": 0.7407223532332133, + "grad_norm": 0.13888886824301397, + "learning_rate": 4.597350412931688e-06, + "loss": 0.7759, + "step": 1492 + }, + { + "epoch": 0.7412188159364528, + "grad_norm": 0.13368575317341758, + "learning_rate": 4.5968183558723876e-06, + "loss": 0.7642, + "step": 1493 + }, + { + "epoch": 0.7417152786396922, + "grad_norm": 0.13671963488130195, + "learning_rate": 4.596285978348022e-06, + "loss": 0.8001, + "step": 1494 + }, + { + "epoch": 0.7422117413429317, + "grad_norm": 0.13436579260102113, + "learning_rate": 4.595753280439955e-06, + "loss": 0.7504, + "step": 1495 + }, + { + "epoch": 0.742708204046171, + "grad_norm": 0.1350714284302843, + "learning_rate": 4.5952202622296015e-06, + "loss": 0.7258, + "step": 1496 + }, + { + "epoch": 0.7432046667494104, + "grad_norm": 0.14306485237289734, + "learning_rate": 4.594686923798426e-06, + "loss": 0.7651, + "step": 1497 + }, + { + "epoch": 0.7437011294526499, + "grad_norm": 0.13844557571686955, + "learning_rate": 4.594153265227941e-06, + "loss": 0.7099, + "step": 1498 + }, + { + "epoch": 0.7441975921558893, + "grad_norm": 0.13813214807550506, + "learning_rate": 4.5936192865997055e-06, + "loss": 0.7654, + "step": 1499 + }, + { + "epoch": 0.7446940548591288, + "grad_norm": 0.14890702452844934, + "learning_rate": 4.59308498799533e-06, + "loss": 0.7431, + "step": 1500 + }, + { + "epoch": 0.7451905175623681, + "grad_norm": 0.13792041924804616, + "learning_rate": 4.592550369496475e-06, + "loss": 0.7929, + "step": 1501 + }, + { + "epoch": 0.7456869802656075, + "grad_norm": 0.14078699310970763, + "learning_rate": 4.592015431184847e-06, + "loss": 0.7453, + "step": 1502 + }, + { + "epoch": 0.746183442968847, + "grad_norm": 0.14047870303022741, + "learning_rate": 4.591480173142204e-06, + "loss": 0.7569, + "step": 1503 + }, + { + "epoch": 0.7466799056720864, + "grad_norm": 0.1306675590393236, + "learning_rate": 4.590944595450351e-06, + "loss": 0.7193, + "step": 1504 + }, + { + "epoch": 0.7471763683753258, + "grad_norm": 0.1360716461251853, + "learning_rate": 4.590408698191142e-06, + "loss": 0.7554, + "step": 1505 + }, + { + "epoch": 0.7476728310785652, + "grad_norm": 0.13401655261545067, + "learning_rate": 4.58987248144648e-06, + "loss": 0.7505, + "step": 1506 + }, + { + "epoch": 0.7481692937818046, + "grad_norm": 0.13656970997298545, + "learning_rate": 4.589335945298318e-06, + "loss": 0.7943, + "step": 1507 + }, + { + "epoch": 0.7486657564850441, + "grad_norm": 0.13808934356621622, + "learning_rate": 4.588799089828657e-06, + "loss": 0.8049, + "step": 1508 + }, + { + "epoch": 0.7491622191882835, + "grad_norm": 0.14038587528162197, + "learning_rate": 4.588261915119547e-06, + "loss": 0.7568, + "step": 1509 + }, + { + "epoch": 0.7496586818915228, + "grad_norm": 0.13223604221937782, + "learning_rate": 4.587724421253085e-06, + "loss": 0.7214, + "step": 1510 + }, + { + "epoch": 0.7501551445947623, + "grad_norm": 0.13312725764514682, + "learning_rate": 4.5871866083114206e-06, + "loss": 0.7061, + "step": 1511 + }, + { + "epoch": 0.7506516072980017, + "grad_norm": 0.13511067606997035, + "learning_rate": 4.586648476376747e-06, + "loss": 0.7755, + "step": 1512 + }, + { + "epoch": 0.7506516072980017, + "eval_loss": 0.7611762285232544, + "eval_runtime": 135.5972, + "eval_samples_per_second": 223.847, + "eval_steps_per_second": 27.987, + "step": 1512 + }, + { + "epoch": 0.7511480700012412, + "grad_norm": 0.1379636114182231, + "learning_rate": 4.586110025531312e-06, + "loss": 0.7179, + "step": 1513 + }, + { + "epoch": 0.7516445327044806, + "grad_norm": 0.13856904591246538, + "learning_rate": 4.585571255857408e-06, + "loss": 0.7782, + "step": 1514 + }, + { + "epoch": 0.75214099540772, + "grad_norm": 0.1338118052517091, + "learning_rate": 4.585032167437375e-06, + "loss": 0.7429, + "step": 1515 + }, + { + "epoch": 0.7526374581109594, + "grad_norm": 0.12995673319277454, + "learning_rate": 4.584492760353607e-06, + "loss": 0.7371, + "step": 1516 + }, + { + "epoch": 0.7531339208141988, + "grad_norm": 0.13175985923680122, + "learning_rate": 4.583953034688544e-06, + "loss": 0.7269, + "step": 1517 + }, + { + "epoch": 0.7536303835174383, + "grad_norm": 0.13337482313493468, + "learning_rate": 4.5834129905246725e-06, + "loss": 0.7407, + "step": 1518 + }, + { + "epoch": 0.7541268462206777, + "grad_norm": 0.13450828654016309, + "learning_rate": 4.582872627944531e-06, + "loss": 0.7411, + "step": 1519 + }, + { + "epoch": 0.754623308923917, + "grad_norm": 0.13585453398292363, + "learning_rate": 4.582331947030704e-06, + "loss": 0.7709, + "step": 1520 + }, + { + "epoch": 0.7551197716271565, + "grad_norm": 0.16174753448144683, + "learning_rate": 4.581790947865827e-06, + "loss": 0.7486, + "step": 1521 + }, + { + "epoch": 0.7556162343303959, + "grad_norm": 0.13477064890424165, + "learning_rate": 4.581249630532582e-06, + "loss": 0.7218, + "step": 1522 + }, + { + "epoch": 0.7561126970336354, + "grad_norm": 0.14221417784622448, + "learning_rate": 4.580707995113703e-06, + "loss": 0.7464, + "step": 1523 + }, + { + "epoch": 0.7566091597368748, + "grad_norm": 0.1471611979618392, + "learning_rate": 4.580166041691966e-06, + "loss": 0.7741, + "step": 1524 + }, + { + "epoch": 0.7571056224401141, + "grad_norm": 0.13565970403363753, + "learning_rate": 4.579623770350205e-06, + "loss": 0.7506, + "step": 1525 + }, + { + "epoch": 0.7576020851433536, + "grad_norm": 0.14016327819488653, + "learning_rate": 4.579081181171292e-06, + "loss": 0.7694, + "step": 1526 + }, + { + "epoch": 0.758098547846593, + "grad_norm": 0.13870513168528667, + "learning_rate": 4.5785382742381586e-06, + "loss": 0.7733, + "step": 1527 + }, + { + "epoch": 0.7585950105498325, + "grad_norm": 0.14061847589896362, + "learning_rate": 4.577995049633776e-06, + "loss": 0.708, + "step": 1528 + }, + { + "epoch": 0.7590914732530719, + "grad_norm": 0.12927094131164107, + "learning_rate": 4.577451507441167e-06, + "loss": 0.7336, + "step": 1529 + }, + { + "epoch": 0.7595879359563112, + "grad_norm": 0.13010758376806392, + "learning_rate": 4.576907647743406e-06, + "loss": 0.7284, + "step": 1530 + }, + { + "epoch": 0.7600843986595507, + "grad_norm": 0.13474374832851801, + "learning_rate": 4.576363470623612e-06, + "loss": 0.7211, + "step": 1531 + }, + { + "epoch": 0.7605808613627901, + "grad_norm": 0.13588121835212055, + "learning_rate": 4.575818976164952e-06, + "loss": 0.7767, + "step": 1532 + }, + { + "epoch": 0.7610773240660296, + "grad_norm": 0.13581473484517131, + "learning_rate": 4.575274164450645e-06, + "loss": 0.7967, + "step": 1533 + }, + { + "epoch": 0.761573786769269, + "grad_norm": 0.1319619163329429, + "learning_rate": 4.574729035563957e-06, + "loss": 0.7489, + "step": 1534 + }, + { + "epoch": 0.7620702494725083, + "grad_norm": 0.13070368454070141, + "learning_rate": 4.574183589588202e-06, + "loss": 0.7371, + "step": 1535 + }, + { + "epoch": 0.7625667121757478, + "grad_norm": 0.14405906001792207, + "learning_rate": 4.573637826606742e-06, + "loss": 0.8019, + "step": 1536 + }, + { + "epoch": 0.7630631748789872, + "grad_norm": 0.13830904744938607, + "learning_rate": 4.573091746702988e-06, + "loss": 0.8099, + "step": 1537 + }, + { + "epoch": 0.7635596375822267, + "grad_norm": 0.1308369875528322, + "learning_rate": 4.572545349960401e-06, + "loss": 0.7099, + "step": 1538 + }, + { + "epoch": 0.764056100285466, + "grad_norm": 0.13746899245422908, + "learning_rate": 4.571998636462487e-06, + "loss": 0.7422, + "step": 1539 + }, + { + "epoch": 0.7645525629887054, + "grad_norm": 0.13435052506307477, + "learning_rate": 4.571451606292803e-06, + "loss": 0.7412, + "step": 1540 + }, + { + "epoch": 0.7650490256919449, + "grad_norm": 0.14575768487845997, + "learning_rate": 4.570904259534955e-06, + "loss": 0.799, + "step": 1541 + }, + { + "epoch": 0.7655454883951843, + "grad_norm": 0.13296485164519237, + "learning_rate": 4.570356596272596e-06, + "loss": 0.7189, + "step": 1542 + }, + { + "epoch": 0.7660419510984238, + "grad_norm": 0.13484403439852938, + "learning_rate": 4.569808616589426e-06, + "loss": 0.7768, + "step": 1543 + }, + { + "epoch": 0.7665384138016631, + "grad_norm": 0.1332711897109076, + "learning_rate": 4.569260320569196e-06, + "loss": 0.756, + "step": 1544 + }, + { + "epoch": 0.7670348765049025, + "grad_norm": 0.1419358962518124, + "learning_rate": 4.568711708295704e-06, + "loss": 0.7668, + "step": 1545 + }, + { + "epoch": 0.767531339208142, + "grad_norm": 0.13596711085205604, + "learning_rate": 4.5681627798527965e-06, + "loss": 0.7775, + "step": 1546 + }, + { + "epoch": 0.7680278019113814, + "grad_norm": 0.13788837250657413, + "learning_rate": 4.5676135353243685e-06, + "loss": 0.724, + "step": 1547 + }, + { + "epoch": 0.7685242646146209, + "grad_norm": 0.13462158663256213, + "learning_rate": 4.567063974794363e-06, + "loss": 0.7606, + "step": 1548 + }, + { + "epoch": 0.7690207273178602, + "grad_norm": 0.14640713050413612, + "learning_rate": 4.566514098346774e-06, + "loss": 0.75, + "step": 1549 + }, + { + "epoch": 0.7695171900210996, + "grad_norm": 0.14008125739494734, + "learning_rate": 4.565963906065637e-06, + "loss": 0.7769, + "step": 1550 + }, + { + "epoch": 0.7700136527243391, + "grad_norm": 0.13457373398967837, + "learning_rate": 4.565413398035043e-06, + "loss": 0.7223, + "step": 1551 + }, + { + "epoch": 0.7705101154275785, + "grad_norm": 0.12909038799878353, + "learning_rate": 4.564862574339126e-06, + "loss": 0.7108, + "step": 1552 + }, + { + "epoch": 0.771006578130818, + "grad_norm": 0.15132140494001553, + "learning_rate": 4.564311435062074e-06, + "loss": 0.7578, + "step": 1553 + }, + { + "epoch": 0.7715030408340573, + "grad_norm": 0.13338663206299184, + "learning_rate": 4.563759980288117e-06, + "loss": 0.7337, + "step": 1554 + }, + { + "epoch": 0.7719995035372967, + "grad_norm": 0.13671337018894522, + "learning_rate": 4.563208210101536e-06, + "loss": 0.7435, + "step": 1555 + }, + { + "epoch": 0.7724959662405362, + "grad_norm": 0.1339512782588895, + "learning_rate": 4.562656124586663e-06, + "loss": 0.734, + "step": 1556 + }, + { + "epoch": 0.7729924289437756, + "grad_norm": 0.13189544572945966, + "learning_rate": 4.562103723827872e-06, + "loss": 0.7639, + "step": 1557 + }, + { + "epoch": 0.7734888916470151, + "grad_norm": 0.13412150276620746, + "learning_rate": 4.561551007909592e-06, + "loss": 0.7409, + "step": 1558 + }, + { + "epoch": 0.7739853543502544, + "grad_norm": 0.13352894722705605, + "learning_rate": 4.560997976916293e-06, + "loss": 0.7254, + "step": 1559 + }, + { + "epoch": 0.7744818170534938, + "grad_norm": 0.13385714130707613, + "learning_rate": 4.560444630932499e-06, + "loss": 0.7355, + "step": 1560 + }, + { + "epoch": 0.7749782797567333, + "grad_norm": 0.14383353757373613, + "learning_rate": 4.5598909700427805e-06, + "loss": 0.7632, + "step": 1561 + }, + { + "epoch": 0.7754747424599727, + "grad_norm": 0.13057620170286605, + "learning_rate": 4.559336994331755e-06, + "loss": 0.74, + "step": 1562 + }, + { + "epoch": 0.7759712051632122, + "grad_norm": 0.22403616831916165, + "learning_rate": 4.558782703884089e-06, + "loss": 0.7815, + "step": 1563 + }, + { + "epoch": 0.7764676678664515, + "grad_norm": 0.13988728351688395, + "learning_rate": 4.558228098784496e-06, + "loss": 0.8111, + "step": 1564 + }, + { + "epoch": 0.7769641305696909, + "grad_norm": 0.12851052315735476, + "learning_rate": 4.55767317911774e-06, + "loss": 0.7076, + "step": 1565 + }, + { + "epoch": 0.7774605932729304, + "grad_norm": 0.13445147845150343, + "learning_rate": 4.557117944968631e-06, + "loss": 0.7776, + "step": 1566 + }, + { + "epoch": 0.7779570559761698, + "grad_norm": 0.13626735223788075, + "learning_rate": 4.556562396422027e-06, + "loss": 0.7418, + "step": 1567 + }, + { + "epoch": 0.7784535186794093, + "grad_norm": 0.13902885288869005, + "learning_rate": 4.5560065335628356e-06, + "loss": 0.7585, + "step": 1568 + }, + { + "epoch": 0.7789499813826486, + "grad_norm": 0.1347101759097276, + "learning_rate": 4.55545035647601e-06, + "loss": 0.7497, + "step": 1569 + }, + { + "epoch": 0.779446444085888, + "grad_norm": 0.14453358307933684, + "learning_rate": 4.5548938652465555e-06, + "loss": 0.7764, + "step": 1570 + }, + { + "epoch": 0.7799429067891275, + "grad_norm": 0.13507717127592933, + "learning_rate": 4.55433705995952e-06, + "loss": 0.7964, + "step": 1571 + }, + { + "epoch": 0.7804393694923669, + "grad_norm": 0.13356730248477877, + "learning_rate": 4.553779940700005e-06, + "loss": 0.7519, + "step": 1572 + }, + { + "epoch": 0.7809358321956062, + "grad_norm": 0.14113785981123475, + "learning_rate": 4.553222507553155e-06, + "loss": 0.6896, + "step": 1573 + }, + { + "epoch": 0.7814322948988457, + "grad_norm": 0.13798920017452704, + "learning_rate": 4.552664760604167e-06, + "loss": 0.753, + "step": 1574 + }, + { + "epoch": 0.7819287576020851, + "grad_norm": 0.13634927023321175, + "learning_rate": 4.552106699938281e-06, + "loss": 0.7779, + "step": 1575 + }, + { + "epoch": 0.7824252203053246, + "grad_norm": 0.13862438111784472, + "learning_rate": 4.551548325640789e-06, + "loss": 0.7653, + "step": 1576 + }, + { + "epoch": 0.782921683008564, + "grad_norm": 0.13229593912177584, + "learning_rate": 4.550989637797031e-06, + "loss": 0.7289, + "step": 1577 + }, + { + "epoch": 0.7834181457118033, + "grad_norm": 0.13125159194957922, + "learning_rate": 4.55043063649239e-06, + "loss": 0.7287, + "step": 1578 + }, + { + "epoch": 0.7839146084150428, + "grad_norm": 0.1417064775776418, + "learning_rate": 4.549871321812304e-06, + "loss": 0.7426, + "step": 1579 + }, + { + "epoch": 0.7844110711182822, + "grad_norm": 0.16883124210144626, + "learning_rate": 4.549311693842252e-06, + "loss": 0.8431, + "step": 1580 + }, + { + "epoch": 0.7849075338215217, + "grad_norm": 0.13414566338046124, + "learning_rate": 4.548751752667767e-06, + "loss": 0.8359, + "step": 1581 + }, + { + "epoch": 0.7854039965247611, + "grad_norm": 0.13725844757239852, + "learning_rate": 4.548191498374425e-06, + "loss": 0.755, + "step": 1582 + }, + { + "epoch": 0.7859004592280004, + "grad_norm": 0.1361745571866967, + "learning_rate": 4.547630931047853e-06, + "loss": 0.7442, + "step": 1583 + }, + { + "epoch": 0.7863969219312399, + "grad_norm": 0.13750113879876855, + "learning_rate": 4.547070050773725e-06, + "loss": 0.7196, + "step": 1584 + }, + { + "epoch": 0.7868933846344793, + "grad_norm": 0.13909283760787944, + "learning_rate": 4.5465088576377614e-06, + "loss": 0.77, + "step": 1585 + }, + { + "epoch": 0.7873898473377188, + "grad_norm": 0.13324992099967756, + "learning_rate": 4.545947351725732e-06, + "loss": 0.7228, + "step": 1586 + }, + { + "epoch": 0.7878863100409582, + "grad_norm": 0.13413613523960244, + "learning_rate": 4.5453855331234555e-06, + "loss": 0.7636, + "step": 1587 + }, + { + "epoch": 0.7883827727441975, + "grad_norm": 0.13214222144352114, + "learning_rate": 4.544823401916794e-06, + "loss": 0.7119, + "step": 1588 + }, + { + "epoch": 0.788879235447437, + "grad_norm": 0.1347073970533374, + "learning_rate": 4.544260958191663e-06, + "loss": 0.7696, + "step": 1589 + }, + { + "epoch": 0.7893756981506764, + "grad_norm": 0.1322596137845652, + "learning_rate": 4.543698202034021e-06, + "loss": 0.7173, + "step": 1590 + }, + { + "epoch": 0.7898721608539159, + "grad_norm": 0.13812558700775102, + "learning_rate": 4.543135133529878e-06, + "loss": 0.7625, + "step": 1591 + }, + { + "epoch": 0.7903686235571553, + "grad_norm": 0.13570198883227674, + "learning_rate": 4.542571752765288e-06, + "loss": 0.8061, + "step": 1592 + }, + { + "epoch": 0.7908650862603946, + "grad_norm": 0.1502291763200914, + "learning_rate": 4.542008059826356e-06, + "loss": 0.7479, + "step": 1593 + }, + { + "epoch": 0.7913615489636341, + "grad_norm": 0.14188040098609095, + "learning_rate": 4.5414440547992325e-06, + "loss": 0.7453, + "step": 1594 + }, + { + "epoch": 0.7918580116668735, + "grad_norm": 0.12979186868674533, + "learning_rate": 4.540879737770118e-06, + "loss": 0.7411, + "step": 1595 + }, + { + "epoch": 0.792354474370113, + "grad_norm": 0.13210637876846007, + "learning_rate": 4.540315108825258e-06, + "loss": 0.7322, + "step": 1596 + }, + { + "epoch": 0.7928509370733524, + "grad_norm": 0.13744920964363747, + "learning_rate": 4.539750168050949e-06, + "loss": 0.7314, + "step": 1597 + }, + { + "epoch": 0.7933473997765917, + "grad_norm": 0.13854762530674888, + "learning_rate": 4.539184915533531e-06, + "loss": 0.7586, + "step": 1598 + }, + { + "epoch": 0.7938438624798312, + "grad_norm": 0.14216681342036058, + "learning_rate": 4.538619351359393e-06, + "loss": 0.7504, + "step": 1599 + }, + { + "epoch": 0.7943403251830706, + "grad_norm": 0.13158698944804878, + "learning_rate": 4.538053475614976e-06, + "loss": 0.7344, + "step": 1600 + }, + { + "epoch": 0.7948367878863101, + "grad_norm": 0.133052297447574, + "learning_rate": 4.537487288386763e-06, + "loss": 0.7265, + "step": 1601 + }, + { + "epoch": 0.7953332505895495, + "grad_norm": 0.14026215243529064, + "learning_rate": 4.536920789761286e-06, + "loss": 0.7422, + "step": 1602 + }, + { + "epoch": 0.7958297132927888, + "grad_norm": 0.13205252140563034, + "learning_rate": 4.536353979825125e-06, + "loss": 0.7223, + "step": 1603 + }, + { + "epoch": 0.7963261759960283, + "grad_norm": 0.12972258505388326, + "learning_rate": 4.535786858664909e-06, + "loss": 0.7183, + "step": 1604 + }, + { + "epoch": 0.7968226386992677, + "grad_norm": 0.14485812514741278, + "learning_rate": 4.5352194263673135e-06, + "loss": 0.7563, + "step": 1605 + }, + { + "epoch": 0.7973191014025072, + "grad_norm": 0.14507460323696492, + "learning_rate": 4.534651683019061e-06, + "loss": 0.7467, + "step": 1606 + }, + { + "epoch": 0.7978155641057466, + "grad_norm": 0.13781694651061116, + "learning_rate": 4.534083628706921e-06, + "loss": 0.7728, + "step": 1607 + }, + { + "epoch": 0.7983120268089859, + "grad_norm": 0.13742843647821426, + "learning_rate": 4.533515263517713e-06, + "loss": 0.7739, + "step": 1608 + }, + { + "epoch": 0.7988084895122254, + "grad_norm": 0.14505009227353202, + "learning_rate": 4.532946587538302e-06, + "loss": 0.7943, + "step": 1609 + }, + { + "epoch": 0.7993049522154648, + "grad_norm": 0.13478499327557095, + "learning_rate": 4.532377600855601e-06, + "loss": 0.744, + "step": 1610 + }, + { + "epoch": 0.7998014149187043, + "grad_norm": 0.13368137981304712, + "learning_rate": 4.53180830355657e-06, + "loss": 0.7207, + "step": 1611 + }, + { + "epoch": 0.8002978776219436, + "grad_norm": 0.1367615628975349, + "learning_rate": 4.531238695728218e-06, + "loss": 0.7439, + "step": 1612 + }, + { + "epoch": 0.800794340325183, + "grad_norm": 0.1414644058349886, + "learning_rate": 4.5306687774576e-06, + "loss": 0.8015, + "step": 1613 + }, + { + "epoch": 0.8012908030284225, + "grad_norm": 0.1367196701213475, + "learning_rate": 4.530098548831817e-06, + "loss": 0.7596, + "step": 1614 + }, + { + "epoch": 0.8017872657316619, + "grad_norm": 0.13522642664685247, + "learning_rate": 4.529528009938022e-06, + "loss": 0.7498, + "step": 1615 + }, + { + "epoch": 0.8022837284349014, + "grad_norm": 0.13889758013185657, + "learning_rate": 4.528957160863412e-06, + "loss": 0.7589, + "step": 1616 + }, + { + "epoch": 0.8027801911381407, + "grad_norm": 0.13673089456510074, + "learning_rate": 4.528386001695232e-06, + "loss": 0.7202, + "step": 1617 + }, + { + "epoch": 0.8032766538413801, + "grad_norm": 0.13642986188283496, + "learning_rate": 4.5278145325207735e-06, + "loss": 0.7115, + "step": 1618 + }, + { + "epoch": 0.8037731165446196, + "grad_norm": 0.13872017776841988, + "learning_rate": 4.527242753427378e-06, + "loss": 0.7473, + "step": 1619 + }, + { + "epoch": 0.804269579247859, + "grad_norm": 0.13515176865610115, + "learning_rate": 4.526670664502432e-06, + "loss": 0.7328, + "step": 1620 + }, + { + "epoch": 0.8047660419510985, + "grad_norm": 0.13643944196282173, + "learning_rate": 4.52609826583337e-06, + "loss": 0.7182, + "step": 1621 + }, + { + "epoch": 0.8052625046543378, + "grad_norm": 0.13670018877356768, + "learning_rate": 4.525525557507673e-06, + "loss": 0.7556, + "step": 1622 + }, + { + "epoch": 0.8057589673575772, + "grad_norm": 0.13504339939566765, + "learning_rate": 4.524952539612872e-06, + "loss": 0.7584, + "step": 1623 + }, + { + "epoch": 0.8062554300608167, + "grad_norm": 0.13368086807436755, + "learning_rate": 4.524379212236544e-06, + "loss": 0.7667, + "step": 1624 + }, + { + "epoch": 0.8067518927640561, + "grad_norm": 0.14428096858814662, + "learning_rate": 4.5238055754663105e-06, + "loss": 0.8251, + "step": 1625 + }, + { + "epoch": 0.8072483554672956, + "grad_norm": 0.14219994124125016, + "learning_rate": 4.523231629389845e-06, + "loss": 0.739, + "step": 1626 + }, + { + "epoch": 0.8077448181705349, + "grad_norm": 0.1326897765805104, + "learning_rate": 4.522657374094864e-06, + "loss": 0.7558, + "step": 1627 + }, + { + "epoch": 0.8082412808737743, + "grad_norm": 0.13799541492346612, + "learning_rate": 4.522082809669135e-06, + "loss": 0.7563, + "step": 1628 + }, + { + "epoch": 0.8087377435770138, + "grad_norm": 0.13141846846705796, + "learning_rate": 4.52150793620047e-06, + "loss": 0.6994, + "step": 1629 + }, + { + "epoch": 0.8092342062802532, + "grad_norm": 0.14133853029523705, + "learning_rate": 4.520932753776729e-06, + "loss": 0.7406, + "step": 1630 + }, + { + "epoch": 0.8097306689834927, + "grad_norm": 0.14262392679819444, + "learning_rate": 4.52035726248582e-06, + "loss": 0.7419, + "step": 1631 + }, + { + "epoch": 0.810227131686732, + "grad_norm": 0.13946667026700096, + "learning_rate": 4.519781462415698e-06, + "loss": 0.7623, + "step": 1632 + }, + { + "epoch": 0.8107235943899714, + "grad_norm": 0.14076729287492634, + "learning_rate": 4.5192053536543636e-06, + "loss": 0.7801, + "step": 1633 + }, + { + "epoch": 0.8112200570932109, + "grad_norm": 0.132725847901949, + "learning_rate": 4.518628936289867e-06, + "loss": 0.7443, + "step": 1634 + }, + { + "epoch": 0.8117165197964503, + "grad_norm": 0.13691408254882964, + "learning_rate": 4.518052210410303e-06, + "loss": 0.7553, + "step": 1635 + }, + { + "epoch": 0.8122129824996898, + "grad_norm": 0.13655790072644422, + "learning_rate": 4.517475176103816e-06, + "loss": 0.7799, + "step": 1636 + }, + { + "epoch": 0.8127094452029291, + "grad_norm": 0.13537469163429455, + "learning_rate": 4.5168978334585955e-06, + "loss": 0.7422, + "step": 1637 + }, + { + "epoch": 0.8132059079061685, + "grad_norm": 0.13341240046444236, + "learning_rate": 4.5163201825628805e-06, + "loss": 0.7611, + "step": 1638 + }, + { + "epoch": 0.813702370609408, + "grad_norm": 0.1330300275048859, + "learning_rate": 4.515742223504954e-06, + "loss": 0.7526, + "step": 1639 + }, + { + "epoch": 0.8141988333126474, + "grad_norm": 0.1289270904071567, + "learning_rate": 4.51516395637315e-06, + "loss": 0.6907, + "step": 1640 + }, + { + "epoch": 0.8146952960158869, + "grad_norm": 0.13653838617553693, + "learning_rate": 4.514585381255845e-06, + "loss": 0.7272, + "step": 1641 + }, + { + "epoch": 0.8151917587191262, + "grad_norm": 0.14220864005729456, + "learning_rate": 4.514006498241465e-06, + "loss": 0.7533, + "step": 1642 + }, + { + "epoch": 0.8156882214223656, + "grad_norm": 0.1334721980787763, + "learning_rate": 4.513427307418485e-06, + "loss": 0.7727, + "step": 1643 + }, + { + "epoch": 0.8161846841256051, + "grad_norm": 0.13759063653846024, + "learning_rate": 4.512847808875424e-06, + "loss": 0.7862, + "step": 1644 + }, + { + "epoch": 0.8166811468288445, + "grad_norm": 0.13689081776996725, + "learning_rate": 4.512268002700848e-06, + "loss": 0.7217, + "step": 1645 + }, + { + "epoch": 0.8171776095320838, + "grad_norm": 0.1447472368767314, + "learning_rate": 4.5116878889833735e-06, + "loss": 0.7671, + "step": 1646 + }, + { + "epoch": 0.8176740722353233, + "grad_norm": 0.13925674131676116, + "learning_rate": 4.511107467811659e-06, + "loss": 0.7461, + "step": 1647 + }, + { + "epoch": 0.8181705349385627, + "grad_norm": 0.12956181530658847, + "learning_rate": 4.510526739274415e-06, + "loss": 0.7147, + "step": 1648 + }, + { + "epoch": 0.8186669976418022, + "grad_norm": 0.14156922012695827, + "learning_rate": 4.509945703460394e-06, + "loss": 0.7724, + "step": 1649 + }, + { + "epoch": 0.8191634603450416, + "grad_norm": 0.1337498799210342, + "learning_rate": 4.509364360458399e-06, + "loss": 0.7494, + "step": 1650 + }, + { + "epoch": 0.819659923048281, + "grad_norm": 0.14328558587229834, + "learning_rate": 4.50878271035728e-06, + "loss": 0.7505, + "step": 1651 + }, + { + "epoch": 0.8201563857515204, + "grad_norm": 0.13902637442350718, + "learning_rate": 4.508200753245932e-06, + "loss": 0.7722, + "step": 1652 + }, + { + "epoch": 0.8206528484547598, + "grad_norm": 0.13220848269342542, + "learning_rate": 4.507618489213298e-06, + "loss": 0.7764, + "step": 1653 + }, + { + "epoch": 0.8211493111579993, + "grad_norm": 0.1327753210553815, + "learning_rate": 4.507035918348367e-06, + "loss": 0.7135, + "step": 1654 + }, + { + "epoch": 0.8216457738612387, + "grad_norm": 0.1406321284403388, + "learning_rate": 4.506453040740177e-06, + "loss": 0.7467, + "step": 1655 + }, + { + "epoch": 0.822142236564478, + "grad_norm": 0.13632541689225586, + "learning_rate": 4.505869856477811e-06, + "loss": 0.749, + "step": 1656 + }, + { + "epoch": 0.8226386992677175, + "grad_norm": 0.14246540537344085, + "learning_rate": 4.505286365650398e-06, + "loss": 0.7677, + "step": 1657 + }, + { + "epoch": 0.8231351619709569, + "grad_norm": 0.13667688867874997, + "learning_rate": 4.504702568347117e-06, + "loss": 0.7307, + "step": 1658 + }, + { + "epoch": 0.8236316246741964, + "grad_norm": 0.1431423731753008, + "learning_rate": 4.5041184646571915e-06, + "loss": 0.7707, + "step": 1659 + }, + { + "epoch": 0.8241280873774358, + "grad_norm": 0.14005994273435252, + "learning_rate": 4.5035340546698915e-06, + "loss": 0.7612, + "step": 1660 + }, + { + "epoch": 0.8246245500806751, + "grad_norm": 0.13515045815804208, + "learning_rate": 4.502949338474536e-06, + "loss": 0.7746, + "step": 1661 + }, + { + "epoch": 0.8251210127839146, + "grad_norm": 0.13147659327086783, + "learning_rate": 4.50236431616049e-06, + "loss": 0.7014, + "step": 1662 + }, + { + "epoch": 0.825617475487154, + "grad_norm": 0.13900795898461954, + "learning_rate": 4.501778987817162e-06, + "loss": 0.7509, + "step": 1663 + }, + { + "epoch": 0.8261139381903935, + "grad_norm": 0.139057514709019, + "learning_rate": 4.501193353534013e-06, + "loss": 0.7598, + "step": 1664 + }, + { + "epoch": 0.8266104008936329, + "grad_norm": 0.1431266182489607, + "learning_rate": 4.500607413400546e-06, + "loss": 0.7914, + "step": 1665 + }, + { + "epoch": 0.8271068635968722, + "grad_norm": 0.1334091878020073, + "learning_rate": 4.5000211675063134e-06, + "loss": 0.7663, + "step": 1666 + }, + { + "epoch": 0.8276033263001117, + "grad_norm": 0.13230619594672582, + "learning_rate": 4.499434615940913e-06, + "loss": 0.751, + "step": 1667 + }, + { + "epoch": 0.8280997890033511, + "grad_norm": 0.14337035164539053, + "learning_rate": 4.498847758793991e-06, + "loss": 0.7694, + "step": 1668 + }, + { + "epoch": 0.8285962517065906, + "grad_norm": 0.13666058128401123, + "learning_rate": 4.498260596155237e-06, + "loss": 0.7791, + "step": 1669 + }, + { + "epoch": 0.82909271440983, + "grad_norm": 0.14465851413099565, + "learning_rate": 4.497673128114391e-06, + "loss": 0.7803, + "step": 1670 + }, + { + "epoch": 0.8295891771130693, + "grad_norm": 0.14652649532294804, + "learning_rate": 4.497085354761237e-06, + "loss": 0.7799, + "step": 1671 + }, + { + "epoch": 0.8300856398163088, + "grad_norm": 0.13131787380504173, + "learning_rate": 4.4964972761856086e-06, + "loss": 0.7452, + "step": 1672 + }, + { + "epoch": 0.8305821025195482, + "grad_norm": 0.14145954525931556, + "learning_rate": 4.495908892477382e-06, + "loss": 0.7794, + "step": 1673 + }, + { + "epoch": 0.8310785652227877, + "grad_norm": 0.1384021322282935, + "learning_rate": 4.495320203726483e-06, + "loss": 0.7108, + "step": 1674 + }, + { + "epoch": 0.8315750279260271, + "grad_norm": 0.1415771904329555, + "learning_rate": 4.494731210022884e-06, + "loss": 0.7311, + "step": 1675 + }, + { + "epoch": 0.8320714906292664, + "grad_norm": 0.1438025629578128, + "learning_rate": 4.494141911456602e-06, + "loss": 0.7992, + "step": 1676 + }, + { + "epoch": 0.8325679533325059, + "grad_norm": 0.1414714443428166, + "learning_rate": 4.4935523081177035e-06, + "loss": 0.7469, + "step": 1677 + }, + { + "epoch": 0.8330644160357453, + "grad_norm": 0.14271477816026024, + "learning_rate": 4.492962400096298e-06, + "loss": 0.8226, + "step": 1678 + }, + { + "epoch": 0.8335608787389848, + "grad_norm": 0.1443375820025856, + "learning_rate": 4.492372187482545e-06, + "loss": 0.8516, + "step": 1679 + }, + { + "epoch": 0.8340573414422241, + "grad_norm": 0.13815933437015357, + "learning_rate": 4.491781670366648e-06, + "loss": 0.8077, + "step": 1680 + }, + { + "epoch": 0.8345538041454635, + "grad_norm": 0.1388616626139753, + "learning_rate": 4.491190848838858e-06, + "loss": 0.7864, + "step": 1681 + }, + { + "epoch": 0.835050266848703, + "grad_norm": 0.13570857271523626, + "learning_rate": 4.490599722989474e-06, + "loss": 0.7412, + "step": 1682 + }, + { + "epoch": 0.8355467295519424, + "grad_norm": 0.1438978989274344, + "learning_rate": 4.490008292908839e-06, + "loss": 0.7208, + "step": 1683 + }, + { + "epoch": 0.8360431922551819, + "grad_norm": 0.143547018622754, + "learning_rate": 4.4894165586873426e-06, + "loss": 0.7816, + "step": 1684 + }, + { + "epoch": 0.8365396549584212, + "grad_norm": 0.13635442176917034, + "learning_rate": 4.488824520415425e-06, + "loss": 0.7406, + "step": 1685 + }, + { + "epoch": 0.8370361176616606, + "grad_norm": 0.14562544936887342, + "learning_rate": 4.4882321781835666e-06, + "loss": 0.7833, + "step": 1686 + }, + { + "epoch": 0.8375325803649001, + "grad_norm": 0.14484628055837453, + "learning_rate": 4.4876395320822984e-06, + "loss": 0.7634, + "step": 1687 + }, + { + "epoch": 0.8380290430681395, + "grad_norm": 0.13816444882771123, + "learning_rate": 4.487046582202198e-06, + "loss": 0.7416, + "step": 1688 + }, + { + "epoch": 0.838525505771379, + "grad_norm": 0.14266194907778323, + "learning_rate": 4.486453328633887e-06, + "loss": 0.7377, + "step": 1689 + }, + { + "epoch": 0.8390219684746183, + "grad_norm": 0.14297267750408402, + "learning_rate": 4.485859771468035e-06, + "loss": 0.8454, + "step": 1690 + }, + { + "epoch": 0.8395184311778577, + "grad_norm": 0.1403189883914047, + "learning_rate": 4.4852659107953574e-06, + "loss": 0.7529, + "step": 1691 + }, + { + "epoch": 0.8400148938810972, + "grad_norm": 0.13495181072938314, + "learning_rate": 4.484671746706617e-06, + "loss": 0.7426, + "step": 1692 + }, + { + "epoch": 0.8405113565843366, + "grad_norm": 0.13845259745230812, + "learning_rate": 4.484077279292622e-06, + "loss": 0.768, + "step": 1693 + }, + { + "epoch": 0.8410078192875761, + "grad_norm": 0.1386205743503297, + "learning_rate": 4.483482508644228e-06, + "loss": 0.7478, + "step": 1694 + }, + { + "epoch": 0.8415042819908154, + "grad_norm": 0.14075629580146526, + "learning_rate": 4.482887434852334e-06, + "loss": 0.7795, + "step": 1695 + }, + { + "epoch": 0.8420007446940548, + "grad_norm": 0.13730753156141765, + "learning_rate": 4.4822920580078885e-06, + "loss": 0.7747, + "step": 1696 + }, + { + "epoch": 0.8424972073972943, + "grad_norm": 0.13372905034092983, + "learning_rate": 4.481696378201887e-06, + "loss": 0.7619, + "step": 1697 + }, + { + "epoch": 0.8429936701005337, + "grad_norm": 0.13367135782867837, + "learning_rate": 4.481100395525367e-06, + "loss": 0.7271, + "step": 1698 + }, + { + "epoch": 0.8434901328037732, + "grad_norm": 0.15581577968511073, + "learning_rate": 4.4805041100694165e-06, + "loss": 0.7648, + "step": 1699 + }, + { + "epoch": 0.8439865955070125, + "grad_norm": 0.1401851479710573, + "learning_rate": 4.479907521925168e-06, + "loss": 0.7723, + "step": 1700 + }, + { + "epoch": 0.8444830582102519, + "grad_norm": 0.13725236828476695, + "learning_rate": 4.4793106311838e-06, + "loss": 0.7257, + "step": 1701 + }, + { + "epoch": 0.8449795209134914, + "grad_norm": 0.1356076078010008, + "learning_rate": 4.478713437936538e-06, + "loss": 0.7374, + "step": 1702 + }, + { + "epoch": 0.8454759836167308, + "grad_norm": 0.13425339120243374, + "learning_rate": 4.478115942274652e-06, + "loss": 0.7241, + "step": 1703 + }, + { + "epoch": 0.8459724463199703, + "grad_norm": 0.13679058651123055, + "learning_rate": 4.477518144289462e-06, + "loss": 0.766, + "step": 1704 + }, + { + "epoch": 0.8464689090232096, + "grad_norm": 0.13984700650107668, + "learning_rate": 4.476920044072331e-06, + "loss": 0.7733, + "step": 1705 + }, + { + "epoch": 0.846965371726449, + "grad_norm": 0.13407682285557562, + "learning_rate": 4.476321641714669e-06, + "loss": 0.7057, + "step": 1706 + }, + { + "epoch": 0.8474618344296885, + "grad_norm": 0.13417781616310068, + "learning_rate": 4.475722937307931e-06, + "loss": 0.7953, + "step": 1707 + }, + { + "epoch": 0.8479582971329279, + "grad_norm": 0.13220445140764078, + "learning_rate": 4.475123930943621e-06, + "loss": 0.6939, + "step": 1708 + }, + { + "epoch": 0.8484547598361674, + "grad_norm": 0.13757038634337526, + "learning_rate": 4.474524622713286e-06, + "loss": 0.7181, + "step": 1709 + }, + { + "epoch": 0.8489512225394067, + "grad_norm": 0.1405832726304441, + "learning_rate": 4.473925012708522e-06, + "loss": 0.7676, + "step": 1710 + }, + { + "epoch": 0.8494476852426461, + "grad_norm": 0.14268436213062238, + "learning_rate": 4.47332510102097e-06, + "loss": 0.7663, + "step": 1711 + }, + { + "epoch": 0.8499441479458856, + "grad_norm": 0.13994019393605722, + "learning_rate": 4.472724887742316e-06, + "loss": 0.7941, + "step": 1712 + }, + { + "epoch": 0.850440610649125, + "grad_norm": 0.13811450219788193, + "learning_rate": 4.472124372964292e-06, + "loss": 0.734, + "step": 1713 + }, + { + "epoch": 0.8509370733523643, + "grad_norm": 0.1318935825701855, + "learning_rate": 4.471523556778679e-06, + "loss": 0.739, + "step": 1714 + }, + { + "epoch": 0.8514335360556038, + "grad_norm": 0.1419282323412836, + "learning_rate": 4.470922439277301e-06, + "loss": 0.7729, + "step": 1715 + }, + { + "epoch": 0.8519299987588432, + "grad_norm": 0.1393041197530227, + "learning_rate": 4.47032102055203e-06, + "loss": 0.7483, + "step": 1716 + }, + { + "epoch": 0.8524264614620827, + "grad_norm": 0.13626087249042654, + "learning_rate": 4.469719300694783e-06, + "loss": 0.7349, + "step": 1717 + }, + { + "epoch": 0.8529229241653221, + "grad_norm": 0.1464856140158772, + "learning_rate": 4.469117279797522e-06, + "loss": 0.836, + "step": 1718 + }, + { + "epoch": 0.8534193868685614, + "grad_norm": 0.1409940881870901, + "learning_rate": 4.468514957952258e-06, + "loss": 0.7472, + "step": 1719 + }, + { + "epoch": 0.8539158495718009, + "grad_norm": 0.14285146112331779, + "learning_rate": 4.467912335251045e-06, + "loss": 0.8371, + "step": 1720 + }, + { + "epoch": 0.8544123122750403, + "grad_norm": 0.13806293901326697, + "learning_rate": 4.467309411785984e-06, + "loss": 0.8033, + "step": 1721 + }, + { + "epoch": 0.8549087749782798, + "grad_norm": 0.1361187990693462, + "learning_rate": 4.466706187649223e-06, + "loss": 0.7249, + "step": 1722 + }, + { + "epoch": 0.8554052376815192, + "grad_norm": 0.1392046645207596, + "learning_rate": 4.466102662932956e-06, + "loss": 0.7949, + "step": 1723 + }, + { + "epoch": 0.8559017003847585, + "grad_norm": 0.1402331299424571, + "learning_rate": 4.46549883772942e-06, + "loss": 0.8077, + "step": 1724 + }, + { + "epoch": 0.856398163087998, + "grad_norm": 0.1350641709477664, + "learning_rate": 4.464894712130902e-06, + "loss": 0.7314, + "step": 1725 + }, + { + "epoch": 0.8568946257912374, + "grad_norm": 0.13330313530654128, + "learning_rate": 4.464290286229731e-06, + "loss": 0.7255, + "step": 1726 + }, + { + "epoch": 0.8573910884944769, + "grad_norm": 0.14647673202761563, + "learning_rate": 4.463685560118285e-06, + "loss": 0.7336, + "step": 1727 + }, + { + "epoch": 0.8578875511977163, + "grad_norm": 0.13358377643813468, + "learning_rate": 4.463080533888987e-06, + "loss": 0.7289, + "step": 1728 + }, + { + "epoch": 0.8583840139009556, + "grad_norm": 0.16223228182246507, + "learning_rate": 4.4624752076343044e-06, + "loss": 0.8098, + "step": 1729 + }, + { + "epoch": 0.8588804766041951, + "grad_norm": 0.1362449815913255, + "learning_rate": 4.461869581446752e-06, + "loss": 0.7671, + "step": 1730 + }, + { + "epoch": 0.8593769393074345, + "grad_norm": 0.14081968859441232, + "learning_rate": 4.461263655418891e-06, + "loss": 0.6943, + "step": 1731 + }, + { + "epoch": 0.859873402010674, + "grad_norm": 0.13606349832132275, + "learning_rate": 4.460657429643326e-06, + "loss": 0.7334, + "step": 1732 + }, + { + "epoch": 0.8603698647139134, + "grad_norm": 0.1369487135112441, + "learning_rate": 4.460050904212711e-06, + "loss": 0.7225, + "step": 1733 + }, + { + "epoch": 0.8608663274171527, + "grad_norm": 0.14809100220976384, + "learning_rate": 4.45944407921974e-06, + "loss": 0.7686, + "step": 1734 + }, + { + "epoch": 0.8613627901203922, + "grad_norm": 0.13703527982602742, + "learning_rate": 4.458836954757161e-06, + "loss": 0.7545, + "step": 1735 + }, + { + "epoch": 0.8618592528236316, + "grad_norm": 0.1351821399394306, + "learning_rate": 4.4582295309177595e-06, + "loss": 0.7797, + "step": 1736 + }, + { + "epoch": 0.8623557155268711, + "grad_norm": 0.1354673204843339, + "learning_rate": 4.457621807794372e-06, + "loss": 0.8066, + "step": 1737 + }, + { + "epoch": 0.8628521782301105, + "grad_norm": 0.13862706776026357, + "learning_rate": 4.457013785479881e-06, + "loss": 0.7333, + "step": 1738 + }, + { + "epoch": 0.8633486409333498, + "grad_norm": 0.1389346115440188, + "learning_rate": 4.45640546406721e-06, + "loss": 0.764, + "step": 1739 + }, + { + "epoch": 0.8638451036365893, + "grad_norm": 0.15558347243623133, + "learning_rate": 4.455796843649332e-06, + "loss": 0.7593, + "step": 1740 + }, + { + "epoch": 0.8643415663398287, + "grad_norm": 0.14353099958802945, + "learning_rate": 4.455187924319266e-06, + "loss": 0.7507, + "step": 1741 + }, + { + "epoch": 0.8648380290430682, + "grad_norm": 0.14119369358264125, + "learning_rate": 4.454578706170075e-06, + "loss": 0.7599, + "step": 1742 + }, + { + "epoch": 0.8653344917463076, + "grad_norm": 0.13360839981041983, + "learning_rate": 4.453969189294867e-06, + "loss": 0.7387, + "step": 1743 + }, + { + "epoch": 0.8658309544495469, + "grad_norm": 0.14010427578913046, + "learning_rate": 4.453359373786799e-06, + "loss": 0.754, + "step": 1744 + }, + { + "epoch": 0.8663274171527864, + "grad_norm": 0.13347970173643106, + "learning_rate": 4.45274925973907e-06, + "loss": 0.7363, + "step": 1745 + }, + { + "epoch": 0.8668238798560258, + "grad_norm": 0.143214337774515, + "learning_rate": 4.4521388472449265e-06, + "loss": 0.74, + "step": 1746 + }, + { + "epoch": 0.8673203425592653, + "grad_norm": 0.14170555954343675, + "learning_rate": 4.451528136397661e-06, + "loss": 0.74, + "step": 1747 + }, + { + "epoch": 0.8678168052625047, + "grad_norm": 0.13910626879867338, + "learning_rate": 4.450917127290609e-06, + "loss": 0.7129, + "step": 1748 + }, + { + "epoch": 0.868313267965744, + "grad_norm": 0.14451162598319245, + "learning_rate": 4.4503058200171566e-06, + "loss": 0.7683, + "step": 1749 + }, + { + "epoch": 0.8688097306689835, + "grad_norm": 0.13670177426956756, + "learning_rate": 4.449694214670729e-06, + "loss": 0.7196, + "step": 1750 + }, + { + "epoch": 0.8693061933722229, + "grad_norm": 0.1425374537551891, + "learning_rate": 4.449082311344802e-06, + "loss": 0.713, + "step": 1751 + }, + { + "epoch": 0.8698026560754624, + "grad_norm": 0.13684146510608788, + "learning_rate": 4.4484701101328944e-06, + "loss": 0.6968, + "step": 1752 + }, + { + "epoch": 0.8702991187787017, + "grad_norm": 0.13936638023848308, + "learning_rate": 4.447857611128572e-06, + "loss": 0.7731, + "step": 1753 + }, + { + "epoch": 0.8707955814819411, + "grad_norm": 0.14653517321071075, + "learning_rate": 4.447244814425446e-06, + "loss": 0.7236, + "step": 1754 + }, + { + "epoch": 0.8712920441851806, + "grad_norm": 0.14331030940384665, + "learning_rate": 4.446631720117171e-06, + "loss": 0.7306, + "step": 1755 + }, + { + "epoch": 0.87178850688842, + "grad_norm": 0.14673685332839945, + "learning_rate": 4.446018328297449e-06, + "loss": 0.7242, + "step": 1756 + }, + { + "epoch": 0.8722849695916595, + "grad_norm": 0.1347449091302609, + "learning_rate": 4.445404639060028e-06, + "loss": 0.7709, + "step": 1757 + }, + { + "epoch": 0.8727814322948988, + "grad_norm": 0.1411266352695579, + "learning_rate": 4.4447906524987006e-06, + "loss": 0.8022, + "step": 1758 + }, + { + "epoch": 0.8732778949981382, + "grad_norm": 0.14208496504174006, + "learning_rate": 4.444176368707305e-06, + "loss": 0.7066, + "step": 1759 + }, + { + "epoch": 0.8737743577013777, + "grad_norm": 0.13674759170668294, + "learning_rate": 4.443561787779722e-06, + "loss": 0.7553, + "step": 1760 + }, + { + "epoch": 0.8742708204046171, + "grad_norm": 0.14090838321939578, + "learning_rate": 4.442946909809884e-06, + "loss": 0.704, + "step": 1761 + }, + { + "epoch": 0.8747672831078566, + "grad_norm": 0.14668605063528445, + "learning_rate": 4.442331734891763e-06, + "loss": 0.7503, + "step": 1762 + }, + { + "epoch": 0.8752637458110959, + "grad_norm": 0.1397458175530683, + "learning_rate": 4.441716263119379e-06, + "loss": 0.7832, + "step": 1763 + }, + { + "epoch": 0.8757602085143353, + "grad_norm": 0.138318109593277, + "learning_rate": 4.441100494586797e-06, + "loss": 0.7535, + "step": 1764 + }, + { + "epoch": 0.8762566712175748, + "grad_norm": 0.14189795702675218, + "learning_rate": 4.4404844293881285e-06, + "loss": 0.7643, + "step": 1765 + }, + { + "epoch": 0.8767531339208142, + "grad_norm": 0.13684604258689606, + "learning_rate": 4.439868067617528e-06, + "loss": 0.7602, + "step": 1766 + }, + { + "epoch": 0.8772495966240537, + "grad_norm": 0.13657716615346063, + "learning_rate": 4.4392514093691965e-06, + "loss": 0.7519, + "step": 1767 + }, + { + "epoch": 0.877746059327293, + "grad_norm": 0.16720007998528186, + "learning_rate": 4.43863445473738e-06, + "loss": 0.7311, + "step": 1768 + }, + { + "epoch": 0.8782425220305324, + "grad_norm": 0.13527051039700383, + "learning_rate": 4.4380172038163716e-06, + "loss": 0.727, + "step": 1769 + }, + { + "epoch": 0.8787389847337719, + "grad_norm": 0.13977003542487368, + "learning_rate": 4.437399656700507e-06, + "loss": 0.7412, + "step": 1770 + }, + { + "epoch": 0.8792354474370113, + "grad_norm": 0.14066430808137664, + "learning_rate": 4.436781813484169e-06, + "loss": 0.7626, + "step": 1771 + }, + { + "epoch": 0.8797319101402508, + "grad_norm": 0.14316718597333672, + "learning_rate": 4.436163674261785e-06, + "loss": 0.783, + "step": 1772 + }, + { + "epoch": 0.8802283728434901, + "grad_norm": 0.14181967035602686, + "learning_rate": 4.435545239127827e-06, + "loss": 0.7452, + "step": 1773 + }, + { + "epoch": 0.8807248355467295, + "grad_norm": 0.13364947209125708, + "learning_rate": 4.434926508176814e-06, + "loss": 0.7446, + "step": 1774 + }, + { + "epoch": 0.881221298249969, + "grad_norm": 0.1351435150199207, + "learning_rate": 4.434307481503307e-06, + "loss": 0.7263, + "step": 1775 + }, + { + "epoch": 0.8817177609532084, + "grad_norm": 0.13242522480531707, + "learning_rate": 4.433688159201917e-06, + "loss": 0.6855, + "step": 1776 + }, + { + "epoch": 0.8822142236564479, + "grad_norm": 0.1392138600504658, + "learning_rate": 4.433068541367296e-06, + "loss": 0.7762, + "step": 1777 + }, + { + "epoch": 0.8827106863596872, + "grad_norm": 0.13595036312344588, + "learning_rate": 4.432448628094142e-06, + "loss": 0.7368, + "step": 1778 + }, + { + "epoch": 0.8832071490629266, + "grad_norm": 0.14043982715211084, + "learning_rate": 4.4318284194772e-06, + "loss": 0.7809, + "step": 1779 + }, + { + "epoch": 0.8837036117661661, + "grad_norm": 0.13928983515854973, + "learning_rate": 4.431207915611259e-06, + "loss": 0.7963, + "step": 1780 + }, + { + "epoch": 0.8842000744694055, + "grad_norm": 0.15323129087107526, + "learning_rate": 4.430587116591152e-06, + "loss": 0.7638, + "step": 1781 + }, + { + "epoch": 0.884696537172645, + "grad_norm": 0.13672127380953056, + "learning_rate": 4.429966022511759e-06, + "loss": 0.7521, + "step": 1782 + }, + { + "epoch": 0.8851929998758843, + "grad_norm": 0.13675982862960748, + "learning_rate": 4.429344633468005e-06, + "loss": 0.735, + "step": 1783 + }, + { + "epoch": 0.8856894625791237, + "grad_norm": 0.14468317226179433, + "learning_rate": 4.428722949554858e-06, + "loss": 0.7728, + "step": 1784 + }, + { + "epoch": 0.8861859252823632, + "grad_norm": 0.1392240635874152, + "learning_rate": 4.428100970867332e-06, + "loss": 0.7631, + "step": 1785 + }, + { + "epoch": 0.8866823879856026, + "grad_norm": 0.13905551197842414, + "learning_rate": 4.4274786975004886e-06, + "loss": 0.7664, + "step": 1786 + }, + { + "epoch": 0.887178850688842, + "grad_norm": 0.13218082313555352, + "learning_rate": 4.426856129549431e-06, + "loss": 0.7664, + "step": 1787 + }, + { + "epoch": 0.8876753133920814, + "grad_norm": 0.14542219733133013, + "learning_rate": 4.426233267109308e-06, + "loss": 0.7537, + "step": 1788 + }, + { + "epoch": 0.8881717760953208, + "grad_norm": 0.13937675043101774, + "learning_rate": 4.4256101102753154e-06, + "loss": 0.7624, + "step": 1789 + }, + { + "epoch": 0.8886682387985603, + "grad_norm": 0.14213592075320755, + "learning_rate": 4.424986659142691e-06, + "loss": 0.7745, + "step": 1790 + }, + { + "epoch": 0.8891647015017997, + "grad_norm": 0.13222515034482818, + "learning_rate": 4.424362913806723e-06, + "loss": 0.7704, + "step": 1791 + }, + { + "epoch": 0.889661164205039, + "grad_norm": 0.12997849285210467, + "learning_rate": 4.423738874362737e-06, + "loss": 0.7473, + "step": 1792 + }, + { + "epoch": 0.8901576269082785, + "grad_norm": 0.13865947461800895, + "learning_rate": 4.423114540906108e-06, + "loss": 0.7922, + "step": 1793 + }, + { + "epoch": 0.8906540896115179, + "grad_norm": 0.1400478476922931, + "learning_rate": 4.422489913532258e-06, + "loss": 0.7136, + "step": 1794 + }, + { + "epoch": 0.8911505523147574, + "grad_norm": 0.13354935512552799, + "learning_rate": 4.421864992336648e-06, + "loss": 0.7358, + "step": 1795 + }, + { + "epoch": 0.8916470150179968, + "grad_norm": 0.1503327640511868, + "learning_rate": 4.421239777414789e-06, + "loss": 0.73, + "step": 1796 + }, + { + "epoch": 0.8921434777212361, + "grad_norm": 0.1378043512733515, + "learning_rate": 4.4206142688622365e-06, + "loss": 0.7978, + "step": 1797 + }, + { + "epoch": 0.8926399404244756, + "grad_norm": 0.14901127137101272, + "learning_rate": 4.4199884667745866e-06, + "loss": 0.8067, + "step": 1798 + }, + { + "epoch": 0.893136403127715, + "grad_norm": 0.14348536864057324, + "learning_rate": 4.419362371247483e-06, + "loss": 0.7077, + "step": 1799 + }, + { + "epoch": 0.8936328658309545, + "grad_norm": 0.14175662380934076, + "learning_rate": 4.418735982376617e-06, + "loss": 0.7913, + "step": 1800 + }, + { + "epoch": 0.8941293285341939, + "grad_norm": 0.1358379514515194, + "learning_rate": 4.41810930025772e-06, + "loss": 0.7867, + "step": 1801 + }, + { + "epoch": 0.8946257912374332, + "grad_norm": 0.13785444500689084, + "learning_rate": 4.417482324986572e-06, + "loss": 0.8063, + "step": 1802 + }, + { + "epoch": 0.8951222539406727, + "grad_norm": 0.13967910977720893, + "learning_rate": 4.416855056658994e-06, + "loss": 0.7678, + "step": 1803 + }, + { + "epoch": 0.8956187166439121, + "grad_norm": 0.13147269887815224, + "learning_rate": 4.416227495370855e-06, + "loss": 0.7064, + "step": 1804 + }, + { + "epoch": 0.8961151793471516, + "grad_norm": 0.13960191918704914, + "learning_rate": 4.415599641218068e-06, + "loss": 0.7231, + "step": 1805 + }, + { + "epoch": 0.896611642050391, + "grad_norm": 0.15042506597615363, + "learning_rate": 4.4149714942965905e-06, + "loss": 0.7629, + "step": 1806 + }, + { + "epoch": 0.8971081047536303, + "grad_norm": 0.15012721023542636, + "learning_rate": 4.414343054702424e-06, + "loss": 0.8244, + "step": 1807 + }, + { + "epoch": 0.8976045674568698, + "grad_norm": 0.13677278853363833, + "learning_rate": 4.4137143225316155e-06, + "loss": 0.7664, + "step": 1808 + }, + { + "epoch": 0.8981010301601092, + "grad_norm": 0.13593716948768078, + "learning_rate": 4.4130852978802575e-06, + "loss": 0.7305, + "step": 1809 + }, + { + "epoch": 0.8985974928633487, + "grad_norm": 0.13364129430992383, + "learning_rate": 4.412455980844486e-06, + "loss": 0.7374, + "step": 1810 + }, + { + "epoch": 0.8990939555665881, + "grad_norm": 0.1383082700694781, + "learning_rate": 4.4118263715204805e-06, + "loss": 0.7336, + "step": 1811 + }, + { + "epoch": 0.8995904182698274, + "grad_norm": 0.13423570328137593, + "learning_rate": 4.4111964700044684e-06, + "loss": 0.7759, + "step": 1812 + }, + { + "epoch": 0.9000868809730669, + "grad_norm": 0.1427047436168598, + "learning_rate": 4.41056627639272e-06, + "loss": 0.7461, + "step": 1813 + }, + { + "epoch": 0.9005833436763063, + "grad_norm": 0.14114806673952404, + "learning_rate": 4.409935790781549e-06, + "loss": 0.8111, + "step": 1814 + }, + { + "epoch": 0.9010798063795458, + "grad_norm": 0.13948474917166498, + "learning_rate": 4.4093050132673166e-06, + "loss": 0.7487, + "step": 1815 + }, + { + "epoch": 0.9015762690827852, + "grad_norm": 0.13399238294763033, + "learning_rate": 4.408673943946426e-06, + "loss": 0.7499, + "step": 1816 + }, + { + "epoch": 0.9020727317860245, + "grad_norm": 0.13587971531472334, + "learning_rate": 4.408042582915327e-06, + "loss": 0.7214, + "step": 1817 + }, + { + "epoch": 0.902569194489264, + "grad_norm": 0.14273283326406191, + "learning_rate": 4.407410930270512e-06, + "loss": 0.726, + "step": 1818 + }, + { + "epoch": 0.9030656571925034, + "grad_norm": 0.1377272028925061, + "learning_rate": 4.406778986108519e-06, + "loss": 0.7314, + "step": 1819 + }, + { + "epoch": 0.9035621198957429, + "grad_norm": 0.14415656316000652, + "learning_rate": 4.406146750525931e-06, + "loss": 0.8112, + "step": 1820 + }, + { + "epoch": 0.9040585825989822, + "grad_norm": 0.13299537098888492, + "learning_rate": 4.405514223619375e-06, + "loss": 0.7636, + "step": 1821 + }, + { + "epoch": 0.9045550453022216, + "grad_norm": 0.14030718986113996, + "learning_rate": 4.404881405485522e-06, + "loss": 0.7662, + "step": 1822 + }, + { + "epoch": 0.9050515080054611, + "grad_norm": 0.1285002264716999, + "learning_rate": 4.404248296221089e-06, + "loss": 0.7578, + "step": 1823 + }, + { + "epoch": 0.9055479707087005, + "grad_norm": 0.1357249456374853, + "learning_rate": 4.4036148959228365e-06, + "loss": 0.785, + "step": 1824 + }, + { + "epoch": 0.90604443341194, + "grad_norm": 0.13335345428150927, + "learning_rate": 4.402981204687569e-06, + "loss": 0.7519, + "step": 1825 + }, + { + "epoch": 0.9065408961151793, + "grad_norm": 0.13323854464179258, + "learning_rate": 4.402347222612137e-06, + "loss": 0.7363, + "step": 1826 + }, + { + "epoch": 0.9070373588184187, + "grad_norm": 0.1480468732609157, + "learning_rate": 4.401712949793433e-06, + "loss": 0.8581, + "step": 1827 + }, + { + "epoch": 0.9075338215216582, + "grad_norm": 0.13471889336102866, + "learning_rate": 4.401078386328397e-06, + "loss": 0.7886, + "step": 1828 + }, + { + "epoch": 0.9080302842248976, + "grad_norm": 0.1330688142356741, + "learning_rate": 4.40044353231401e-06, + "loss": 0.7205, + "step": 1829 + }, + { + "epoch": 0.9085267469281371, + "grad_norm": 0.14615671078844134, + "learning_rate": 4.3998083878473004e-06, + "loss": 0.7106, + "step": 1830 + }, + { + "epoch": 0.9090232096313764, + "grad_norm": 0.13549720868989765, + "learning_rate": 4.399172953025341e-06, + "loss": 0.7009, + "step": 1831 + }, + { + "epoch": 0.9095196723346158, + "grad_norm": 0.13185385773550817, + "learning_rate": 4.398537227945246e-06, + "loss": 0.7404, + "step": 1832 + }, + { + "epoch": 0.9100161350378553, + "grad_norm": 0.13266084243229623, + "learning_rate": 4.397901212704176e-06, + "loss": 0.762, + "step": 1833 + }, + { + "epoch": 0.9105125977410947, + "grad_norm": 0.1359129457194815, + "learning_rate": 4.397264907399337e-06, + "loss": 0.7739, + "step": 1834 + }, + { + "epoch": 0.9110090604443342, + "grad_norm": 0.13102612310251532, + "learning_rate": 4.396628312127977e-06, + "loss": 0.7081, + "step": 1835 + }, + { + "epoch": 0.9115055231475735, + "grad_norm": 0.1352565063423689, + "learning_rate": 4.39599142698739e-06, + "loss": 0.7689, + "step": 1836 + }, + { + "epoch": 0.9120019858508129, + "grad_norm": 0.1358199459420324, + "learning_rate": 4.395354252074912e-06, + "loss": 0.7418, + "step": 1837 + }, + { + "epoch": 0.9124984485540524, + "grad_norm": 0.1340420997123764, + "learning_rate": 4.394716787487928e-06, + "loss": 0.7555, + "step": 1838 + }, + { + "epoch": 0.9129949112572918, + "grad_norm": 0.14029534457477436, + "learning_rate": 4.394079033323862e-06, + "loss": 0.759, + "step": 1839 + }, + { + "epoch": 0.9134913739605313, + "grad_norm": 0.14168093185036393, + "learning_rate": 4.393440989680184e-06, + "loss": 0.793, + "step": 1840 + }, + { + "epoch": 0.9139878366637706, + "grad_norm": 0.13613343802332856, + "learning_rate": 4.39280265665441e-06, + "loss": 0.7622, + "step": 1841 + }, + { + "epoch": 0.91448429936701, + "grad_norm": 0.134141801326751, + "learning_rate": 4.392164034344099e-06, + "loss": 0.7318, + "step": 1842 + }, + { + "epoch": 0.9149807620702495, + "grad_norm": 0.13523796345465022, + "learning_rate": 4.391525122846855e-06, + "loss": 0.7306, + "step": 1843 + }, + { + "epoch": 0.9154772247734889, + "grad_norm": 0.1404574112739095, + "learning_rate": 4.390885922260323e-06, + "loss": 0.7078, + "step": 1844 + }, + { + "epoch": 0.9159736874767284, + "grad_norm": 0.1336817191525083, + "learning_rate": 4.390246432682196e-06, + "loss": 0.7827, + "step": 1845 + }, + { + "epoch": 0.9164701501799677, + "grad_norm": 0.13488164789434137, + "learning_rate": 4.389606654210209e-06, + "loss": 0.7829, + "step": 1846 + }, + { + "epoch": 0.9169666128832071, + "grad_norm": 0.13753302958393473, + "learning_rate": 4.388966586942144e-06, + "loss": 0.7592, + "step": 1847 + }, + { + "epoch": 0.9174630755864466, + "grad_norm": 0.13535888688211714, + "learning_rate": 4.388326230975822e-06, + "loss": 0.7047, + "step": 1848 + }, + { + "epoch": 0.917959538289686, + "grad_norm": 0.15778012406959804, + "learning_rate": 4.387685586409113e-06, + "loss": 0.8187, + "step": 1849 + }, + { + "epoch": 0.9184560009929255, + "grad_norm": 0.1421612541209436, + "learning_rate": 4.387044653339929e-06, + "loss": 0.7156, + "step": 1850 + }, + { + "epoch": 0.9189524636961648, + "grad_norm": 0.13379900149196902, + "learning_rate": 4.386403431866227e-06, + "loss": 0.7026, + "step": 1851 + }, + { + "epoch": 0.9194489263994042, + "grad_norm": 0.13738592358777368, + "learning_rate": 4.385761922086006e-06, + "loss": 0.7395, + "step": 1852 + }, + { + "epoch": 0.9199453891026437, + "grad_norm": 0.13390600462410002, + "learning_rate": 4.385120124097311e-06, + "loss": 0.7409, + "step": 1853 + }, + { + "epoch": 0.9204418518058831, + "grad_norm": 0.16039084660182953, + "learning_rate": 4.38447803799823e-06, + "loss": 0.7908, + "step": 1854 + }, + { + "epoch": 0.9209383145091226, + "grad_norm": 0.14623098390188546, + "learning_rate": 4.383835663886897e-06, + "loss": 0.8183, + "step": 1855 + }, + { + "epoch": 0.9214347772123619, + "grad_norm": 0.13634108193684458, + "learning_rate": 4.383193001861488e-06, + "loss": 0.7497, + "step": 1856 + }, + { + "epoch": 0.9219312399156013, + "grad_norm": 0.14386362828828383, + "learning_rate": 4.382550052020223e-06, + "loss": 0.7665, + "step": 1857 + }, + { + "epoch": 0.9224277026188408, + "grad_norm": 0.14077375377005916, + "learning_rate": 4.381906814461366e-06, + "loss": 0.7573, + "step": 1858 + }, + { + "epoch": 0.9229241653220802, + "grad_norm": 0.13040249185121516, + "learning_rate": 4.3812632892832275e-06, + "loss": 0.7399, + "step": 1859 + }, + { + "epoch": 0.9234206280253195, + "grad_norm": 0.14035305401909165, + "learning_rate": 4.38061947658416e-06, + "loss": 0.7425, + "step": 1860 + }, + { + "epoch": 0.923917090728559, + "grad_norm": 0.13852311964581557, + "learning_rate": 4.379975376462557e-06, + "loss": 0.7366, + "step": 1861 + }, + { + "epoch": 0.9244135534317984, + "grad_norm": 0.14569169577045035, + "learning_rate": 4.379330989016861e-06, + "loss": 0.7598, + "step": 1862 + }, + { + "epoch": 0.9249100161350379, + "grad_norm": 0.1420209523517496, + "learning_rate": 4.3786863143455575e-06, + "loss": 0.7468, + "step": 1863 + }, + { + "epoch": 0.9254064788382773, + "grad_norm": 0.13850510279377587, + "learning_rate": 4.378041352547172e-06, + "loss": 0.7813, + "step": 1864 + }, + { + "epoch": 0.9259029415415166, + "grad_norm": 0.13571804405986027, + "learning_rate": 4.3773961037202784e-06, + "loss": 0.7678, + "step": 1865 + }, + { + "epoch": 0.9263994042447561, + "grad_norm": 0.15050220230033115, + "learning_rate": 4.3767505679634925e-06, + "loss": 0.7502, + "step": 1866 + }, + { + "epoch": 0.9268958669479955, + "grad_norm": 0.13968131538073705, + "learning_rate": 4.376104745375474e-06, + "loss": 0.7503, + "step": 1867 + }, + { + "epoch": 0.927392329651235, + "grad_norm": 0.15025837625476948, + "learning_rate": 4.375458636054924e-06, + "loss": 0.7845, + "step": 1868 + }, + { + "epoch": 0.9278887923544744, + "grad_norm": 0.14063528342813086, + "learning_rate": 4.374812240100594e-06, + "loss": 0.7252, + "step": 1869 + }, + { + "epoch": 0.9283852550577137, + "grad_norm": 0.13713965606972475, + "learning_rate": 4.374165557611273e-06, + "loss": 0.7426, + "step": 1870 + }, + { + "epoch": 0.9288817177609532, + "grad_norm": 0.13695944367515048, + "learning_rate": 4.373518588685797e-06, + "loss": 0.7409, + "step": 1871 + }, + { + "epoch": 0.9293781804641926, + "grad_norm": 0.1422748389549011, + "learning_rate": 4.372871333423044e-06, + "loss": 0.7377, + "step": 1872 + }, + { + "epoch": 0.9298746431674321, + "grad_norm": 0.15953047007232668, + "learning_rate": 4.372223791921937e-06, + "loss": 0.794, + "step": 1873 + }, + { + "epoch": 0.9303711058706715, + "grad_norm": 0.14027429808379158, + "learning_rate": 4.371575964281441e-06, + "loss": 0.7338, + "step": 1874 + }, + { + "epoch": 0.9308675685739108, + "grad_norm": 0.1402138448734584, + "learning_rate": 4.370927850600569e-06, + "loss": 0.8293, + "step": 1875 + }, + { + "epoch": 0.9313640312771503, + "grad_norm": 0.14210713241359957, + "learning_rate": 4.370279450978372e-06, + "loss": 0.724, + "step": 1876 + }, + { + "epoch": 0.9318604939803897, + "grad_norm": 0.13815508146885405, + "learning_rate": 4.369630765513949e-06, + "loss": 0.7071, + "step": 1877 + }, + { + "epoch": 0.9323569566836292, + "grad_norm": 0.13952059008632328, + "learning_rate": 4.368981794306441e-06, + "loss": 0.7539, + "step": 1878 + }, + { + "epoch": 0.9328534193868686, + "grad_norm": 0.13939971283229288, + "learning_rate": 4.368332537455032e-06, + "loss": 0.7861, + "step": 1879 + }, + { + "epoch": 0.9333498820901079, + "grad_norm": 0.1419171606522094, + "learning_rate": 4.367682995058952e-06, + "loss": 0.7177, + "step": 1880 + }, + { + "epoch": 0.9338463447933474, + "grad_norm": 0.1368329170972017, + "learning_rate": 4.367033167217472e-06, + "loss": 0.6944, + "step": 1881 + }, + { + "epoch": 0.9343428074965868, + "grad_norm": 0.13781096600616807, + "learning_rate": 4.366383054029907e-06, + "loss": 0.7615, + "step": 1882 + }, + { + "epoch": 0.9348392701998263, + "grad_norm": 0.13708727022446426, + "learning_rate": 4.365732655595618e-06, + "loss": 0.743, + "step": 1883 + }, + { + "epoch": 0.9353357329030657, + "grad_norm": 0.13672878592574392, + "learning_rate": 4.365081972014007e-06, + "loss": 0.7352, + "step": 1884 + }, + { + "epoch": 0.935832195606305, + "grad_norm": 0.1367173447197288, + "learning_rate": 4.364431003384522e-06, + "loss": 0.7197, + "step": 1885 + }, + { + "epoch": 0.9363286583095445, + "grad_norm": 0.13990056273075885, + "learning_rate": 4.36377974980665e-06, + "loss": 0.7777, + "step": 1886 + }, + { + "epoch": 0.9368251210127839, + "grad_norm": 0.1330138557768458, + "learning_rate": 4.363128211379929e-06, + "loss": 0.7605, + "step": 1887 + }, + { + "epoch": 0.9373215837160234, + "grad_norm": 0.13245661584732554, + "learning_rate": 4.362476388203932e-06, + "loss": 0.6882, + "step": 1888 + }, + { + "epoch": 0.9378180464192628, + "grad_norm": 0.13413118235673072, + "learning_rate": 4.361824280378283e-06, + "loss": 0.7746, + "step": 1889 + }, + { + "epoch": 0.9383145091225021, + "grad_norm": 0.1482744865905161, + "learning_rate": 4.361171888002644e-06, + "loss": 0.6962, + "step": 1890 + }, + { + "epoch": 0.9388109718257416, + "grad_norm": 0.13948550276160163, + "learning_rate": 4.360519211176724e-06, + "loss": 0.7621, + "step": 1891 + }, + { + "epoch": 0.939307434528981, + "grad_norm": 0.13306011715981336, + "learning_rate": 4.359866250000273e-06, + "loss": 0.7034, + "step": 1892 + }, + { + "epoch": 0.9398038972322205, + "grad_norm": 0.13596845635437071, + "learning_rate": 4.359213004573087e-06, + "loss": 0.743, + "step": 1893 + }, + { + "epoch": 0.9403003599354598, + "grad_norm": 0.1353452240695457, + "learning_rate": 4.358559474995003e-06, + "loss": 0.7207, + "step": 1894 + }, + { + "epoch": 0.9407968226386992, + "grad_norm": 0.14241184989931713, + "learning_rate": 4.357905661365904e-06, + "loss": 0.7474, + "step": 1895 + }, + { + "epoch": 0.9412932853419387, + "grad_norm": 0.13833487675781403, + "learning_rate": 4.357251563785712e-06, + "loss": 0.783, + "step": 1896 + }, + { + "epoch": 0.9417897480451781, + "grad_norm": 0.14544580018979297, + "learning_rate": 4.3565971823543995e-06, + "loss": 0.7439, + "step": 1897 + }, + { + "epoch": 0.9422862107484176, + "grad_norm": 0.13830700272336566, + "learning_rate": 4.355942517171975e-06, + "loss": 0.7761, + "step": 1898 + }, + { + "epoch": 0.9427826734516569, + "grad_norm": 0.13022704197917237, + "learning_rate": 4.355287568338494e-06, + "loss": 0.7591, + "step": 1899 + }, + { + "epoch": 0.9432791361548963, + "grad_norm": 0.14112497002640229, + "learning_rate": 4.354632335954056e-06, + "loss": 0.7541, + "step": 1900 + }, + { + "epoch": 0.9437755988581358, + "grad_norm": 0.13978038311901106, + "learning_rate": 4.353976820118803e-06, + "loss": 0.7811, + "step": 1901 + }, + { + "epoch": 0.9442720615613752, + "grad_norm": 0.14646514863621188, + "learning_rate": 4.353321020932918e-06, + "loss": 0.77, + "step": 1902 + }, + { + "epoch": 0.9447685242646147, + "grad_norm": 0.14275771666539322, + "learning_rate": 4.352664938496631e-06, + "loss": 0.7156, + "step": 1903 + }, + { + "epoch": 0.945264986967854, + "grad_norm": 0.15199585201866644, + "learning_rate": 4.352008572910213e-06, + "loss": 0.7803, + "step": 1904 + }, + { + "epoch": 0.9457614496710934, + "grad_norm": 0.15012570466464772, + "learning_rate": 4.35135192427398e-06, + "loss": 0.8116, + "step": 1905 + }, + { + "epoch": 0.9462579123743329, + "grad_norm": 0.14798566149846193, + "learning_rate": 4.350694992688289e-06, + "loss": 0.7983, + "step": 1906 + }, + { + "epoch": 0.9467543750775723, + "grad_norm": 0.13120199262792004, + "learning_rate": 4.350037778253543e-06, + "loss": 0.6915, + "step": 1907 + }, + { + "epoch": 0.9472508377808118, + "grad_norm": 0.14145729405767365, + "learning_rate": 4.3493802810701845e-06, + "loss": 0.7309, + "step": 1908 + }, + { + "epoch": 0.9477473004840511, + "grad_norm": 0.15086944052092097, + "learning_rate": 4.348722501238704e-06, + "loss": 0.744, + "step": 1909 + }, + { + "epoch": 0.9482437631872905, + "grad_norm": 0.14048210428493602, + "learning_rate": 4.348064438859629e-06, + "loss": 0.8235, + "step": 1910 + }, + { + "epoch": 0.94874022589053, + "grad_norm": 0.14254823425879515, + "learning_rate": 4.347406094033539e-06, + "loss": 0.7813, + "step": 1911 + }, + { + "epoch": 0.9492366885937694, + "grad_norm": 0.1346422118252251, + "learning_rate": 4.346747466861046e-06, + "loss": 0.7329, + "step": 1912 + }, + { + "epoch": 0.9497331512970089, + "grad_norm": 0.13446205456154095, + "learning_rate": 4.346088557442813e-06, + "loss": 0.7271, + "step": 1913 + }, + { + "epoch": 0.9502296140002482, + "grad_norm": 0.13403300687387085, + "learning_rate": 4.345429365879545e-06, + "loss": 0.7879, + "step": 1914 + }, + { + "epoch": 0.9507260767034876, + "grad_norm": 0.13602743859277094, + "learning_rate": 4.344769892271987e-06, + "loss": 0.7527, + "step": 1915 + }, + { + "epoch": 0.9512225394067271, + "grad_norm": 0.13274614412032776, + "learning_rate": 4.34411013672093e-06, + "loss": 0.7, + "step": 1916 + }, + { + "epoch": 0.9517190021099665, + "grad_norm": 0.1417039402628044, + "learning_rate": 4.343450099327207e-06, + "loss": 0.7745, + "step": 1917 + }, + { + "epoch": 0.952215464813206, + "grad_norm": 0.13859569069194763, + "learning_rate": 4.342789780191693e-06, + "loss": 0.8001, + "step": 1918 + }, + { + "epoch": 0.9527119275164453, + "grad_norm": 0.14105331622265738, + "learning_rate": 4.342129179415308e-06, + "loss": 0.7568, + "step": 1919 + }, + { + "epoch": 0.9532083902196847, + "grad_norm": 0.13279996199472438, + "learning_rate": 4.341468297099014e-06, + "loss": 0.7257, + "step": 1920 + }, + { + "epoch": 0.9537048529229242, + "grad_norm": 0.14360632974813572, + "learning_rate": 4.340807133343817e-06, + "loss": 0.7325, + "step": 1921 + }, + { + "epoch": 0.9542013156261636, + "grad_norm": 0.1388585347465505, + "learning_rate": 4.340145688250766e-06, + "loss": 0.7798, + "step": 1922 + }, + { + "epoch": 0.9546977783294031, + "grad_norm": 0.1432237090077402, + "learning_rate": 4.339483961920949e-06, + "loss": 0.7478, + "step": 1923 + }, + { + "epoch": 0.9551942410326424, + "grad_norm": 0.13728709678976309, + "learning_rate": 4.3388219544555035e-06, + "loss": 0.7208, + "step": 1924 + }, + { + "epoch": 0.9556907037358818, + "grad_norm": 0.138040404421968, + "learning_rate": 4.338159665955605e-06, + "loss": 0.7527, + "step": 1925 + }, + { + "epoch": 0.9561871664391213, + "grad_norm": 0.13539548550777614, + "learning_rate": 4.337497096522474e-06, + "loss": 0.759, + "step": 1926 + }, + { + "epoch": 0.9566836291423607, + "grad_norm": 0.14360137088724848, + "learning_rate": 4.336834246257374e-06, + "loss": 0.7671, + "step": 1927 + }, + { + "epoch": 0.9571800918456, + "grad_norm": 0.13929702076400688, + "learning_rate": 4.336171115261611e-06, + "loss": 0.7567, + "step": 1928 + }, + { + "epoch": 0.9576765545488395, + "grad_norm": 0.13535723942569114, + "learning_rate": 4.335507703636533e-06, + "loss": 0.7778, + "step": 1929 + }, + { + "epoch": 0.9581730172520789, + "grad_norm": 0.1388756684946805, + "learning_rate": 4.334844011483534e-06, + "loss": 0.7776, + "step": 1930 + }, + { + "epoch": 0.9586694799553184, + "grad_norm": 0.1483955001430376, + "learning_rate": 4.3341800389040465e-06, + "loss": 0.791, + "step": 1931 + }, + { + "epoch": 0.9591659426585578, + "grad_norm": 0.13859104875120595, + "learning_rate": 4.333515785999549e-06, + "loss": 0.7262, + "step": 1932 + }, + { + "epoch": 0.9596624053617971, + "grad_norm": 0.14028837341254094, + "learning_rate": 4.3328512528715624e-06, + "loss": 0.7514, + "step": 1933 + }, + { + "epoch": 0.9601588680650366, + "grad_norm": 0.13979863885496932, + "learning_rate": 4.332186439621649e-06, + "loss": 0.7499, + "step": 1934 + }, + { + "epoch": 0.960655330768276, + "grad_norm": 0.1364911188988977, + "learning_rate": 4.331521346351415e-06, + "loss": 0.7733, + "step": 1935 + }, + { + "epoch": 0.9611517934715155, + "grad_norm": 0.13731108508080173, + "learning_rate": 4.330855973162509e-06, + "loss": 0.7811, + "step": 1936 + }, + { + "epoch": 0.9616482561747549, + "grad_norm": 0.13586218904438216, + "learning_rate": 4.330190320156623e-06, + "loss": 0.7125, + "step": 1937 + }, + { + "epoch": 0.9621447188779942, + "grad_norm": 0.14068018909196964, + "learning_rate": 4.329524387435493e-06, + "loss": 0.7356, + "step": 1938 + }, + { + "epoch": 0.9626411815812337, + "grad_norm": 0.13818425959230285, + "learning_rate": 4.328858175100893e-06, + "loss": 0.803, + "step": 1939 + }, + { + "epoch": 0.9631376442844731, + "grad_norm": 0.13682184061425626, + "learning_rate": 4.328191683254646e-06, + "loss": 0.7409, + "step": 1940 + }, + { + "epoch": 0.9636341069877126, + "grad_norm": 0.13139663700635784, + "learning_rate": 4.327524911998611e-06, + "loss": 0.7198, + "step": 1941 + }, + { + "epoch": 0.964130569690952, + "grad_norm": 0.13247474416573432, + "learning_rate": 4.326857861434697e-06, + "loss": 0.7251, + "step": 1942 + }, + { + "epoch": 0.9646270323941913, + "grad_norm": 0.13908703323534935, + "learning_rate": 4.326190531664849e-06, + "loss": 0.7511, + "step": 1943 + }, + { + "epoch": 0.9651234950974308, + "grad_norm": 0.13675826493632884, + "learning_rate": 4.32552292279106e-06, + "loss": 0.7485, + "step": 1944 + }, + { + "epoch": 0.9656199578006702, + "grad_norm": 0.13134376978813925, + "learning_rate": 4.3248550349153614e-06, + "loss": 0.6937, + "step": 1945 + }, + { + "epoch": 0.9661164205039097, + "grad_norm": 0.13386771511134407, + "learning_rate": 4.324186868139831e-06, + "loss": 0.7163, + "step": 1946 + }, + { + "epoch": 0.9666128832071491, + "grad_norm": 0.14332833450437155, + "learning_rate": 4.323518422566586e-06, + "loss": 0.7213, + "step": 1947 + }, + { + "epoch": 0.9671093459103884, + "grad_norm": 0.1420284438393322, + "learning_rate": 4.322849698297787e-06, + "loss": 0.7476, + "step": 1948 + }, + { + "epoch": 0.9676058086136279, + "grad_norm": 0.13709550107289373, + "learning_rate": 4.322180695435641e-06, + "loss": 0.7763, + "step": 1949 + }, + { + "epoch": 0.9681022713168673, + "grad_norm": 0.14649727566446094, + "learning_rate": 4.32151141408239e-06, + "loss": 0.7886, + "step": 1950 + }, + { + "epoch": 0.9685987340201068, + "grad_norm": 0.13158072122538214, + "learning_rate": 4.320841854340327e-06, + "loss": 0.7063, + "step": 1951 + }, + { + "epoch": 0.9690951967233462, + "grad_norm": 0.13262016397878115, + "learning_rate": 4.32017201631178e-06, + "loss": 0.6991, + "step": 1952 + }, + { + "epoch": 0.9695916594265855, + "grad_norm": 0.14641301035110524, + "learning_rate": 4.319501900099125e-06, + "loss": 0.7397, + "step": 1953 + }, + { + "epoch": 0.970088122129825, + "grad_norm": 0.1372723043098086, + "learning_rate": 4.318831505804778e-06, + "loss": 0.7251, + "step": 1954 + }, + { + "epoch": 0.9705845848330644, + "grad_norm": 0.13338014846503463, + "learning_rate": 4.318160833531199e-06, + "loss": 0.6907, + "step": 1955 + }, + { + "epoch": 0.9710810475363039, + "grad_norm": 0.15322411141967948, + "learning_rate": 4.317489883380887e-06, + "loss": 0.7292, + "step": 1956 + }, + { + "epoch": 0.9715775102395433, + "grad_norm": 0.14101937816845833, + "learning_rate": 4.3168186554563885e-06, + "loss": 0.7558, + "step": 1957 + }, + { + "epoch": 0.9720739729427826, + "grad_norm": 0.14044388792373283, + "learning_rate": 4.316147149860289e-06, + "loss": 0.7488, + "step": 1958 + }, + { + "epoch": 0.9725704356460221, + "grad_norm": 0.13087099733521357, + "learning_rate": 4.315475366695217e-06, + "loss": 0.7315, + "step": 1959 + }, + { + "epoch": 0.9730668983492615, + "grad_norm": 0.13481714845223355, + "learning_rate": 4.314803306063845e-06, + "loss": 0.7712, + "step": 1960 + }, + { + "epoch": 0.973563361052501, + "grad_norm": 0.14067827771924268, + "learning_rate": 4.3141309680688845e-06, + "loss": 0.768, + "step": 1961 + }, + { + "epoch": 0.9740598237557403, + "grad_norm": 0.15976691205185806, + "learning_rate": 4.313458352813093e-06, + "loss": 0.7287, + "step": 1962 + }, + { + "epoch": 0.9745562864589797, + "grad_norm": 0.15233463929685187, + "learning_rate": 4.31278546039927e-06, + "loss": 0.7577, + "step": 1963 + }, + { + "epoch": 0.9750527491622192, + "grad_norm": 0.13881182089614136, + "learning_rate": 4.312112290930255e-06, + "loss": 0.7525, + "step": 1964 + }, + { + "epoch": 0.9755492118654586, + "grad_norm": 0.13613919026600058, + "learning_rate": 4.31143884450893e-06, + "loss": 0.7338, + "step": 1965 + }, + { + "epoch": 0.9760456745686981, + "grad_norm": 0.14009747972178313, + "learning_rate": 4.310765121238223e-06, + "loss": 0.7685, + "step": 1966 + }, + { + "epoch": 0.9765421372719374, + "grad_norm": 0.1398735823592921, + "learning_rate": 4.310091121221101e-06, + "loss": 0.7142, + "step": 1967 + }, + { + "epoch": 0.9770385999751768, + "grad_norm": 0.15370727403582576, + "learning_rate": 4.3094168445605735e-06, + "loss": 0.7518, + "step": 1968 + }, + { + "epoch": 0.9775350626784163, + "grad_norm": 0.13221826861513192, + "learning_rate": 4.308742291359692e-06, + "loss": 0.7045, + "step": 1969 + }, + { + "epoch": 0.9780315253816557, + "grad_norm": 0.13298617913810548, + "learning_rate": 4.308067461721553e-06, + "loss": 0.7197, + "step": 1970 + }, + { + "epoch": 0.9785279880848952, + "grad_norm": 0.13310197028851603, + "learning_rate": 4.307392355749293e-06, + "loss": 0.7276, + "step": 1971 + }, + { + "epoch": 0.9790244507881345, + "grad_norm": 0.13441131799419243, + "learning_rate": 4.30671697354609e-06, + "loss": 0.7461, + "step": 1972 + }, + { + "epoch": 0.9795209134913739, + "grad_norm": 0.13598707003248428, + "learning_rate": 4.306041315215167e-06, + "loss": 0.751, + "step": 1973 + }, + { + "epoch": 0.9800173761946134, + "grad_norm": 0.13020590326743284, + "learning_rate": 4.305365380859786e-06, + "loss": 0.7471, + "step": 1974 + }, + { + "epoch": 0.9805138388978528, + "grad_norm": 0.13548633327801685, + "learning_rate": 4.304689170583254e-06, + "loss": 0.7304, + "step": 1975 + }, + { + "epoch": 0.9810103016010923, + "grad_norm": 0.14147295011534075, + "learning_rate": 4.304012684488917e-06, + "loss": 0.7926, + "step": 1976 + }, + { + "epoch": 0.9815067643043316, + "grad_norm": 0.14704693808375974, + "learning_rate": 4.303335922680167e-06, + "loss": 0.7629, + "step": 1977 + }, + { + "epoch": 0.982003227007571, + "grad_norm": 0.135220427389001, + "learning_rate": 4.302658885260436e-06, + "loss": 0.7305, + "step": 1978 + }, + { + "epoch": 0.9824996897108105, + "grad_norm": 0.13439053824847158, + "learning_rate": 4.301981572333197e-06, + "loss": 0.7752, + "step": 1979 + }, + { + "epoch": 0.9829961524140499, + "grad_norm": 0.13771634609127179, + "learning_rate": 4.3013039840019675e-06, + "loss": 0.7251, + "step": 1980 + }, + { + "epoch": 0.9834926151172894, + "grad_norm": 0.1326406234325998, + "learning_rate": 4.300626120370306e-06, + "loss": 0.7595, + "step": 1981 + }, + { + "epoch": 0.9839890778205287, + "grad_norm": 0.14246037641515524, + "learning_rate": 4.2999479815418135e-06, + "loss": 0.7618, + "step": 1982 + }, + { + "epoch": 0.9844855405237681, + "grad_norm": 0.14003774333863, + "learning_rate": 4.299269567620131e-06, + "loss": 0.7133, + "step": 1983 + }, + { + "epoch": 0.9849820032270076, + "grad_norm": 0.13443584919833715, + "learning_rate": 4.2985908787089445e-06, + "loss": 0.7722, + "step": 1984 + }, + { + "epoch": 0.985478465930247, + "grad_norm": 0.13668575653481171, + "learning_rate": 4.29791191491198e-06, + "loss": 0.7583, + "step": 1985 + }, + { + "epoch": 0.9859749286334865, + "grad_norm": 0.14279910896798906, + "learning_rate": 4.297232676333007e-06, + "loss": 0.7981, + "step": 1986 + }, + { + "epoch": 0.9864713913367258, + "grad_norm": 0.14161222593397266, + "learning_rate": 4.296553163075836e-06, + "loss": 0.7594, + "step": 1987 + }, + { + "epoch": 0.9869678540399652, + "grad_norm": 0.13831563618388676, + "learning_rate": 4.295873375244319e-06, + "loss": 0.6993, + "step": 1988 + }, + { + "epoch": 0.9874643167432047, + "grad_norm": 0.13787046565146302, + "learning_rate": 4.2951933129423515e-06, + "loss": 0.7358, + "step": 1989 + }, + { + "epoch": 0.9879607794464441, + "grad_norm": 0.13779555768214863, + "learning_rate": 4.29451297627387e-06, + "loss": 0.7178, + "step": 1990 + }, + { + "epoch": 0.9884572421496836, + "grad_norm": 0.13217653339807073, + "learning_rate": 4.293832365342853e-06, + "loss": 0.7439, + "step": 1991 + }, + { + "epoch": 0.9889537048529229, + "grad_norm": 0.15651907445083008, + "learning_rate": 4.293151480253321e-06, + "loss": 0.7852, + "step": 1992 + }, + { + "epoch": 0.9894501675561623, + "grad_norm": 0.14035753112824711, + "learning_rate": 4.292470321109336e-06, + "loss": 0.8147, + "step": 1993 + }, + { + "epoch": 0.9899466302594018, + "grad_norm": 0.1344265290214628, + "learning_rate": 4.291788888015002e-06, + "loss": 0.7576, + "step": 1994 + }, + { + "epoch": 0.9904430929626412, + "grad_norm": 0.13379707071517158, + "learning_rate": 4.291107181074466e-06, + "loss": 0.7163, + "step": 1995 + }, + { + "epoch": 0.9909395556658807, + "grad_norm": 0.1399492515733686, + "learning_rate": 4.290425200391917e-06, + "loss": 0.7922, + "step": 1996 + }, + { + "epoch": 0.99143601836912, + "grad_norm": 0.14344554049611216, + "learning_rate": 4.289742946071581e-06, + "loss": 0.7665, + "step": 1997 + }, + { + "epoch": 0.9919324810723594, + "grad_norm": 0.1382864134304432, + "learning_rate": 4.2890604182177336e-06, + "loss": 0.7557, + "step": 1998 + }, + { + "epoch": 0.9924289437755989, + "grad_norm": 0.14309914226914516, + "learning_rate": 4.288377616934686e-06, + "loss": 0.7213, + "step": 1999 + }, + { + "epoch": 0.9929254064788383, + "grad_norm": 0.13717535026025166, + "learning_rate": 4.287694542326795e-06, + "loss": 0.7113, + "step": 2000 + }, + { + "epoch": 0.9934218691820776, + "grad_norm": 0.13746827797324848, + "learning_rate": 4.287011194498456e-06, + "loss": 0.7891, + "step": 2001 + }, + { + "epoch": 0.9939183318853171, + "grad_norm": 0.1421486270249576, + "learning_rate": 4.2863275735541085e-06, + "loss": 0.737, + "step": 2002 + }, + { + "epoch": 0.9944147945885565, + "grad_norm": 0.13682547516347832, + "learning_rate": 4.285643679598233e-06, + "loss": 0.7373, + "step": 2003 + }, + { + "epoch": 0.994911257291796, + "grad_norm": 0.1266137711385144, + "learning_rate": 4.284959512735352e-06, + "loss": 0.6983, + "step": 2004 + }, + { + "epoch": 0.9954077199950354, + "grad_norm": 0.14377205813881055, + "learning_rate": 4.284275073070028e-06, + "loss": 0.7426, + "step": 2005 + }, + { + "epoch": 0.9959041826982747, + "grad_norm": 0.1350794657949948, + "learning_rate": 4.283590360706868e-06, + "loss": 0.7668, + "step": 2006 + }, + { + "epoch": 0.9964006454015142, + "grad_norm": 0.14250458148212633, + "learning_rate": 4.28290537575052e-06, + "loss": 0.8081, + "step": 2007 + }, + { + "epoch": 0.9968971081047536, + "grad_norm": 0.13575371037098183, + "learning_rate": 4.282220118305672e-06, + "loss": 0.7055, + "step": 2008 + }, + { + "epoch": 0.9973935708079931, + "grad_norm": 0.13942020771733923, + "learning_rate": 4.281534588477054e-06, + "loss": 0.7798, + "step": 2009 + }, + { + "epoch": 0.9978900335112325, + "grad_norm": 0.13808731679650627, + "learning_rate": 4.280848786369439e-06, + "loss": 0.7241, + "step": 2010 + }, + { + "epoch": 0.9983864962144718, + "grad_norm": 0.14200950616113964, + "learning_rate": 4.280162712087641e-06, + "loss": 0.7211, + "step": 2011 + }, + { + "epoch": 0.9988829589177113, + "grad_norm": 0.13227471958984183, + "learning_rate": 4.2794763657365155e-06, + "loss": 0.7434, + "step": 2012 + }, + { + "epoch": 0.9993794216209507, + "grad_norm": 0.1506748510159815, + "learning_rate": 4.278789747420959e-06, + "loss": 0.772, + "step": 2013 + }, + { + "epoch": 0.9998758843241902, + "grad_norm": 0.1692117125114903, + "learning_rate": 4.27810285724591e-06, + "loss": 0.8515, + "step": 2014 + }, + { + "epoch": 1.0, + "grad_norm": 0.1692117125114903, + "learning_rate": 4.27741569531635e-06, + "loss": 0.1869, + "step": 2015 + }, + { + "epoch": 1.0003723470274295, + "grad_norm": 0.13641143847300066, + "learning_rate": 4.276728261737298e-06, + "loss": 0.5693, + "step": 2016 + }, + { + "epoch": 1.0003723470274295, + "eval_loss": 0.7493109107017517, + "eval_runtime": 135.5139, + "eval_samples_per_second": 223.984, + "eval_steps_per_second": 28.005, + "step": 2016 + }, + { + "epoch": 1.0004964627032393, + "grad_norm": 0.14993340674591726, + "learning_rate": 4.27604055661382e-06, + "loss": 0.741, + "step": 2017 + }, + { + "epoch": 1.0009929254064789, + "grad_norm": 0.15136956647535096, + "learning_rate": 4.275352580051019e-06, + "loss": 0.7305, + "step": 2018 + }, + { + "epoch": 1.0014893881097182, + "grad_norm": 0.13556367994060173, + "learning_rate": 4.274664332154042e-06, + "loss": 0.6991, + "step": 2019 + }, + { + "epoch": 1.0019858508129578, + "grad_norm": 0.13474503555709075, + "learning_rate": 4.273975813028076e-06, + "loss": 0.7061, + "step": 2020 + }, + { + "epoch": 1.002482313516197, + "grad_norm": 0.13926546524904157, + "learning_rate": 4.273287022778351e-06, + "loss": 0.7501, + "step": 2021 + }, + { + "epoch": 1.0029787762194364, + "grad_norm": 0.1417176043167176, + "learning_rate": 4.272597961510137e-06, + "loss": 0.7204, + "step": 2022 + }, + { + "epoch": 1.003475238922676, + "grad_norm": 0.1419815927815163, + "learning_rate": 4.271908629328747e-06, + "loss": 0.7269, + "step": 2023 + }, + { + "epoch": 1.0039717016259153, + "grad_norm": 0.14837462280399044, + "learning_rate": 4.2712190263395315e-06, + "loss": 0.7593, + "step": 2024 + }, + { + "epoch": 1.0044681643291549, + "grad_norm": 0.1424958004520062, + "learning_rate": 4.270529152647889e-06, + "loss": 0.7987, + "step": 2025 + }, + { + "epoch": 1.0049646270323942, + "grad_norm": 0.13557327739425565, + "learning_rate": 4.269839008359252e-06, + "loss": 0.7095, + "step": 2026 + }, + { + "epoch": 1.0054610897356335, + "grad_norm": 0.1350892189012179, + "learning_rate": 4.269148593579101e-06, + "loss": 0.7297, + "step": 2027 + }, + { + "epoch": 1.005957552438873, + "grad_norm": 0.1431149142071869, + "learning_rate": 4.268457908412953e-06, + "loss": 0.7074, + "step": 2028 + }, + { + "epoch": 1.0064540151421124, + "grad_norm": 0.13418266947538876, + "learning_rate": 4.267766952966369e-06, + "loss": 0.7235, + "step": 2029 + }, + { + "epoch": 1.006950477845352, + "grad_norm": 0.131551107709248, + "learning_rate": 4.267075727344951e-06, + "loss": 0.7286, + "step": 2030 + }, + { + "epoch": 1.0074469405485913, + "grad_norm": 0.13666318482495718, + "learning_rate": 4.266384231654339e-06, + "loss": 0.7225, + "step": 2031 + }, + { + "epoch": 1.0079434032518306, + "grad_norm": 0.14957531887872869, + "learning_rate": 4.265692466000221e-06, + "loss": 0.7452, + "step": 2032 + }, + { + "epoch": 1.0084398659550702, + "grad_norm": 0.1352458457312148, + "learning_rate": 4.2650004304883195e-06, + "loss": 0.7488, + "step": 2033 + }, + { + "epoch": 1.0089363286583095, + "grad_norm": 0.13431771924046446, + "learning_rate": 4.2643081252244024e-06, + "loss": 0.7408, + "step": 2034 + }, + { + "epoch": 1.009432791361549, + "grad_norm": 0.1404702073381516, + "learning_rate": 4.263615550314276e-06, + "loss": 0.7136, + "step": 2035 + }, + { + "epoch": 1.0099292540647884, + "grad_norm": 0.14333737075703962, + "learning_rate": 4.262922705863791e-06, + "loss": 0.7315, + "step": 2036 + }, + { + "epoch": 1.0104257167680277, + "grad_norm": 0.13931858496722177, + "learning_rate": 4.262229591978836e-06, + "loss": 0.7515, + "step": 2037 + }, + { + "epoch": 1.0109221794712673, + "grad_norm": 0.14111176709272, + "learning_rate": 4.261536208765343e-06, + "loss": 0.685, + "step": 2038 + }, + { + "epoch": 1.0114186421745066, + "grad_norm": 0.13873104450186644, + "learning_rate": 4.260842556329285e-06, + "loss": 0.7419, + "step": 2039 + }, + { + "epoch": 1.0119151048777462, + "grad_norm": 0.14733619687470917, + "learning_rate": 4.260148634776675e-06, + "loss": 0.745, + "step": 2040 + }, + { + "epoch": 1.0124115675809855, + "grad_norm": 0.13985004223191186, + "learning_rate": 4.259454444213568e-06, + "loss": 0.7246, + "step": 2041 + }, + { + "epoch": 1.0129080302842248, + "grad_norm": 0.13995194992924537, + "learning_rate": 4.25875998474606e-06, + "loss": 0.7642, + "step": 2042 + }, + { + "epoch": 1.0134044929874644, + "grad_norm": 0.1362055125185364, + "learning_rate": 4.258065256480288e-06, + "loss": 0.7341, + "step": 2043 + }, + { + "epoch": 1.0139009556907037, + "grad_norm": 0.1422465441519774, + "learning_rate": 4.25737025952243e-06, + "loss": 0.7266, + "step": 2044 + }, + { + "epoch": 1.0143974183939433, + "grad_norm": 0.13512192671393297, + "learning_rate": 4.2566749939787056e-06, + "loss": 0.7098, + "step": 2045 + }, + { + "epoch": 1.0148938810971826, + "grad_norm": 0.13453521708159794, + "learning_rate": 4.255979459955374e-06, + "loss": 0.6598, + "step": 2046 + }, + { + "epoch": 1.015390343800422, + "grad_norm": 0.1458759220859774, + "learning_rate": 4.255283657558736e-06, + "loss": 0.7255, + "step": 2047 + }, + { + "epoch": 1.0158868065036615, + "grad_norm": 0.13826399594709918, + "learning_rate": 4.2545875868951355e-06, + "loss": 0.6747, + "step": 2048 + }, + { + "epoch": 1.0163832692069008, + "grad_norm": 0.15055189765595503, + "learning_rate": 4.253891248070956e-06, + "loss": 0.7719, + "step": 2049 + }, + { + "epoch": 1.0168797319101404, + "grad_norm": 0.13416248781345586, + "learning_rate": 4.253194641192621e-06, + "loss": 0.6815, + "step": 2050 + }, + { + "epoch": 1.0173761946133797, + "grad_norm": 0.13559489599334676, + "learning_rate": 4.252497766366593e-06, + "loss": 0.7037, + "step": 2051 + }, + { + "epoch": 1.017872657316619, + "grad_norm": 0.172228917141983, + "learning_rate": 4.251800623699382e-06, + "loss": 0.7434, + "step": 2052 + }, + { + "epoch": 1.0183691200198586, + "grad_norm": 0.1370557981462678, + "learning_rate": 4.251103213297534e-06, + "loss": 0.724, + "step": 2053 + }, + { + "epoch": 1.018865582723098, + "grad_norm": 0.13629385144601702, + "learning_rate": 4.250405535267636e-06, + "loss": 0.753, + "step": 2054 + }, + { + "epoch": 1.0193620454263372, + "grad_norm": 0.13309228573779064, + "learning_rate": 4.249707589716318e-06, + "loss": 0.7319, + "step": 2055 + }, + { + "epoch": 1.0198585081295768, + "grad_norm": 0.13715514685240124, + "learning_rate": 4.24900937675025e-06, + "loss": 0.7897, + "step": 2056 + }, + { + "epoch": 1.0203549708328161, + "grad_norm": 0.1357669090684218, + "learning_rate": 4.24831089647614e-06, + "loss": 0.7267, + "step": 2057 + }, + { + "epoch": 1.0208514335360557, + "grad_norm": 0.1498727514671048, + "learning_rate": 4.247612149000743e-06, + "loss": 0.7297, + "step": 2058 + }, + { + "epoch": 1.021347896239295, + "grad_norm": 0.14049047303313542, + "learning_rate": 4.24691313443085e-06, + "loss": 0.7304, + "step": 2059 + }, + { + "epoch": 1.0218443589425343, + "grad_norm": 0.13909453659414045, + "learning_rate": 4.2462138528732935e-06, + "loss": 0.7048, + "step": 2060 + }, + { + "epoch": 1.022340821645774, + "grad_norm": 0.13412019436547604, + "learning_rate": 4.245514304434948e-06, + "loss": 0.6914, + "step": 2061 + }, + { + "epoch": 1.0228372843490132, + "grad_norm": 0.1391340532915626, + "learning_rate": 4.244814489222728e-06, + "loss": 0.7017, + "step": 2062 + }, + { + "epoch": 1.0233337470522528, + "grad_norm": 0.14295513021033426, + "learning_rate": 4.244114407343589e-06, + "loss": 0.7457, + "step": 2063 + }, + { + "epoch": 1.023830209755492, + "grad_norm": 0.13900941541232267, + "learning_rate": 4.2434140589045286e-06, + "loss": 0.7401, + "step": 2064 + }, + { + "epoch": 1.0243266724587314, + "grad_norm": 0.13944892230939118, + "learning_rate": 4.242713444012583e-06, + "loss": 0.7874, + "step": 2065 + }, + { + "epoch": 1.024823135161971, + "grad_norm": 0.13593208488424774, + "learning_rate": 4.242012562774829e-06, + "loss": 0.7239, + "step": 2066 + }, + { + "epoch": 1.0253195978652103, + "grad_norm": 0.13931000983919048, + "learning_rate": 4.241311415298386e-06, + "loss": 0.7454, + "step": 2067 + }, + { + "epoch": 1.0258160605684499, + "grad_norm": 0.1330003858692133, + "learning_rate": 4.240610001690413e-06, + "loss": 0.734, + "step": 2068 + }, + { + "epoch": 1.0263125232716892, + "grad_norm": 0.14108224227175165, + "learning_rate": 4.239908322058109e-06, + "loss": 0.6962, + "step": 2069 + }, + { + "epoch": 1.0268089859749285, + "grad_norm": 0.1391861450047936, + "learning_rate": 4.239206376508716e-06, + "loss": 0.7008, + "step": 2070 + }, + { + "epoch": 1.027305448678168, + "grad_norm": 0.1370473901313178, + "learning_rate": 4.238504165149515e-06, + "loss": 0.7544, + "step": 2071 + }, + { + "epoch": 1.0278019113814074, + "grad_norm": 0.13536037419786456, + "learning_rate": 4.237801688087827e-06, + "loss": 0.7369, + "step": 2072 + }, + { + "epoch": 1.028298374084647, + "grad_norm": 0.14978518639680113, + "learning_rate": 4.237098945431014e-06, + "loss": 0.7679, + "step": 2073 + }, + { + "epoch": 1.0287948367878863, + "grad_norm": 0.13912767164252948, + "learning_rate": 4.236395937286479e-06, + "loss": 0.7166, + "step": 2074 + }, + { + "epoch": 1.0292912994911256, + "grad_norm": 0.1386818465495855, + "learning_rate": 4.2356926637616665e-06, + "loss": 0.7385, + "step": 2075 + }, + { + "epoch": 1.0297877621943652, + "grad_norm": 0.1411775085091878, + "learning_rate": 4.234989124964061e-06, + "loss": 0.7256, + "step": 2076 + }, + { + "epoch": 1.0302842248976045, + "grad_norm": 0.14419739467783685, + "learning_rate": 4.234285321001185e-06, + "loss": 0.7371, + "step": 2077 + }, + { + "epoch": 1.030780687600844, + "grad_norm": 0.13803664666766124, + "learning_rate": 4.233581251980604e-06, + "loss": 0.7746, + "step": 2078 + }, + { + "epoch": 1.0312771503040834, + "grad_norm": 0.14388287736016128, + "learning_rate": 4.2328769180099265e-06, + "loss": 0.7547, + "step": 2079 + }, + { + "epoch": 1.0317736130073227, + "grad_norm": 0.14306598998666248, + "learning_rate": 4.232172319196795e-06, + "loss": 0.7117, + "step": 2080 + }, + { + "epoch": 1.0322700757105623, + "grad_norm": 0.14932389713396255, + "learning_rate": 4.231467455648899e-06, + "loss": 0.7507, + "step": 2081 + }, + { + "epoch": 1.0327665384138016, + "grad_norm": 0.14803637284334528, + "learning_rate": 4.230762327473964e-06, + "loss": 0.7168, + "step": 2082 + }, + { + "epoch": 1.0332630011170412, + "grad_norm": 0.14111484516048733, + "learning_rate": 4.2300569347797584e-06, + "loss": 0.73, + "step": 2083 + }, + { + "epoch": 1.0337594638202805, + "grad_norm": 0.13730617408120754, + "learning_rate": 4.229351277674088e-06, + "loss": 0.7295, + "step": 2084 + }, + { + "epoch": 1.0342559265235198, + "grad_norm": 0.14834889202516877, + "learning_rate": 4.228645356264805e-06, + "loss": 0.8029, + "step": 2085 + }, + { + "epoch": 1.0347523892267594, + "grad_norm": 0.14599972266177927, + "learning_rate": 4.227939170659795e-06, + "loss": 0.7629, + "step": 2086 + }, + { + "epoch": 1.0352488519299987, + "grad_norm": 0.14092123214072694, + "learning_rate": 4.227232720966988e-06, + "loss": 0.7107, + "step": 2087 + }, + { + "epoch": 1.0357453146332383, + "grad_norm": 0.14122940427509462, + "learning_rate": 4.226526007294353e-06, + "loss": 0.7213, + "step": 2088 + }, + { + "epoch": 1.0362417773364776, + "grad_norm": 0.13842815838317993, + "learning_rate": 4.225819029749902e-06, + "loss": 0.7574, + "step": 2089 + }, + { + "epoch": 1.036738240039717, + "grad_norm": 0.13969707819774496, + "learning_rate": 4.225111788441682e-06, + "loss": 0.7901, + "step": 2090 + }, + { + "epoch": 1.0372347027429565, + "grad_norm": 0.14454482563554347, + "learning_rate": 4.224404283477788e-06, + "loss": 0.7426, + "step": 2091 + }, + { + "epoch": 1.0377311654461958, + "grad_norm": 0.1589968114041004, + "learning_rate": 4.223696514966346e-06, + "loss": 0.757, + "step": 2092 + }, + { + "epoch": 1.0382276281494354, + "grad_norm": 0.14630132012052294, + "learning_rate": 4.2229884830155285e-06, + "loss": 0.7052, + "step": 2093 + }, + { + "epoch": 1.0387240908526747, + "grad_norm": 0.13974708317715842, + "learning_rate": 4.222280187733549e-06, + "loss": 0.7687, + "step": 2094 + }, + { + "epoch": 1.039220553555914, + "grad_norm": 0.1383464362903483, + "learning_rate": 4.2215716292286555e-06, + "loss": 0.7048, + "step": 2095 + }, + { + "epoch": 1.0397170162591536, + "grad_norm": 0.14009397553084701, + "learning_rate": 4.220862807609144e-06, + "loss": 0.6834, + "step": 2096 + }, + { + "epoch": 1.040213478962393, + "grad_norm": 0.14079336307045506, + "learning_rate": 4.220153722983342e-06, + "loss": 0.7392, + "step": 2097 + }, + { + "epoch": 1.0407099416656325, + "grad_norm": 0.13787553354880905, + "learning_rate": 4.219444375459626e-06, + "loss": 0.7496, + "step": 2098 + }, + { + "epoch": 1.0412064043688718, + "grad_norm": 0.1400054541103246, + "learning_rate": 4.218734765146405e-06, + "loss": 0.7155, + "step": 2099 + }, + { + "epoch": 1.0417028670721111, + "grad_norm": 0.13722004560528184, + "learning_rate": 4.218024892152134e-06, + "loss": 0.736, + "step": 2100 + }, + { + "epoch": 1.0421993297753507, + "grad_norm": 0.13237548806271895, + "learning_rate": 4.217314756585305e-06, + "loss": 0.6985, + "step": 2101 + }, + { + "epoch": 1.04269579247859, + "grad_norm": 0.13814683970889546, + "learning_rate": 4.2166043585544495e-06, + "loss": 0.742, + "step": 2102 + }, + { + "epoch": 1.0431922551818296, + "grad_norm": 0.1519709587964635, + "learning_rate": 4.2158936981681415e-06, + "loss": 0.7286, + "step": 2103 + }, + { + "epoch": 1.043688717885069, + "grad_norm": 0.13343117788795786, + "learning_rate": 4.215182775534994e-06, + "loss": 0.7769, + "step": 2104 + }, + { + "epoch": 1.0441851805883082, + "grad_norm": 0.14132688675642163, + "learning_rate": 4.21447159076366e-06, + "loss": 0.7645, + "step": 2105 + }, + { + "epoch": 1.0446816432915478, + "grad_norm": 0.142210221906363, + "learning_rate": 4.213760143962834e-06, + "loss": 0.7481, + "step": 2106 + }, + { + "epoch": 1.045178105994787, + "grad_norm": 0.1374760586531246, + "learning_rate": 4.2130484352412475e-06, + "loss": 0.6909, + "step": 2107 + }, + { + "epoch": 1.0456745686980267, + "grad_norm": 0.1309015889916558, + "learning_rate": 4.212336464707674e-06, + "loss": 0.7001, + "step": 2108 + }, + { + "epoch": 1.046171031401266, + "grad_norm": 0.13676033591829806, + "learning_rate": 4.211624232470927e-06, + "loss": 0.7778, + "step": 2109 + }, + { + "epoch": 1.0466674941045053, + "grad_norm": 0.13760414687569777, + "learning_rate": 4.2109117386398595e-06, + "loss": 0.7199, + "step": 2110 + }, + { + "epoch": 1.0471639568077449, + "grad_norm": 0.14075911332828087, + "learning_rate": 4.210198983323366e-06, + "loss": 0.7741, + "step": 2111 + }, + { + "epoch": 1.0476604195109842, + "grad_norm": 0.13356672723592797, + "learning_rate": 4.209485966630377e-06, + "loss": 0.7253, + "step": 2112 + }, + { + "epoch": 1.0481568822142238, + "grad_norm": 0.14150649799028853, + "learning_rate": 4.2087726886698695e-06, + "loss": 0.7779, + "step": 2113 + }, + { + "epoch": 1.048653344917463, + "grad_norm": 0.13823807245765327, + "learning_rate": 4.208059149550855e-06, + "loss": 0.741, + "step": 2114 + }, + { + "epoch": 1.0491498076207024, + "grad_norm": 0.14366242195045892, + "learning_rate": 4.207345349382385e-06, + "loss": 0.7521, + "step": 2115 + }, + { + "epoch": 1.049646270323942, + "grad_norm": 0.13980640222600257, + "learning_rate": 4.206631288273554e-06, + "loss": 0.7132, + "step": 2116 + }, + { + "epoch": 1.0501427330271813, + "grad_norm": 0.14845761168836935, + "learning_rate": 4.205916966333494e-06, + "loss": 0.7072, + "step": 2117 + }, + { + "epoch": 1.0506391957304206, + "grad_norm": 0.14139479483348766, + "learning_rate": 4.2052023836713785e-06, + "loss": 0.7411, + "step": 2118 + }, + { + "epoch": 1.0511356584336602, + "grad_norm": 0.14059968450149515, + "learning_rate": 4.204487540396419e-06, + "loss": 0.691, + "step": 2119 + }, + { + "epoch": 1.0516321211368995, + "grad_norm": 0.1362156286798585, + "learning_rate": 4.203772436617868e-06, + "loss": 0.7444, + "step": 2120 + }, + { + "epoch": 1.052128583840139, + "grad_norm": 0.13831235259558317, + "learning_rate": 4.203057072445019e-06, + "loss": 0.7317, + "step": 2121 + }, + { + "epoch": 1.0526250465433784, + "grad_norm": 0.14207834501053565, + "learning_rate": 4.202341447987202e-06, + "loss": 0.7359, + "step": 2122 + }, + { + "epoch": 1.0531215092466177, + "grad_norm": 0.13778286488933011, + "learning_rate": 4.201625563353791e-06, + "loss": 0.7756, + "step": 2123 + }, + { + "epoch": 1.0536179719498573, + "grad_norm": 0.13980048084090257, + "learning_rate": 4.200909418654194e-06, + "loss": 0.7331, + "step": 2124 + }, + { + "epoch": 1.0541144346530966, + "grad_norm": 0.13997426427628695, + "learning_rate": 4.200193013997866e-06, + "loss": 0.6935, + "step": 2125 + }, + { + "epoch": 1.0546108973563362, + "grad_norm": 0.1459212095265043, + "learning_rate": 4.199476349494296e-06, + "loss": 0.7204, + "step": 2126 + }, + { + "epoch": 1.0551073600595755, + "grad_norm": 0.14329178565231715, + "learning_rate": 4.198759425253015e-06, + "loss": 0.7464, + "step": 2127 + }, + { + "epoch": 1.0556038227628148, + "grad_norm": 0.1422075716131861, + "learning_rate": 4.198042241383593e-06, + "loss": 0.7486, + "step": 2128 + }, + { + "epoch": 1.0561002854660544, + "grad_norm": 0.14092996648947848, + "learning_rate": 4.19732479799564e-06, + "loss": 0.7074, + "step": 2129 + }, + { + "epoch": 1.0565967481692937, + "grad_norm": 0.14086274951921257, + "learning_rate": 4.196607095198808e-06, + "loss": 0.7577, + "step": 2130 + }, + { + "epoch": 1.0570932108725333, + "grad_norm": 0.13776776723760562, + "learning_rate": 4.1958891331027826e-06, + "loss": 0.7324, + "step": 2131 + }, + { + "epoch": 1.0575896735757726, + "grad_norm": 0.13566064448922902, + "learning_rate": 4.1951709118172954e-06, + "loss": 0.7626, + "step": 2132 + }, + { + "epoch": 1.058086136279012, + "grad_norm": 0.13763795010324914, + "learning_rate": 4.194452431452115e-06, + "loss": 0.7537, + "step": 2133 + }, + { + "epoch": 1.0585825989822515, + "grad_norm": 0.1459065764808873, + "learning_rate": 4.193733692117048e-06, + "loss": 0.7355, + "step": 2134 + }, + { + "epoch": 1.0590790616854908, + "grad_norm": 0.13884201088984038, + "learning_rate": 4.193014693921944e-06, + "loss": 0.7535, + "step": 2135 + }, + { + "epoch": 1.0595755243887304, + "grad_norm": 0.14001522032769478, + "learning_rate": 4.192295436976688e-06, + "loss": 0.7447, + "step": 2136 + }, + { + "epoch": 1.0600719870919697, + "grad_norm": 0.15172505034245246, + "learning_rate": 4.19157592139121e-06, + "loss": 0.7522, + "step": 2137 + }, + { + "epoch": 1.060568449795209, + "grad_norm": 0.13794840697062816, + "learning_rate": 4.190856147275474e-06, + "loss": 0.6883, + "step": 2138 + }, + { + "epoch": 1.0610649124984486, + "grad_norm": 0.14217691316180742, + "learning_rate": 4.190136114739487e-06, + "loss": 0.727, + "step": 2139 + }, + { + "epoch": 1.061561375201688, + "grad_norm": 0.13583861770554806, + "learning_rate": 4.189415823893293e-06, + "loss": 0.704, + "step": 2140 + }, + { + "epoch": 1.0620578379049275, + "grad_norm": 0.1363645606259115, + "learning_rate": 4.188695274846979e-06, + "loss": 0.7381, + "step": 2141 + }, + { + "epoch": 1.0625543006081668, + "grad_norm": 0.14814722988133758, + "learning_rate": 4.1879744677106685e-06, + "loss": 0.7346, + "step": 2142 + }, + { + "epoch": 1.0630507633114061, + "grad_norm": 0.16059324736898764, + "learning_rate": 4.1872534025945255e-06, + "loss": 0.7899, + "step": 2143 + }, + { + "epoch": 1.0635472260146457, + "grad_norm": 0.14014168016673342, + "learning_rate": 4.186532079608753e-06, + "loss": 0.7599, + "step": 2144 + }, + { + "epoch": 1.064043688717885, + "grad_norm": 0.14674448113708913, + "learning_rate": 4.185810498863592e-06, + "loss": 0.7054, + "step": 2145 + }, + { + "epoch": 1.0645401514211246, + "grad_norm": 0.13552371087730036, + "learning_rate": 4.185088660469328e-06, + "loss": 0.7602, + "step": 2146 + }, + { + "epoch": 1.065036614124364, + "grad_norm": 0.1336555213075988, + "learning_rate": 4.18436656453628e-06, + "loss": 0.7583, + "step": 2147 + }, + { + "epoch": 1.0655330768276032, + "grad_norm": 0.13559730592832767, + "learning_rate": 4.1836442111748086e-06, + "loss": 0.7456, + "step": 2148 + }, + { + "epoch": 1.0660295395308428, + "grad_norm": 0.14992961750694342, + "learning_rate": 4.182921600495316e-06, + "loss": 0.7379, + "step": 2149 + }, + { + "epoch": 1.0665260022340821, + "grad_norm": 0.13935556972523674, + "learning_rate": 4.1821987326082396e-06, + "loss": 0.696, + "step": 2150 + }, + { + "epoch": 1.0670224649373217, + "grad_norm": 0.14083946307481723, + "learning_rate": 4.181475607624059e-06, + "loss": 0.7851, + "step": 2151 + }, + { + "epoch": 1.067518927640561, + "grad_norm": 0.1421578052366813, + "learning_rate": 4.1807522256532925e-06, + "loss": 0.7259, + "step": 2152 + }, + { + "epoch": 1.0680153903438003, + "grad_norm": 0.13548843386534212, + "learning_rate": 4.1800285868064964e-06, + "loss": 0.7304, + "step": 2153 + }, + { + "epoch": 1.0685118530470399, + "grad_norm": 0.13702431747439558, + "learning_rate": 4.179304691194269e-06, + "loss": 0.758, + "step": 2154 + }, + { + "epoch": 1.0690083157502792, + "grad_norm": 0.13318149795074, + "learning_rate": 4.178580538927245e-06, + "loss": 0.6844, + "step": 2155 + }, + { + "epoch": 1.0695047784535188, + "grad_norm": 0.1407081020046659, + "learning_rate": 4.177856130116099e-06, + "loss": 0.7639, + "step": 2156 + }, + { + "epoch": 1.070001241156758, + "grad_norm": 0.1438833118655734, + "learning_rate": 4.177131464871545e-06, + "loss": 0.7006, + "step": 2157 + }, + { + "epoch": 1.0704977038599974, + "grad_norm": 0.1461487574061599, + "learning_rate": 4.176406543304339e-06, + "loss": 0.738, + "step": 2158 + }, + { + "epoch": 1.070994166563237, + "grad_norm": 0.1398230912718051, + "learning_rate": 4.175681365525271e-06, + "loss": 0.6985, + "step": 2159 + }, + { + "epoch": 1.0714906292664763, + "grad_norm": 0.13817741460521907, + "learning_rate": 4.174955931645175e-06, + "loss": 0.7285, + "step": 2160 + }, + { + "epoch": 1.0719870919697159, + "grad_norm": 0.14224093463734003, + "learning_rate": 4.1742302417749205e-06, + "loss": 0.7586, + "step": 2161 + }, + { + "epoch": 1.0724835546729552, + "grad_norm": 0.13835662156431366, + "learning_rate": 4.173504296025417e-06, + "loss": 0.7539, + "step": 2162 + }, + { + "epoch": 1.0729800173761945, + "grad_norm": 0.13861535686149404, + "learning_rate": 4.1727780945076155e-06, + "loss": 0.654, + "step": 2163 + }, + { + "epoch": 1.073476480079434, + "grad_norm": 0.13428993721206128, + "learning_rate": 4.172051637332501e-06, + "loss": 0.7285, + "step": 2164 + }, + { + "epoch": 1.0739729427826734, + "grad_norm": 0.13538857395884993, + "learning_rate": 4.171324924611105e-06, + "loss": 0.7218, + "step": 2165 + }, + { + "epoch": 1.074469405485913, + "grad_norm": 0.1452550536947396, + "learning_rate": 4.170597956454492e-06, + "loss": 0.8168, + "step": 2166 + }, + { + "epoch": 1.0749658681891523, + "grad_norm": 0.13624518253158813, + "learning_rate": 4.169870732973767e-06, + "loss": 0.7141, + "step": 2167 + }, + { + "epoch": 1.0754623308923916, + "grad_norm": 0.14029339027722842, + "learning_rate": 4.169143254280074e-06, + "loss": 0.7285, + "step": 2168 + }, + { + "epoch": 1.0759587935956312, + "grad_norm": 0.14109579501178382, + "learning_rate": 4.168415520484598e-06, + "loss": 0.7896, + "step": 2169 + }, + { + "epoch": 1.0764552562988705, + "grad_norm": 0.144954703610336, + "learning_rate": 4.167687531698561e-06, + "loss": 0.7255, + "step": 2170 + }, + { + "epoch": 1.07695171900211, + "grad_norm": 0.13685269108041861, + "learning_rate": 4.166959288033223e-06, + "loss": 0.7257, + "step": 2171 + }, + { + "epoch": 1.0774481817053494, + "grad_norm": 0.14271875588892513, + "learning_rate": 4.166230789599886e-06, + "loss": 0.7159, + "step": 2172 + }, + { + "epoch": 1.0779446444085887, + "grad_norm": 0.13783479927505968, + "learning_rate": 4.16550203650989e-06, + "loss": 0.7129, + "step": 2173 + }, + { + "epoch": 1.0784411071118283, + "grad_norm": 0.14283091206105697, + "learning_rate": 4.16477302887461e-06, + "loss": 0.6965, + "step": 2174 + }, + { + "epoch": 1.0789375698150676, + "grad_norm": 0.14010606156996824, + "learning_rate": 4.1640437668054665e-06, + "loss": 0.7179, + "step": 2175 + }, + { + "epoch": 1.0794340325183072, + "grad_norm": 0.14730576733153716, + "learning_rate": 4.163314250413913e-06, + "loss": 0.7082, + "step": 2176 + }, + { + "epoch": 1.0799304952215465, + "grad_norm": 0.1335227779819202, + "learning_rate": 4.1625844798114464e-06, + "loss": 0.7183, + "step": 2177 + }, + { + "epoch": 1.0804269579247858, + "grad_norm": 0.13601772614825755, + "learning_rate": 4.1618544551096005e-06, + "loss": 0.6847, + "step": 2178 + }, + { + "epoch": 1.0809234206280254, + "grad_norm": 0.13698194445852674, + "learning_rate": 4.1611241764199465e-06, + "loss": 0.7435, + "step": 2179 + }, + { + "epoch": 1.0814198833312647, + "grad_norm": 0.14062893743840316, + "learning_rate": 4.160393643854095e-06, + "loss": 0.7363, + "step": 2180 + }, + { + "epoch": 1.0819163460345043, + "grad_norm": 0.13799273854449803, + "learning_rate": 4.159662857523697e-06, + "loss": 0.7243, + "step": 2181 + }, + { + "epoch": 1.0824128087377436, + "grad_norm": 0.13716660868793465, + "learning_rate": 4.158931817540443e-06, + "loss": 0.7577, + "step": 2182 + }, + { + "epoch": 1.082909271440983, + "grad_norm": 0.1466433823136944, + "learning_rate": 4.158200524016061e-06, + "loss": 0.7054, + "step": 2183 + }, + { + "epoch": 1.0834057341442225, + "grad_norm": 0.1382394309059659, + "learning_rate": 4.1574689770623145e-06, + "loss": 0.7566, + "step": 2184 + }, + { + "epoch": 1.0839021968474618, + "grad_norm": 0.13654874611387244, + "learning_rate": 4.15673717679101e-06, + "loss": 0.7251, + "step": 2185 + }, + { + "epoch": 1.0843986595507014, + "grad_norm": 0.1367869405798336, + "learning_rate": 4.156005123313993e-06, + "loss": 0.719, + "step": 2186 + }, + { + "epoch": 1.0848951222539407, + "grad_norm": 0.1317568061970169, + "learning_rate": 4.155272816743145e-06, + "loss": 0.6876, + "step": 2187 + }, + { + "epoch": 1.08539158495718, + "grad_norm": 0.13762978347128463, + "learning_rate": 4.1545402571903855e-06, + "loss": 0.7022, + "step": 2188 + }, + { + "epoch": 1.0858880476604196, + "grad_norm": 0.1362143217786678, + "learning_rate": 4.153807444767677e-06, + "loss": 0.7125, + "step": 2189 + }, + { + "epoch": 1.086384510363659, + "grad_norm": 0.1516046995027248, + "learning_rate": 4.153074379587018e-06, + "loss": 0.809, + "step": 2190 + }, + { + "epoch": 1.0868809730668985, + "grad_norm": 0.13562138385068911, + "learning_rate": 4.152341061760445e-06, + "loss": 0.7172, + "step": 2191 + }, + { + "epoch": 1.0873774357701378, + "grad_norm": 0.14008342460158815, + "learning_rate": 4.151607491400034e-06, + "loss": 0.7781, + "step": 2192 + }, + { + "epoch": 1.0878738984733771, + "grad_norm": 0.13999792205054443, + "learning_rate": 4.150873668617899e-06, + "loss": 0.6984, + "step": 2193 + }, + { + "epoch": 1.0883703611766167, + "grad_norm": 0.12918569307833325, + "learning_rate": 4.150139593526193e-06, + "loss": 0.707, + "step": 2194 + }, + { + "epoch": 1.088866823879856, + "grad_norm": 0.1373176298653365, + "learning_rate": 4.149405266237109e-06, + "loss": 0.7073, + "step": 2195 + }, + { + "epoch": 1.0893632865830956, + "grad_norm": 0.12770538449262095, + "learning_rate": 4.148670686862877e-06, + "loss": 0.6754, + "step": 2196 + }, + { + "epoch": 1.089859749286335, + "grad_norm": 0.14027683877629007, + "learning_rate": 4.147935855515763e-06, + "loss": 0.7618, + "step": 2197 + }, + { + "epoch": 1.0903562119895742, + "grad_norm": 0.13474412755523313, + "learning_rate": 4.1472007723080774e-06, + "loss": 0.7486, + "step": 2198 + }, + { + "epoch": 1.0908526746928138, + "grad_norm": 0.13436431752038755, + "learning_rate": 4.146465437352164e-06, + "loss": 0.7237, + "step": 2199 + }, + { + "epoch": 1.091349137396053, + "grad_norm": 0.13682091282171036, + "learning_rate": 4.145729850760408e-06, + "loss": 0.7377, + "step": 2200 + }, + { + "epoch": 1.0918456000992927, + "grad_norm": 0.13924706240826093, + "learning_rate": 4.144994012645232e-06, + "loss": 0.7402, + "step": 2201 + }, + { + "epoch": 1.092342062802532, + "grad_norm": 0.14103770773255148, + "learning_rate": 4.1442579231190964e-06, + "loss": 0.69, + "step": 2202 + }, + { + "epoch": 1.0928385255057713, + "grad_norm": 0.1394654587729206, + "learning_rate": 4.143521582294501e-06, + "loss": 0.7393, + "step": 2203 + }, + { + "epoch": 1.0933349882090109, + "grad_norm": 0.13644960570918493, + "learning_rate": 4.1427849902839826e-06, + "loss": 0.7141, + "step": 2204 + }, + { + "epoch": 1.0938314509122502, + "grad_norm": 0.13692258907444782, + "learning_rate": 4.142048147200119e-06, + "loss": 0.7393, + "step": 2205 + }, + { + "epoch": 1.0943279136154895, + "grad_norm": 0.1373142702334565, + "learning_rate": 4.141311053155524e-06, + "loss": 0.755, + "step": 2206 + }, + { + "epoch": 1.094824376318729, + "grad_norm": 0.13763207554959367, + "learning_rate": 4.140573708262852e-06, + "loss": 0.7546, + "step": 2207 + }, + { + "epoch": 1.0953208390219684, + "grad_norm": 0.14478103422831126, + "learning_rate": 4.139836112634792e-06, + "loss": 0.7679, + "step": 2208 + }, + { + "epoch": 1.095817301725208, + "grad_norm": 0.1390259770870048, + "learning_rate": 4.139098266384076e-06, + "loss": 0.743, + "step": 2209 + }, + { + "epoch": 1.0963137644284473, + "grad_norm": 0.14390923525858076, + "learning_rate": 4.138360169623471e-06, + "loss": 0.7453, + "step": 2210 + }, + { + "epoch": 1.0968102271316866, + "grad_norm": 0.14517278952992427, + "learning_rate": 4.137621822465782e-06, + "loss": 0.7426, + "step": 2211 + }, + { + "epoch": 1.0973066898349262, + "grad_norm": 0.15525845893147422, + "learning_rate": 4.1368832250238564e-06, + "loss": 0.7313, + "step": 2212 + }, + { + "epoch": 1.0978031525381655, + "grad_norm": 0.14066801820933797, + "learning_rate": 4.136144377410574e-06, + "loss": 0.7877, + "step": 2213 + }, + { + "epoch": 1.098299615241405, + "grad_norm": 0.1424752647162409, + "learning_rate": 4.135405279738858e-06, + "loss": 0.7418, + "step": 2214 + }, + { + "epoch": 1.0987960779446444, + "grad_norm": 0.15190627011179594, + "learning_rate": 4.134665932121665e-06, + "loss": 0.7654, + "step": 2215 + }, + { + "epoch": 1.0992925406478837, + "grad_norm": 0.1412407950996624, + "learning_rate": 4.133926334671996e-06, + "loss": 0.6707, + "step": 2216 + }, + { + "epoch": 1.0997890033511233, + "grad_norm": 0.14456861330667964, + "learning_rate": 4.133186487502884e-06, + "loss": 0.7841, + "step": 2217 + }, + { + "epoch": 1.1002854660543626, + "grad_norm": 0.15407876229352044, + "learning_rate": 4.1324463907274035e-06, + "loss": 0.7367, + "step": 2218 + }, + { + "epoch": 1.1007819287576022, + "grad_norm": 0.14216692584554558, + "learning_rate": 4.131706044458667e-06, + "loss": 0.7149, + "step": 2219 + }, + { + "epoch": 1.1012783914608415, + "grad_norm": 0.1324859332684942, + "learning_rate": 4.130965448809824e-06, + "loss": 0.6563, + "step": 2220 + }, + { + "epoch": 1.1017748541640808, + "grad_norm": 0.13983336523306109, + "learning_rate": 4.130224603894062e-06, + "loss": 0.7078, + "step": 2221 + }, + { + "epoch": 1.1022713168673204, + "grad_norm": 0.14405406563289364, + "learning_rate": 4.129483509824608e-06, + "loss": 0.7529, + "step": 2222 + }, + { + "epoch": 1.1027677795705597, + "grad_norm": 0.14158523055382277, + "learning_rate": 4.128742166714726e-06, + "loss": 0.7113, + "step": 2223 + }, + { + "epoch": 1.1032642422737993, + "grad_norm": 0.13671925424224868, + "learning_rate": 4.128000574677719e-06, + "loss": 0.746, + "step": 2224 + }, + { + "epoch": 1.1037607049770386, + "grad_norm": 0.13505116983586582, + "learning_rate": 4.127258733826929e-06, + "loss": 0.7181, + "step": 2225 + }, + { + "epoch": 1.104257167680278, + "grad_norm": 0.1374278474736183, + "learning_rate": 4.126516644275731e-06, + "loss": 0.7529, + "step": 2226 + }, + { + "epoch": 1.1047536303835175, + "grad_norm": 0.14165368952037294, + "learning_rate": 4.125774306137543e-06, + "loss": 0.7476, + "step": 2227 + }, + { + "epoch": 1.1052500930867568, + "grad_norm": 0.15143713359026406, + "learning_rate": 4.12503171952582e-06, + "loss": 0.7326, + "step": 2228 + }, + { + "epoch": 1.1057465557899964, + "grad_norm": 0.137643913086052, + "learning_rate": 4.124288884554053e-06, + "loss": 0.7351, + "step": 2229 + }, + { + "epoch": 1.1062430184932357, + "grad_norm": 0.13550077923391268, + "learning_rate": 4.123545801335776e-06, + "loss": 0.7181, + "step": 2230 + }, + { + "epoch": 1.106739481196475, + "grad_norm": 0.13907847760418357, + "learning_rate": 4.122802469984552e-06, + "loss": 0.7113, + "step": 2231 + }, + { + "epoch": 1.1072359438997146, + "grad_norm": 0.1409638418684744, + "learning_rate": 4.122058890613991e-06, + "loss": 0.7532, + "step": 2232 + }, + { + "epoch": 1.107732406602954, + "grad_norm": 0.13839521717196338, + "learning_rate": 4.121315063337737e-06, + "loss": 0.7593, + "step": 2233 + }, + { + "epoch": 1.1082288693061935, + "grad_norm": 0.14464848496201313, + "learning_rate": 4.120570988269472e-06, + "loss": 0.798, + "step": 2234 + }, + { + "epoch": 1.1087253320094328, + "grad_norm": 0.14170615709306067, + "learning_rate": 4.119826665522914e-06, + "loss": 0.7214, + "step": 2235 + }, + { + "epoch": 1.1092217947126721, + "grad_norm": 0.13750992725040465, + "learning_rate": 4.119082095211823e-06, + "loss": 0.7025, + "step": 2236 + }, + { + "epoch": 1.1097182574159117, + "grad_norm": 0.13524887733070357, + "learning_rate": 4.118337277449993e-06, + "loss": 0.7299, + "step": 2237 + }, + { + "epoch": 1.110214720119151, + "grad_norm": 0.13742212811395596, + "learning_rate": 4.117592212351258e-06, + "loss": 0.7152, + "step": 2238 + }, + { + "epoch": 1.1107111828223906, + "grad_norm": 0.13549859577744783, + "learning_rate": 4.11684690002949e-06, + "loss": 0.7939, + "step": 2239 + }, + { + "epoch": 1.11120764552563, + "grad_norm": 0.13072436304815396, + "learning_rate": 4.116101340598597e-06, + "loss": 0.6907, + "step": 2240 + }, + { + "epoch": 1.1117041082288692, + "grad_norm": 0.13708530876777447, + "learning_rate": 4.115355534172527e-06, + "loss": 0.7668, + "step": 2241 + }, + { + "epoch": 1.1122005709321088, + "grad_norm": 0.1331260842138928, + "learning_rate": 4.114609480865264e-06, + "loss": 0.7315, + "step": 2242 + }, + { + "epoch": 1.112697033635348, + "grad_norm": 0.1345957058229882, + "learning_rate": 4.113863180790829e-06, + "loss": 0.7249, + "step": 2243 + }, + { + "epoch": 1.1131934963385877, + "grad_norm": 0.13896142397489988, + "learning_rate": 4.113116634063285e-06, + "loss": 0.7174, + "step": 2244 + }, + { + "epoch": 1.113689959041827, + "grad_norm": 0.13616762035864557, + "learning_rate": 4.1123698407967265e-06, + "loss": 0.7206, + "step": 2245 + }, + { + "epoch": 1.1141864217450663, + "grad_norm": 0.1428487363329108, + "learning_rate": 4.11162280110529e-06, + "loss": 0.7447, + "step": 2246 + }, + { + "epoch": 1.1146828844483059, + "grad_norm": 0.1381247261812151, + "learning_rate": 4.110875515103148e-06, + "loss": 0.7262, + "step": 2247 + }, + { + "epoch": 1.1151793471515452, + "grad_norm": 0.13985941699766852, + "learning_rate": 4.110127982904513e-06, + "loss": 0.737, + "step": 2248 + }, + { + "epoch": 1.1156758098547848, + "grad_norm": 0.1371407690195463, + "learning_rate": 4.109380204623634e-06, + "loss": 0.7367, + "step": 2249 + }, + { + "epoch": 1.116172272558024, + "grad_norm": 0.1410923264807246, + "learning_rate": 4.108632180374794e-06, + "loss": 0.685, + "step": 2250 + }, + { + "epoch": 1.1166687352612634, + "grad_norm": 0.13607561024834763, + "learning_rate": 4.107883910272316e-06, + "loss": 0.7416, + "step": 2251 + }, + { + "epoch": 1.117165197964503, + "grad_norm": 0.14401553966823152, + "learning_rate": 4.107135394430565e-06, + "loss": 0.7741, + "step": 2252 + }, + { + "epoch": 1.1176616606677423, + "grad_norm": 0.1347979188420261, + "learning_rate": 4.106386632963935e-06, + "loss": 0.7289, + "step": 2253 + }, + { + "epoch": 1.1181581233709816, + "grad_norm": 0.13563125984475435, + "learning_rate": 4.105637625986867e-06, + "loss": 0.7241, + "step": 2254 + }, + { + "epoch": 1.1186545860742212, + "grad_norm": 0.13554783899608014, + "learning_rate": 4.104888373613832e-06, + "loss": 0.7051, + "step": 2255 + }, + { + "epoch": 1.1191510487774605, + "grad_norm": 0.13902713235820974, + "learning_rate": 4.10413887595934e-06, + "loss": 0.7442, + "step": 2256 + }, + { + "epoch": 1.1196475114807, + "grad_norm": 0.1416398097741949, + "learning_rate": 4.1033891331379425e-06, + "loss": 0.7234, + "step": 2257 + }, + { + "epoch": 1.1201439741839394, + "grad_norm": 0.13151231872027344, + "learning_rate": 4.102639145264223e-06, + "loss": 0.6901, + "step": 2258 + }, + { + "epoch": 1.1206404368871787, + "grad_norm": 0.1310560747375517, + "learning_rate": 4.101888912452809e-06, + "loss": 0.7034, + "step": 2259 + }, + { + "epoch": 1.1211368995904183, + "grad_norm": 0.13368674747757894, + "learning_rate": 4.1011384348183565e-06, + "loss": 0.7369, + "step": 2260 + }, + { + "epoch": 1.1216333622936576, + "grad_norm": 0.14030562550210587, + "learning_rate": 4.100387712475568e-06, + "loss": 0.7473, + "step": 2261 + }, + { + "epoch": 1.1221298249968972, + "grad_norm": 0.13783403570651975, + "learning_rate": 4.0996367455391774e-06, + "loss": 0.6827, + "step": 2262 + }, + { + "epoch": 1.1226262877001365, + "grad_norm": 0.13794119866048046, + "learning_rate": 4.098885534123958e-06, + "loss": 0.7144, + "step": 2263 + }, + { + "epoch": 1.1231227504033758, + "grad_norm": 0.1403803932613888, + "learning_rate": 4.098134078344722e-06, + "loss": 0.7752, + "step": 2264 + }, + { + "epoch": 1.1236192131066154, + "grad_norm": 0.1393102973681631, + "learning_rate": 4.097382378316316e-06, + "loss": 0.7241, + "step": 2265 + }, + { + "epoch": 1.1241156758098547, + "grad_norm": 0.13730972644736142, + "learning_rate": 4.0966304341536255e-06, + "loss": 0.7399, + "step": 2266 + }, + { + "epoch": 1.1246121385130943, + "grad_norm": 0.14795637490908276, + "learning_rate": 4.095878245971573e-06, + "loss": 0.7947, + "step": 2267 + }, + { + "epoch": 1.1251086012163336, + "grad_norm": 0.13800836804454775, + "learning_rate": 4.095125813885118e-06, + "loss": 0.7454, + "step": 2268 + }, + { + "epoch": 1.125605063919573, + "grad_norm": 0.13510837243563636, + "learning_rate": 4.094373138009259e-06, + "loss": 0.7126, + "step": 2269 + }, + { + "epoch": 1.1261015266228125, + "grad_norm": 0.14183611387295958, + "learning_rate": 4.093620218459029e-06, + "loss": 0.7305, + "step": 2270 + }, + { + "epoch": 1.1265979893260518, + "grad_norm": 0.14547694629965133, + "learning_rate": 4.092867055349501e-06, + "loss": 0.7342, + "step": 2271 + }, + { + "epoch": 1.1270944520292914, + "grad_norm": 0.13272000346114335, + "learning_rate": 4.092113648795784e-06, + "loss": 0.7244, + "step": 2272 + }, + { + "epoch": 1.1275909147325307, + "grad_norm": 0.13445748304111405, + "learning_rate": 4.091359998913021e-06, + "loss": 0.7135, + "step": 2273 + }, + { + "epoch": 1.12808737743577, + "grad_norm": 0.13823114531273367, + "learning_rate": 4.0906061058164e-06, + "loss": 0.7296, + "step": 2274 + }, + { + "epoch": 1.1285838401390096, + "grad_norm": 0.1331655131810508, + "learning_rate": 4.089851969621138e-06, + "loss": 0.7039, + "step": 2275 + }, + { + "epoch": 1.129080302842249, + "grad_norm": 0.13502020601948467, + "learning_rate": 4.0890975904424946e-06, + "loss": 0.7227, + "step": 2276 + }, + { + "epoch": 1.1295767655454885, + "grad_norm": 0.13554787632722626, + "learning_rate": 4.088342968395763e-06, + "loss": 0.7166, + "step": 2277 + }, + { + "epoch": 1.1300732282487278, + "grad_norm": 0.15351257577564006, + "learning_rate": 4.087588103596276e-06, + "loss": 0.7515, + "step": 2278 + }, + { + "epoch": 1.1305696909519671, + "grad_norm": 0.13285248996749063, + "learning_rate": 4.0868329961594025e-06, + "loss": 0.7398, + "step": 2279 + }, + { + "epoch": 1.1310661536552067, + "grad_norm": 0.14474300166445284, + "learning_rate": 4.086077646200548e-06, + "loss": 0.765, + "step": 2280 + }, + { + "epoch": 1.131562616358446, + "grad_norm": 0.136914338868158, + "learning_rate": 4.085322053835157e-06, + "loss": 0.7387, + "step": 2281 + }, + { + "epoch": 1.1320590790616856, + "grad_norm": 0.13701247994557483, + "learning_rate": 4.084566219178708e-06, + "loss": 0.7484, + "step": 2282 + }, + { + "epoch": 1.132555541764925, + "grad_norm": 0.14213309578391173, + "learning_rate": 4.08381014234672e-06, + "loss": 0.734, + "step": 2283 + }, + { + "epoch": 1.1330520044681642, + "grad_norm": 0.1413403485403749, + "learning_rate": 4.0830538234547455e-06, + "loss": 0.739, + "step": 2284 + }, + { + "epoch": 1.1335484671714038, + "grad_norm": 0.14178436767190955, + "learning_rate": 4.082297262618376e-06, + "loss": 0.7338, + "step": 2285 + }, + { + "epoch": 1.1340449298746431, + "grad_norm": 0.1384665628085734, + "learning_rate": 4.081540459953241e-06, + "loss": 0.7293, + "step": 2286 + }, + { + "epoch": 1.1345413925778827, + "grad_norm": 0.14117953096422015, + "learning_rate": 4.080783415575004e-06, + "loss": 0.7241, + "step": 2287 + }, + { + "epoch": 1.135037855281122, + "grad_norm": 0.13841249272824596, + "learning_rate": 4.080026129599368e-06, + "loss": 0.7129, + "step": 2288 + }, + { + "epoch": 1.1355343179843613, + "grad_norm": 0.1389416393419498, + "learning_rate": 4.079268602142072e-06, + "loss": 0.7196, + "step": 2289 + }, + { + "epoch": 1.1360307806876009, + "grad_norm": 0.13525972832589087, + "learning_rate": 4.078510833318892e-06, + "loss": 0.6799, + "step": 2290 + }, + { + "epoch": 1.1365272433908402, + "grad_norm": 0.14116131185065883, + "learning_rate": 4.077752823245641e-06, + "loss": 0.7207, + "step": 2291 + }, + { + "epoch": 1.1370237060940798, + "grad_norm": 0.13868074387182236, + "learning_rate": 4.076994572038168e-06, + "loss": 0.7414, + "step": 2292 + }, + { + "epoch": 1.137520168797319, + "grad_norm": 0.13837495741874617, + "learning_rate": 4.076236079812359e-06, + "loss": 0.7518, + "step": 2293 + }, + { + "epoch": 1.1380166315005584, + "grad_norm": 0.13909330761790814, + "learning_rate": 4.075477346684139e-06, + "loss": 0.7353, + "step": 2294 + }, + { + "epoch": 1.138513094203798, + "grad_norm": 0.15340107106559084, + "learning_rate": 4.074718372769467e-06, + "loss": 0.7625, + "step": 2295 + }, + { + "epoch": 1.1390095569070373, + "grad_norm": 0.14086941509084788, + "learning_rate": 4.07395915818434e-06, + "loss": 0.7586, + "step": 2296 + }, + { + "epoch": 1.1395060196102769, + "grad_norm": 0.13446534552526093, + "learning_rate": 4.073199703044793e-06, + "loss": 0.7621, + "step": 2297 + }, + { + "epoch": 1.1400024823135162, + "grad_norm": 0.1373097144881244, + "learning_rate": 4.072440007466896e-06, + "loss": 0.6873, + "step": 2298 + }, + { + "epoch": 1.1404989450167555, + "grad_norm": 0.13928210663269355, + "learning_rate": 4.071680071566756e-06, + "loss": 0.7514, + "step": 2299 + }, + { + "epoch": 1.140995407719995, + "grad_norm": 0.1421392811034866, + "learning_rate": 4.070919895460517e-06, + "loss": 0.7295, + "step": 2300 + }, + { + "epoch": 1.1414918704232344, + "grad_norm": 0.13241332535161587, + "learning_rate": 4.070159479264359e-06, + "loss": 0.7118, + "step": 2301 + }, + { + "epoch": 1.141988333126474, + "grad_norm": 0.1302518945245061, + "learning_rate": 4.0693988230945e-06, + "loss": 0.7399, + "step": 2302 + }, + { + "epoch": 1.1424847958297133, + "grad_norm": 0.14324912590708974, + "learning_rate": 4.0686379270671955e-06, + "loss": 0.768, + "step": 2303 + }, + { + "epoch": 1.1429812585329526, + "grad_norm": 0.14029305482378046, + "learning_rate": 4.067876791298734e-06, + "loss": 0.7787, + "step": 2304 + }, + { + "epoch": 1.1434777212361922, + "grad_norm": 0.15155995127547886, + "learning_rate": 4.067115415905445e-06, + "loss": 0.7777, + "step": 2305 + }, + { + "epoch": 1.1439741839394315, + "grad_norm": 0.13840654029220392, + "learning_rate": 4.066353801003691e-06, + "loss": 0.7449, + "step": 2306 + }, + { + "epoch": 1.144470646642671, + "grad_norm": 0.1379028297317787, + "learning_rate": 4.065591946709873e-06, + "loss": 0.7468, + "step": 2307 + }, + { + "epoch": 1.1449671093459104, + "grad_norm": 0.13315214722190571, + "learning_rate": 4.064829853140428e-06, + "loss": 0.7142, + "step": 2308 + }, + { + "epoch": 1.1454635720491497, + "grad_norm": 0.13949231615359325, + "learning_rate": 4.0640675204118305e-06, + "loss": 0.6793, + "step": 2309 + }, + { + "epoch": 1.1459600347523893, + "grad_norm": 0.14382388332090082, + "learning_rate": 4.06330494864059e-06, + "loss": 0.7437, + "step": 2310 + }, + { + "epoch": 1.1464564974556286, + "grad_norm": 0.14590984361270626, + "learning_rate": 4.062542137943254e-06, + "loss": 0.8103, + "step": 2311 + }, + { + "epoch": 1.1469529601588682, + "grad_norm": 0.1609583000360408, + "learning_rate": 4.061779088436406e-06, + "loss": 0.7311, + "step": 2312 + }, + { + "epoch": 1.1474494228621075, + "grad_norm": 0.13805487885883164, + "learning_rate": 4.061015800236665e-06, + "loss": 0.7186, + "step": 2313 + }, + { + "epoch": 1.1479458855653468, + "grad_norm": 0.13893797112646772, + "learning_rate": 4.060252273460688e-06, + "loss": 0.7381, + "step": 2314 + }, + { + "epoch": 1.1484423482685864, + "grad_norm": 0.13923921175817758, + "learning_rate": 4.0594885082251685e-06, + "loss": 0.7233, + "step": 2315 + }, + { + "epoch": 1.1489388109718257, + "grad_norm": 0.13902288626720571, + "learning_rate": 4.058724504646834e-06, + "loss": 0.7134, + "step": 2316 + }, + { + "epoch": 1.1494352736750653, + "grad_norm": 0.13827739981540818, + "learning_rate": 4.057960262842452e-06, + "loss": 0.7263, + "step": 2317 + }, + { + "epoch": 1.1499317363783046, + "grad_norm": 0.14118983269415886, + "learning_rate": 4.057195782928823e-06, + "loss": 0.7381, + "step": 2318 + }, + { + "epoch": 1.150428199081544, + "grad_norm": 0.1322019940248954, + "learning_rate": 4.056431065022787e-06, + "loss": 0.7327, + "step": 2319 + }, + { + "epoch": 1.1509246617847835, + "grad_norm": 0.1313464348071736, + "learning_rate": 4.055666109241218e-06, + "loss": 0.6943, + "step": 2320 + }, + { + "epoch": 1.1514211244880228, + "grad_norm": 0.13797475553246952, + "learning_rate": 4.054900915701028e-06, + "loss": 0.7245, + "step": 2321 + }, + { + "epoch": 1.1519175871912624, + "grad_norm": 0.15720508745478762, + "learning_rate": 4.054135484519163e-06, + "loss": 0.6975, + "step": 2322 + }, + { + "epoch": 1.1524140498945017, + "grad_norm": 0.14150123947563276, + "learning_rate": 4.053369815812608e-06, + "loss": 0.7432, + "step": 2323 + }, + { + "epoch": 1.152910512597741, + "grad_norm": 0.13546580390502527, + "learning_rate": 4.052603909698383e-06, + "loss": 0.6945, + "step": 2324 + }, + { + "epoch": 1.1534069753009806, + "grad_norm": 0.13797133959426647, + "learning_rate": 4.051837766293545e-06, + "loss": 0.7659, + "step": 2325 + }, + { + "epoch": 1.15390343800422, + "grad_norm": 0.13678662933843044, + "learning_rate": 4.051071385715186e-06, + "loss": 0.7662, + "step": 2326 + }, + { + "epoch": 1.1543999007074595, + "grad_norm": 0.1456527753758712, + "learning_rate": 4.0503047680804354e-06, + "loss": 0.7451, + "step": 2327 + }, + { + "epoch": 1.1548963634106988, + "grad_norm": 0.13498395892098344, + "learning_rate": 4.0495379135064585e-06, + "loss": 0.7201, + "step": 2328 + }, + { + "epoch": 1.1553928261139381, + "grad_norm": 0.13613972964983925, + "learning_rate": 4.048770822110456e-06, + "loss": 0.692, + "step": 2329 + }, + { + "epoch": 1.1558892888171777, + "grad_norm": 0.13981046419867807, + "learning_rate": 4.048003494009666e-06, + "loss": 0.7372, + "step": 2330 + }, + { + "epoch": 1.156385751520417, + "grad_norm": 0.13473584929200108, + "learning_rate": 4.047235929321363e-06, + "loss": 0.7036, + "step": 2331 + }, + { + "epoch": 1.1568822142236566, + "grad_norm": 0.13805910028899931, + "learning_rate": 4.0464681281628545e-06, + "loss": 0.6842, + "step": 2332 + }, + { + "epoch": 1.1573786769268959, + "grad_norm": 0.139073362785338, + "learning_rate": 4.04570009065149e-06, + "loss": 0.7541, + "step": 2333 + }, + { + "epoch": 1.1578751396301352, + "grad_norm": 0.13908629112696577, + "learning_rate": 4.044931816904649e-06, + "loss": 0.6851, + "step": 2334 + }, + { + "epoch": 1.1583716023333748, + "grad_norm": 0.13330790586530183, + "learning_rate": 4.044163307039751e-06, + "loss": 0.7023, + "step": 2335 + }, + { + "epoch": 1.158868065036614, + "grad_norm": 0.1516948361883498, + "learning_rate": 4.043394561174252e-06, + "loss": 0.7247, + "step": 2336 + }, + { + "epoch": 1.1593645277398537, + "grad_norm": 0.13561857587166704, + "learning_rate": 4.042625579425639e-06, + "loss": 0.6929, + "step": 2337 + }, + { + "epoch": 1.159860990443093, + "grad_norm": 0.1381505731871968, + "learning_rate": 4.04185636191144e-06, + "loss": 0.7877, + "step": 2338 + }, + { + "epoch": 1.1603574531463323, + "grad_norm": 0.14465367702750925, + "learning_rate": 4.04108690874922e-06, + "loss": 0.7335, + "step": 2339 + }, + { + "epoch": 1.1608539158495719, + "grad_norm": 0.13856535592329788, + "learning_rate": 4.040317220056574e-06, + "loss": 0.7261, + "step": 2340 + }, + { + "epoch": 1.1613503785528112, + "grad_norm": 0.1359117640924008, + "learning_rate": 4.039547295951138e-06, + "loss": 0.7154, + "step": 2341 + }, + { + "epoch": 1.1618468412560508, + "grad_norm": 0.14408892510554583, + "learning_rate": 4.038777136550583e-06, + "loss": 0.7449, + "step": 2342 + }, + { + "epoch": 1.16234330395929, + "grad_norm": 0.1392596970150696, + "learning_rate": 4.038006741972614e-06, + "loss": 0.7677, + "step": 2343 + }, + { + "epoch": 1.1628397666625294, + "grad_norm": 0.13340381888036884, + "learning_rate": 4.037236112334976e-06, + "loss": 0.6938, + "step": 2344 + }, + { + "epoch": 1.163336229365769, + "grad_norm": 0.138524100141458, + "learning_rate": 4.036465247755444e-06, + "loss": 0.678, + "step": 2345 + }, + { + "epoch": 1.1638326920690083, + "grad_norm": 0.13602968738361657, + "learning_rate": 4.035694148351836e-06, + "loss": 0.7189, + "step": 2346 + }, + { + "epoch": 1.1643291547722479, + "grad_norm": 0.13592578354190615, + "learning_rate": 4.034922814242001e-06, + "loss": 0.6993, + "step": 2347 + }, + { + "epoch": 1.1648256174754872, + "grad_norm": 0.13795965217373635, + "learning_rate": 4.034151245543823e-06, + "loss": 0.7135, + "step": 2348 + }, + { + "epoch": 1.1653220801787265, + "grad_norm": 0.13946922199269032, + "learning_rate": 4.033379442375225e-06, + "loss": 0.7757, + "step": 2349 + }, + { + "epoch": 1.165818542881966, + "grad_norm": 0.14045917203188657, + "learning_rate": 4.032607404854166e-06, + "loss": 0.7284, + "step": 2350 + }, + { + "epoch": 1.1663150055852054, + "grad_norm": 0.1440756518140952, + "learning_rate": 4.031835133098639e-06, + "loss": 0.7247, + "step": 2351 + }, + { + "epoch": 1.166811468288445, + "grad_norm": 0.13724789791500103, + "learning_rate": 4.031062627226671e-06, + "loss": 0.6963, + "step": 2352 + }, + { + "epoch": 1.1673079309916843, + "grad_norm": 0.13553830052897844, + "learning_rate": 4.030289887356332e-06, + "loss": 0.6995, + "step": 2353 + }, + { + "epoch": 1.1678043936949236, + "grad_norm": 0.13522250185478063, + "learning_rate": 4.029516913605719e-06, + "loss": 0.7359, + "step": 2354 + }, + { + "epoch": 1.1683008563981632, + "grad_norm": 0.1386403051237985, + "learning_rate": 4.028743706092969e-06, + "loss": 0.7783, + "step": 2355 + }, + { + "epoch": 1.1687973191014025, + "grad_norm": 0.14135795928242792, + "learning_rate": 4.027970264936256e-06, + "loss": 0.7393, + "step": 2356 + }, + { + "epoch": 1.1692937818046418, + "grad_norm": 0.15298427203204537, + "learning_rate": 4.027196590253786e-06, + "loss": 0.7416, + "step": 2357 + }, + { + "epoch": 1.1697902445078814, + "grad_norm": 0.13356310495808743, + "learning_rate": 4.026422682163804e-06, + "loss": 0.7174, + "step": 2358 + }, + { + "epoch": 1.1702867072111207, + "grad_norm": 0.14258979851062023, + "learning_rate": 4.02564854078459e-06, + "loss": 0.7025, + "step": 2359 + }, + { + "epoch": 1.1707831699143603, + "grad_norm": 0.1394231832343289, + "learning_rate": 4.024874166234459e-06, + "loss": 0.7084, + "step": 2360 + }, + { + "epoch": 1.1712796326175996, + "grad_norm": 0.1311008710936396, + "learning_rate": 4.02409955863176e-06, + "loss": 0.7364, + "step": 2361 + }, + { + "epoch": 1.171776095320839, + "grad_norm": 0.13638386404046124, + "learning_rate": 4.023324718094881e-06, + "loss": 0.7396, + "step": 2362 + }, + { + "epoch": 1.1722725580240785, + "grad_norm": 0.14537907916628248, + "learning_rate": 4.022549644742244e-06, + "loss": 0.6914, + "step": 2363 + }, + { + "epoch": 1.1727690207273178, + "grad_norm": 0.14040298917938324, + "learning_rate": 4.0217743386923055e-06, + "loss": 0.7228, + "step": 2364 + }, + { + "epoch": 1.1732654834305574, + "grad_norm": 0.14446607932799382, + "learning_rate": 4.020998800063559e-06, + "loss": 0.7441, + "step": 2365 + }, + { + "epoch": 1.1737619461337967, + "grad_norm": 0.13665124704375234, + "learning_rate": 4.020223028974534e-06, + "loss": 0.7516, + "step": 2366 + }, + { + "epoch": 1.174258408837036, + "grad_norm": 0.13399912735468633, + "learning_rate": 4.019447025543793e-06, + "loss": 0.7183, + "step": 2367 + }, + { + "epoch": 1.1747548715402756, + "grad_norm": 0.1464865537624882, + "learning_rate": 4.018670789889938e-06, + "loss": 0.7937, + "step": 2368 + }, + { + "epoch": 1.175251334243515, + "grad_norm": 0.1420811329454974, + "learning_rate": 4.0178943221316014e-06, + "loss": 0.728, + "step": 2369 + }, + { + "epoch": 1.1757477969467545, + "grad_norm": 0.1432300356176437, + "learning_rate": 4.0171176223874555e-06, + "loss": 0.657, + "step": 2370 + }, + { + "epoch": 1.1762442596499938, + "grad_norm": 0.1432930911399873, + "learning_rate": 4.0163406907762074e-06, + "loss": 0.76, + "step": 2371 + }, + { + "epoch": 1.1767407223532331, + "grad_norm": 0.14260952105955038, + "learning_rate": 4.015563527416596e-06, + "loss": 0.75, + "step": 2372 + }, + { + "epoch": 1.1772371850564727, + "grad_norm": 0.14070002945171584, + "learning_rate": 4.0147861324273994e-06, + "loss": 0.7111, + "step": 2373 + }, + { + "epoch": 1.177733647759712, + "grad_norm": 0.13824362597359197, + "learning_rate": 4.014008505927431e-06, + "loss": 0.7146, + "step": 2374 + }, + { + "epoch": 1.1782301104629516, + "grad_norm": 0.13615039910533921, + "learning_rate": 4.0132306480355374e-06, + "loss": 0.6898, + "step": 2375 + }, + { + "epoch": 1.178726573166191, + "grad_norm": 0.13640593448421545, + "learning_rate": 4.012452558870602e-06, + "loss": 0.7966, + "step": 2376 + }, + { + "epoch": 1.1792230358694302, + "grad_norm": 0.13629910190444364, + "learning_rate": 4.011674238551544e-06, + "loss": 0.751, + "step": 2377 + }, + { + "epoch": 1.1797194985726698, + "grad_norm": 0.13855859156200617, + "learning_rate": 4.010895687197316e-06, + "loss": 0.7481, + "step": 2378 + }, + { + "epoch": 1.180215961275909, + "grad_norm": 0.14195565031842594, + "learning_rate": 4.010116904926908e-06, + "loss": 0.7214, + "step": 2379 + }, + { + "epoch": 1.1807124239791487, + "grad_norm": 0.14168331391237854, + "learning_rate": 4.009337891859343e-06, + "loss": 0.7386, + "step": 2380 + }, + { + "epoch": 1.181208886682388, + "grad_norm": 0.13835896324890185, + "learning_rate": 4.008558648113682e-06, + "loss": 0.7219, + "step": 2381 + }, + { + "epoch": 1.1817053493856273, + "grad_norm": 0.14190577391365433, + "learning_rate": 4.0077791738090195e-06, + "loss": 0.7628, + "step": 2382 + }, + { + "epoch": 1.1822018120888669, + "grad_norm": 0.1341911874683844, + "learning_rate": 4.006999469064487e-06, + "loss": 0.6969, + "step": 2383 + }, + { + "epoch": 1.1826982747921062, + "grad_norm": 0.13969845313519358, + "learning_rate": 4.006219533999247e-06, + "loss": 0.7244, + "step": 2384 + }, + { + "epoch": 1.1831947374953455, + "grad_norm": 0.13979366267677537, + "learning_rate": 4.005439368732502e-06, + "loss": 0.7645, + "step": 2385 + }, + { + "epoch": 1.183691200198585, + "grad_norm": 0.1335214336393622, + "learning_rate": 4.0046589733834875e-06, + "loss": 0.688, + "step": 2386 + }, + { + "epoch": 1.1841876629018244, + "grad_norm": 0.14780650963457725, + "learning_rate": 4.003878348071474e-06, + "loss": 0.7289, + "step": 2387 + }, + { + "epoch": 1.184684125605064, + "grad_norm": 0.1340780920943342, + "learning_rate": 4.0030974929157685e-06, + "loss": 0.6878, + "step": 2388 + }, + { + "epoch": 1.1851805883083033, + "grad_norm": 0.1359899436643017, + "learning_rate": 4.002316408035711e-06, + "loss": 0.7322, + "step": 2389 + }, + { + "epoch": 1.1856770510115426, + "grad_norm": 0.13282726421917596, + "learning_rate": 4.001535093550678e-06, + "loss": 0.7032, + "step": 2390 + }, + { + "epoch": 1.1861735137147822, + "grad_norm": 0.13903709744169873, + "learning_rate": 4.000753549580082e-06, + "loss": 0.6988, + "step": 2391 + }, + { + "epoch": 1.1866699764180215, + "grad_norm": 0.1375603444876677, + "learning_rate": 3.999971776243369e-06, + "loss": 0.7292, + "step": 2392 + }, + { + "epoch": 1.187166439121261, + "grad_norm": 0.13837931077084456, + "learning_rate": 3.999189773660019e-06, + "loss": 0.7046, + "step": 2393 + }, + { + "epoch": 1.1876629018245004, + "grad_norm": 0.13869342809783666, + "learning_rate": 3.998407541949551e-06, + "loss": 0.6902, + "step": 2394 + }, + { + "epoch": 1.1881593645277397, + "grad_norm": 0.13979740548264746, + "learning_rate": 3.997625081231514e-06, + "loss": 0.7353, + "step": 2395 + }, + { + "epoch": 1.1886558272309793, + "grad_norm": 0.13527705164090786, + "learning_rate": 3.996842391625497e-06, + "loss": 0.726, + "step": 2396 + }, + { + "epoch": 1.1891522899342186, + "grad_norm": 0.14174483460790632, + "learning_rate": 3.99605947325112e-06, + "loss": 0.7005, + "step": 2397 + }, + { + "epoch": 1.1896487526374582, + "grad_norm": 0.13548057476768477, + "learning_rate": 3.99527632622804e-06, + "loss": 0.7066, + "step": 2398 + }, + { + "epoch": 1.1901452153406975, + "grad_norm": 0.13722530722455226, + "learning_rate": 3.994492950675949e-06, + "loss": 0.7293, + "step": 2399 + }, + { + "epoch": 1.1906416780439368, + "grad_norm": 0.14238980858246564, + "learning_rate": 3.9937093467145725e-06, + "loss": 0.7513, + "step": 2400 + }, + { + "epoch": 1.1911381407471764, + "grad_norm": 0.14077263746379465, + "learning_rate": 3.992925514463672e-06, + "loss": 0.6797, + "step": 2401 + }, + { + "epoch": 1.1916346034504157, + "grad_norm": 0.13906663141907152, + "learning_rate": 3.992141454043045e-06, + "loss": 0.705, + "step": 2402 + }, + { + "epoch": 1.1921310661536553, + "grad_norm": 0.14643513675700848, + "learning_rate": 3.99135716557252e-06, + "loss": 0.7516, + "step": 2403 + }, + { + "epoch": 1.1926275288568946, + "grad_norm": 0.1356991566400479, + "learning_rate": 3.990572649171964e-06, + "loss": 0.7296, + "step": 2404 + }, + { + "epoch": 1.193123991560134, + "grad_norm": 0.13520242104672095, + "learning_rate": 3.9897879049612795e-06, + "loss": 0.7248, + "step": 2405 + }, + { + "epoch": 1.1936204542633735, + "grad_norm": 0.14080671561882466, + "learning_rate": 3.9890029330604e-06, + "loss": 0.7011, + "step": 2406 + }, + { + "epoch": 1.1941169169666128, + "grad_norm": 0.14134376264347104, + "learning_rate": 3.988217733589296e-06, + "loss": 0.7477, + "step": 2407 + }, + { + "epoch": 1.1946133796698524, + "grad_norm": 0.13870900196918226, + "learning_rate": 3.987432306667975e-06, + "loss": 0.7606, + "step": 2408 + }, + { + "epoch": 1.1951098423730917, + "grad_norm": 0.1498135326898326, + "learning_rate": 3.986646652416473e-06, + "loss": 0.7166, + "step": 2409 + }, + { + "epoch": 1.195606305076331, + "grad_norm": 0.13666015376761742, + "learning_rate": 3.985860770954867e-06, + "loss": 0.6994, + "step": 2410 + }, + { + "epoch": 1.1961027677795706, + "grad_norm": 0.14270391448274597, + "learning_rate": 3.985074662403267e-06, + "loss": 0.7084, + "step": 2411 + }, + { + "epoch": 1.19659923048281, + "grad_norm": 0.13549585466845787, + "learning_rate": 3.984288326881817e-06, + "loss": 0.7065, + "step": 2412 + }, + { + "epoch": 1.1970956931860495, + "grad_norm": 0.13648442428428606, + "learning_rate": 3.983501764510693e-06, + "loss": 0.7372, + "step": 2413 + }, + { + "epoch": 1.1975921558892888, + "grad_norm": 0.14086948426447465, + "learning_rate": 3.982714975410111e-06, + "loss": 0.7205, + "step": 2414 + }, + { + "epoch": 1.1980886185925281, + "grad_norm": 0.1325423947366576, + "learning_rate": 3.9819279597003195e-06, + "loss": 0.6983, + "step": 2415 + }, + { + "epoch": 1.1985850812957677, + "grad_norm": 0.13634160260964182, + "learning_rate": 3.9811407175015995e-06, + "loss": 0.6579, + "step": 2416 + }, + { + "epoch": 1.199081543999007, + "grad_norm": 0.13874737933942757, + "learning_rate": 3.980353248934269e-06, + "loss": 0.7378, + "step": 2417 + }, + { + "epoch": 1.1995780067022466, + "grad_norm": 0.13788946383443576, + "learning_rate": 3.9795655541186805e-06, + "loss": 0.7417, + "step": 2418 + }, + { + "epoch": 1.200074469405486, + "grad_norm": 0.13647093705243393, + "learning_rate": 3.97877763317522e-06, + "loss": 0.7199, + "step": 2419 + }, + { + "epoch": 1.2005709321087252, + "grad_norm": 0.14052612906793216, + "learning_rate": 3.977989486224309e-06, + "loss": 0.7305, + "step": 2420 + }, + { + "epoch": 1.2010673948119648, + "grad_norm": 0.14559651220194383, + "learning_rate": 3.977201113386402e-06, + "loss": 0.7112, + "step": 2421 + }, + { + "epoch": 1.2015638575152041, + "grad_norm": 0.14825330957397817, + "learning_rate": 3.97641251478199e-06, + "loss": 0.798, + "step": 2422 + }, + { + "epoch": 1.2020603202184437, + "grad_norm": 0.1411793849816043, + "learning_rate": 3.975623690531598e-06, + "loss": 0.775, + "step": 2423 + }, + { + "epoch": 1.202556782921683, + "grad_norm": 0.13329544412504718, + "learning_rate": 3.9748346407557845e-06, + "loss": 0.7021, + "step": 2424 + }, + { + "epoch": 1.2030532456249223, + "grad_norm": 0.1322912568134602, + "learning_rate": 3.9740453655751435e-06, + "loss": 0.7112, + "step": 2425 + }, + { + "epoch": 1.2035497083281619, + "grad_norm": 0.1387834596545233, + "learning_rate": 3.973255865110302e-06, + "loss": 0.698, + "step": 2426 + }, + { + "epoch": 1.2040461710314012, + "grad_norm": 0.14956785941966386, + "learning_rate": 3.972466139481925e-06, + "loss": 0.7206, + "step": 2427 + }, + { + "epoch": 1.2045426337346408, + "grad_norm": 0.1322262204706779, + "learning_rate": 3.971676188810707e-06, + "loss": 0.7204, + "step": 2428 + }, + { + "epoch": 1.20503909643788, + "grad_norm": 0.12896570477009364, + "learning_rate": 3.9708860132173795e-06, + "loss": 0.695, + "step": 2429 + }, + { + "epoch": 1.2055355591411194, + "grad_norm": 0.13536258893402728, + "learning_rate": 3.97009561282271e-06, + "loss": 0.7283, + "step": 2430 + }, + { + "epoch": 1.206032021844359, + "grad_norm": 0.13492371176463214, + "learning_rate": 3.969304987747496e-06, + "loss": 0.7073, + "step": 2431 + }, + { + "epoch": 1.2065284845475983, + "grad_norm": 0.13134753297863244, + "learning_rate": 3.9685141381125745e-06, + "loss": 0.7089, + "step": 2432 + }, + { + "epoch": 1.2070249472508379, + "grad_norm": 0.13554849626488005, + "learning_rate": 3.967723064038812e-06, + "loss": 0.735, + "step": 2433 + }, + { + "epoch": 1.2075214099540772, + "grad_norm": 0.14349704386244447, + "learning_rate": 3.966931765647112e-06, + "loss": 0.7232, + "step": 2434 + }, + { + "epoch": 1.2080178726573165, + "grad_norm": 0.1385163792731118, + "learning_rate": 3.966140243058413e-06, + "loss": 0.7218, + "step": 2435 + }, + { + "epoch": 1.208514335360556, + "grad_norm": 0.14279978031188895, + "learning_rate": 3.965348496393685e-06, + "loss": 0.8045, + "step": 2436 + }, + { + "epoch": 1.2090107980637954, + "grad_norm": 0.13858682489268342, + "learning_rate": 3.964556525773935e-06, + "loss": 0.7069, + "step": 2437 + }, + { + "epoch": 1.209507260767035, + "grad_norm": 0.1275555575643711, + "learning_rate": 3.963764331320201e-06, + "loss": 0.7232, + "step": 2438 + }, + { + "epoch": 1.2100037234702743, + "grad_norm": 0.13319214619249178, + "learning_rate": 3.9629719131535595e-06, + "loss": 0.7551, + "step": 2439 + }, + { + "epoch": 1.2105001861735136, + "grad_norm": 0.1387567942220467, + "learning_rate": 3.962179271395118e-06, + "loss": 0.7367, + "step": 2440 + }, + { + "epoch": 1.2109966488767532, + "grad_norm": 0.1373104804870825, + "learning_rate": 3.961386406166019e-06, + "loss": 0.7157, + "step": 2441 + }, + { + "epoch": 1.2114931115799925, + "grad_norm": 0.13779972795149475, + "learning_rate": 3.96059331758744e-06, + "loss": 0.7441, + "step": 2442 + }, + { + "epoch": 1.211989574283232, + "grad_norm": 0.13902570136362163, + "learning_rate": 3.9598000057805905e-06, + "loss": 0.7297, + "step": 2443 + }, + { + "epoch": 1.2124860369864714, + "grad_norm": 0.14200167374497805, + "learning_rate": 3.959006470866717e-06, + "loss": 0.7664, + "step": 2444 + }, + { + "epoch": 1.2129824996897107, + "grad_norm": 0.13182479218295906, + "learning_rate": 3.958212712967097e-06, + "loss": 0.7333, + "step": 2445 + }, + { + "epoch": 1.2134789623929503, + "grad_norm": 0.13500957253720283, + "learning_rate": 3.957418732203045e-06, + "loss": 0.7128, + "step": 2446 + }, + { + "epoch": 1.2139754250961896, + "grad_norm": 0.1364082757123999, + "learning_rate": 3.9566245286959074e-06, + "loss": 0.7516, + "step": 2447 + }, + { + "epoch": 1.2144718877994292, + "grad_norm": 0.15584714093121874, + "learning_rate": 3.955830102567066e-06, + "loss": 0.7542, + "step": 2448 + }, + { + "epoch": 1.2149683505026685, + "grad_norm": 0.13446548054487187, + "learning_rate": 3.955035453937936e-06, + "loss": 0.7358, + "step": 2449 + }, + { + "epoch": 1.2154648132059078, + "grad_norm": 0.13145486309075377, + "learning_rate": 3.954240582929965e-06, + "loss": 0.7025, + "step": 2450 + }, + { + "epoch": 1.2159612759091474, + "grad_norm": 0.13574735830422388, + "learning_rate": 3.953445489664641e-06, + "loss": 0.7401, + "step": 2451 + }, + { + "epoch": 1.2164577386123867, + "grad_norm": 0.1366039565699845, + "learning_rate": 3.952650174263476e-06, + "loss": 0.7242, + "step": 2452 + }, + { + "epoch": 1.2169542013156263, + "grad_norm": 0.13329175729167014, + "learning_rate": 3.9518546368480235e-06, + "loss": 0.701, + "step": 2453 + }, + { + "epoch": 1.2174506640188656, + "grad_norm": 0.13635014090129313, + "learning_rate": 3.951058877539869e-06, + "loss": 0.7085, + "step": 2454 + }, + { + "epoch": 1.217947126722105, + "grad_norm": 0.14550135826438018, + "learning_rate": 3.95026289646063e-06, + "loss": 0.7563, + "step": 2455 + }, + { + "epoch": 1.2184435894253445, + "grad_norm": 0.13708956139933903, + "learning_rate": 3.949466693731962e-06, + "loss": 0.7396, + "step": 2456 + }, + { + "epoch": 1.2189400521285838, + "grad_norm": 0.13788736852218864, + "learning_rate": 3.948670269475549e-06, + "loss": 0.7099, + "step": 2457 + }, + { + "epoch": 1.2194365148318234, + "grad_norm": 0.1472672823984794, + "learning_rate": 3.947873623813114e-06, + "loss": 0.7551, + "step": 2458 + }, + { + "epoch": 1.2199329775350627, + "grad_norm": 0.13567982397574574, + "learning_rate": 3.94707675686641e-06, + "loss": 0.7215, + "step": 2459 + }, + { + "epoch": 1.220429440238302, + "grad_norm": 0.12785953951320136, + "learning_rate": 3.946279668757226e-06, + "loss": 0.6893, + "step": 2460 + }, + { + "epoch": 1.2209259029415416, + "grad_norm": 0.139042104896544, + "learning_rate": 3.945482359607383e-06, + "loss": 0.7116, + "step": 2461 + }, + { + "epoch": 1.221422365644781, + "grad_norm": 0.1310146478620868, + "learning_rate": 3.94468482953874e-06, + "loss": 0.6821, + "step": 2462 + }, + { + "epoch": 1.2219188283480205, + "grad_norm": 0.14080053665645972, + "learning_rate": 3.943887078673182e-06, + "loss": 0.72, + "step": 2463 + }, + { + "epoch": 1.2224152910512598, + "grad_norm": 0.1374193970967945, + "learning_rate": 3.943089107132637e-06, + "loss": 0.7277, + "step": 2464 + }, + { + "epoch": 1.2229117537544991, + "grad_norm": 0.13142687505073025, + "learning_rate": 3.942290915039059e-06, + "loss": 0.7756, + "step": 2465 + }, + { + "epoch": 1.2234082164577387, + "grad_norm": 0.1383208661807758, + "learning_rate": 3.94149250251444e-06, + "loss": 0.7427, + "step": 2466 + }, + { + "epoch": 1.223904679160978, + "grad_norm": 0.13946709784881417, + "learning_rate": 3.940693869680805e-06, + "loss": 0.7091, + "step": 2467 + }, + { + "epoch": 1.2244011418642176, + "grad_norm": 0.13594769238575358, + "learning_rate": 3.939895016660212e-06, + "loss": 0.7248, + "step": 2468 + }, + { + "epoch": 1.2248976045674569, + "grad_norm": 0.13618933752020215, + "learning_rate": 3.939095943574752e-06, + "loss": 0.7535, + "step": 2469 + }, + { + "epoch": 1.2253940672706962, + "grad_norm": 0.14274958557643477, + "learning_rate": 3.938296650546552e-06, + "loss": 0.7819, + "step": 2470 + }, + { + "epoch": 1.2258905299739358, + "grad_norm": 0.13553582376705678, + "learning_rate": 3.93749713769777e-06, + "loss": 0.7561, + "step": 2471 + }, + { + "epoch": 1.226386992677175, + "grad_norm": 0.14247983844696746, + "learning_rate": 3.936697405150599e-06, + "loss": 0.7169, + "step": 2472 + }, + { + "epoch": 1.2268834553804147, + "grad_norm": 0.13401472437678552, + "learning_rate": 3.935897453027265e-06, + "loss": 0.7331, + "step": 2473 + }, + { + "epoch": 1.227379918083654, + "grad_norm": 0.13467273748712041, + "learning_rate": 3.935097281450027e-06, + "loss": 0.7224, + "step": 2474 + }, + { + "epoch": 1.2278763807868933, + "grad_norm": 0.13626379999904872, + "learning_rate": 3.934296890541182e-06, + "loss": 0.7443, + "step": 2475 + }, + { + "epoch": 1.2283728434901329, + "grad_norm": 0.13003861644898487, + "learning_rate": 3.933496280423054e-06, + "loss": 0.6546, + "step": 2476 + }, + { + "epoch": 1.2288693061933722, + "grad_norm": 0.1425981221162396, + "learning_rate": 3.932695451218003e-06, + "loss": 0.7141, + "step": 2477 + }, + { + "epoch": 1.2293657688966118, + "grad_norm": 0.13560701136791695, + "learning_rate": 3.931894403048424e-06, + "loss": 0.7208, + "step": 2478 + }, + { + "epoch": 1.229862231599851, + "grad_norm": 0.1401873943379598, + "learning_rate": 3.931093136036744e-06, + "loss": 0.7266, + "step": 2479 + }, + { + "epoch": 1.2303586943030904, + "grad_norm": 0.13728694609128989, + "learning_rate": 3.930291650305424e-06, + "loss": 0.7134, + "step": 2480 + }, + { + "epoch": 1.23085515700633, + "grad_norm": 0.13218011780198854, + "learning_rate": 3.929489945976959e-06, + "loss": 0.7239, + "step": 2481 + }, + { + "epoch": 1.2313516197095693, + "grad_norm": 0.1410943689944911, + "learning_rate": 3.928688023173875e-06, + "loss": 0.7284, + "step": 2482 + }, + { + "epoch": 1.2318480824128089, + "grad_norm": 0.1361830396710614, + "learning_rate": 3.927885882018735e-06, + "loss": 0.7211, + "step": 2483 + }, + { + "epoch": 1.2323445451160482, + "grad_norm": 0.1351899227297423, + "learning_rate": 3.927083522634132e-06, + "loss": 0.7422, + "step": 2484 + }, + { + "epoch": 1.2328410078192875, + "grad_norm": 0.1406973288937524, + "learning_rate": 3.926280945142693e-06, + "loss": 0.7229, + "step": 2485 + }, + { + "epoch": 1.233337470522527, + "grad_norm": 0.13754991570942163, + "learning_rate": 3.925478149667081e-06, + "loss": 0.7677, + "step": 2486 + }, + { + "epoch": 1.2338339332257664, + "grad_norm": 0.1384634788260933, + "learning_rate": 3.92467513632999e-06, + "loss": 0.7408, + "step": 2487 + }, + { + "epoch": 1.234330395929006, + "grad_norm": 0.14392311337089933, + "learning_rate": 3.923871905254146e-06, + "loss": 0.7471, + "step": 2488 + }, + { + "epoch": 1.2348268586322453, + "grad_norm": 0.1335035566492651, + "learning_rate": 3.9230684565623135e-06, + "loss": 0.6966, + "step": 2489 + }, + { + "epoch": 1.2353233213354846, + "grad_norm": 0.132634223213086, + "learning_rate": 3.922264790377283e-06, + "loss": 0.6934, + "step": 2490 + }, + { + "epoch": 1.2358197840387242, + "grad_norm": 0.13827031271642745, + "learning_rate": 3.921460906821884e-06, + "loss": 0.7098, + "step": 2491 + }, + { + "epoch": 1.2363162467419635, + "grad_norm": 0.14451954689789367, + "learning_rate": 3.920656806018977e-06, + "loss": 0.788, + "step": 2492 + }, + { + "epoch": 1.236812709445203, + "grad_norm": 0.13561380554064417, + "learning_rate": 3.919852488091455e-06, + "loss": 0.7527, + "step": 2493 + }, + { + "epoch": 1.2373091721484424, + "grad_norm": 0.13501364970911942, + "learning_rate": 3.9190479531622465e-06, + "loss": 0.7612, + "step": 2494 + }, + { + "epoch": 1.2378056348516817, + "grad_norm": 0.12818687643314722, + "learning_rate": 3.91824320135431e-06, + "loss": 0.6963, + "step": 2495 + }, + { + "epoch": 1.2383020975549213, + "grad_norm": 0.13312040030918668, + "learning_rate": 3.9174382327906415e-06, + "loss": 0.7064, + "step": 2496 + }, + { + "epoch": 1.2387985602581606, + "grad_norm": 0.13300761423052482, + "learning_rate": 3.916633047594265e-06, + "loss": 0.7077, + "step": 2497 + }, + { + "epoch": 1.2392950229614, + "grad_norm": 0.13620840402675596, + "learning_rate": 3.915827645888242e-06, + "loss": 0.7187, + "step": 2498 + }, + { + "epoch": 1.2397914856646395, + "grad_norm": 0.13426305401154814, + "learning_rate": 3.915022027795663e-06, + "loss": 0.69, + "step": 2499 + }, + { + "epoch": 1.2402879483678788, + "grad_norm": 0.1464897445532207, + "learning_rate": 3.914216193439657e-06, + "loss": 0.8251, + "step": 2500 + }, + { + "epoch": 1.2407844110711184, + "grad_norm": 0.13743582313648897, + "learning_rate": 3.91341014294338e-06, + "loss": 0.7444, + "step": 2501 + }, + { + "epoch": 1.2412808737743577, + "grad_norm": 0.13874526596758197, + "learning_rate": 3.912603876430025e-06, + "loss": 0.708, + "step": 2502 + }, + { + "epoch": 1.241777336477597, + "grad_norm": 0.13008996722051902, + "learning_rate": 3.9117973940228166e-06, + "loss": 0.7709, + "step": 2503 + }, + { + "epoch": 1.2422737991808366, + "grad_norm": 0.1355480166714516, + "learning_rate": 3.910990695845013e-06, + "loss": 0.6943, + "step": 2504 + }, + { + "epoch": 1.242770261884076, + "grad_norm": 0.14539960501531302, + "learning_rate": 3.910183782019905e-06, + "loss": 0.7631, + "step": 2505 + }, + { + "epoch": 1.2432667245873155, + "grad_norm": 0.14083590125770834, + "learning_rate": 3.909376652670818e-06, + "loss": 0.7338, + "step": 2506 + }, + { + "epoch": 1.2437631872905548, + "grad_norm": 0.1431658480547782, + "learning_rate": 3.908569307921106e-06, + "loss": 0.8085, + "step": 2507 + }, + { + "epoch": 1.2442596499937941, + "grad_norm": 0.14578702074830277, + "learning_rate": 3.90776174789416e-06, + "loss": 0.7466, + "step": 2508 + }, + { + "epoch": 1.2447561126970337, + "grad_norm": 0.13316331073208876, + "learning_rate": 3.906953972713403e-06, + "loss": 0.7069, + "step": 2509 + }, + { + "epoch": 1.245252575400273, + "grad_norm": 0.13867830074059864, + "learning_rate": 3.90614598250229e-06, + "loss": 0.7267, + "step": 2510 + }, + { + "epoch": 1.2457490381035126, + "grad_norm": 0.13076874574140407, + "learning_rate": 3.905337777384308e-06, + "loss": 0.698, + "step": 2511 + }, + { + "epoch": 1.246245500806752, + "grad_norm": 0.13962139881766297, + "learning_rate": 3.904529357482981e-06, + "loss": 0.7147, + "step": 2512 + }, + { + "epoch": 1.2467419635099912, + "grad_norm": 0.1462052190977223, + "learning_rate": 3.9037207229218615e-06, + "loss": 0.728, + "step": 2513 + }, + { + "epoch": 1.2472384262132308, + "grad_norm": 0.1418049627962229, + "learning_rate": 3.902911873824536e-06, + "loss": 0.7354, + "step": 2514 + }, + { + "epoch": 1.24773488891647, + "grad_norm": 0.14139015990729262, + "learning_rate": 3.902102810314625e-06, + "loss": 0.736, + "step": 2515 + }, + { + "epoch": 1.2482313516197097, + "grad_norm": 0.13974548545259946, + "learning_rate": 3.9012935325157805e-06, + "loss": 0.735, + "step": 2516 + }, + { + "epoch": 1.248727814322949, + "grad_norm": 0.14137472030628268, + "learning_rate": 3.900484040551688e-06, + "loss": 0.7538, + "step": 2517 + }, + { + "epoch": 1.2492242770261883, + "grad_norm": 0.13631373740449088, + "learning_rate": 3.899674334546064e-06, + "loss": 0.6965, + "step": 2518 + }, + { + "epoch": 1.2497207397294279, + "grad_norm": 0.13581041061864152, + "learning_rate": 3.898864414622661e-06, + "loss": 0.7229, + "step": 2519 + }, + { + "epoch": 1.2502172024326672, + "grad_norm": 0.1351856956359364, + "learning_rate": 3.89805428090526e-06, + "loss": 0.6548, + "step": 2520 + }, + { + "epoch": 1.2502172024326672, + "eval_loss": 0.742116391658783, + "eval_runtime": 135.9502, + "eval_samples_per_second": 223.266, + "eval_steps_per_second": 27.915, + "step": 2520 + }, + { + "epoch": 1.2507136651359065, + "grad_norm": 0.1384586104535219, + "learning_rate": 3.897243933517679e-06, + "loss": 0.7376, + "step": 2521 + }, + { + "epoch": 1.251210127839146, + "grad_norm": 0.16315907780591418, + "learning_rate": 3.896433372583766e-06, + "loss": 0.7179, + "step": 2522 + }, + { + "epoch": 1.2517065905423854, + "grad_norm": 0.13165553758104281, + "learning_rate": 3.895622598227402e-06, + "loss": 0.6968, + "step": 2523 + }, + { + "epoch": 1.252203053245625, + "grad_norm": 0.1468700776665726, + "learning_rate": 3.894811610572501e-06, + "loss": 0.7961, + "step": 2524 + }, + { + "epoch": 1.2526995159488643, + "grad_norm": 0.13484457344159942, + "learning_rate": 3.894000409743009e-06, + "loss": 0.6972, + "step": 2525 + }, + { + "epoch": 1.2531959786521036, + "grad_norm": 0.14298803461222828, + "learning_rate": 3.893188995862907e-06, + "loss": 0.7434, + "step": 2526 + }, + { + "epoch": 1.2536924413553432, + "grad_norm": 0.13247384088857173, + "learning_rate": 3.892377369056203e-06, + "loss": 0.697, + "step": 2527 + }, + { + "epoch": 1.2541889040585825, + "grad_norm": 0.12855401841158223, + "learning_rate": 3.8915655294469445e-06, + "loss": 0.7368, + "step": 2528 + }, + { + "epoch": 1.254685366761822, + "grad_norm": 0.1399262509376029, + "learning_rate": 3.890753477159206e-06, + "loss": 0.7499, + "step": 2529 + }, + { + "epoch": 1.2551818294650614, + "grad_norm": 0.1532990862064815, + "learning_rate": 3.8899412123170984e-06, + "loss": 0.7841, + "step": 2530 + }, + { + "epoch": 1.2556782921683007, + "grad_norm": 0.13912282728279968, + "learning_rate": 3.889128735044762e-06, + "loss": 0.7458, + "step": 2531 + }, + { + "epoch": 1.2561747548715403, + "grad_norm": 0.1369771636022506, + "learning_rate": 3.888316045466372e-06, + "loss": 0.7154, + "step": 2532 + }, + { + "epoch": 1.2566712175747796, + "grad_norm": 0.13198426469797134, + "learning_rate": 3.887503143706134e-06, + "loss": 0.674, + "step": 2533 + }, + { + "epoch": 1.2571676802780192, + "grad_norm": 0.14611135628302505, + "learning_rate": 3.886690029888287e-06, + "loss": 0.7117, + "step": 2534 + }, + { + "epoch": 1.2576641429812585, + "grad_norm": 0.1385484671417284, + "learning_rate": 3.885876704137104e-06, + "loss": 0.7399, + "step": 2535 + }, + { + "epoch": 1.2581606056844978, + "grad_norm": 0.13166836823073533, + "learning_rate": 3.885063166576886e-06, + "loss": 0.7609, + "step": 2536 + }, + { + "epoch": 1.2586570683877374, + "grad_norm": 0.1387003911974032, + "learning_rate": 3.8842494173319726e-06, + "loss": 0.725, + "step": 2537 + }, + { + "epoch": 1.2591535310909767, + "grad_norm": 0.13305768328582457, + "learning_rate": 3.883435456526728e-06, + "loss": 0.7375, + "step": 2538 + }, + { + "epoch": 1.2596499937942163, + "grad_norm": 0.13978466433671136, + "learning_rate": 3.882621284285558e-06, + "loss": 0.7691, + "step": 2539 + }, + { + "epoch": 1.2601464564974556, + "grad_norm": 0.13757446247537838, + "learning_rate": 3.881806900732893e-06, + "loss": 0.7069, + "step": 2540 + }, + { + "epoch": 1.260642919200695, + "grad_norm": 0.14032437189289912, + "learning_rate": 3.880992305993198e-06, + "loss": 0.7281, + "step": 2541 + }, + { + "epoch": 1.2611393819039345, + "grad_norm": 0.1377868179608081, + "learning_rate": 3.880177500190971e-06, + "loss": 0.7587, + "step": 2542 + }, + { + "epoch": 1.2616358446071738, + "grad_norm": 0.14861646208837478, + "learning_rate": 3.8793624834507435e-06, + "loss": 0.7251, + "step": 2543 + }, + { + "epoch": 1.2621323073104134, + "grad_norm": 0.13517554466674267, + "learning_rate": 3.8785472558970776e-06, + "loss": 0.6974, + "step": 2544 + }, + { + "epoch": 1.2626287700136527, + "grad_norm": 0.1309887066083378, + "learning_rate": 3.877731817654566e-06, + "loss": 0.7402, + "step": 2545 + }, + { + "epoch": 1.263125232716892, + "grad_norm": 0.1396787950926557, + "learning_rate": 3.876916168847836e-06, + "loss": 0.7621, + "step": 2546 + }, + { + "epoch": 1.2636216954201316, + "grad_norm": 0.14129831083302125, + "learning_rate": 3.876100309601547e-06, + "loss": 0.8199, + "step": 2547 + }, + { + "epoch": 1.264118158123371, + "grad_norm": 0.14102270939055808, + "learning_rate": 3.87528424004039e-06, + "loss": 0.7137, + "step": 2548 + }, + { + "epoch": 1.2646146208266105, + "grad_norm": 0.13270563152474793, + "learning_rate": 3.874467960289088e-06, + "loss": 0.6974, + "step": 2549 + }, + { + "epoch": 1.2651110835298498, + "grad_norm": 0.14077175671585102, + "learning_rate": 3.8736514704723956e-06, + "loss": 0.7488, + "step": 2550 + }, + { + "epoch": 1.2656075462330891, + "grad_norm": 0.14294951439526327, + "learning_rate": 3.872834770715102e-06, + "loss": 0.7407, + "step": 2551 + }, + { + "epoch": 1.2661040089363287, + "grad_norm": 0.14108160173034903, + "learning_rate": 3.872017861142024e-06, + "loss": 0.7285, + "step": 2552 + }, + { + "epoch": 1.266600471639568, + "grad_norm": 0.14172910962233856, + "learning_rate": 3.871200741878015e-06, + "loss": 0.7434, + "step": 2553 + }, + { + "epoch": 1.2670969343428076, + "grad_norm": 0.1418472981583555, + "learning_rate": 3.870383413047959e-06, + "loss": 0.693, + "step": 2554 + }, + { + "epoch": 1.267593397046047, + "grad_norm": 0.14824626937177632, + "learning_rate": 3.86956587477677e-06, + "loss": 0.7304, + "step": 2555 + }, + { + "epoch": 1.2680898597492862, + "grad_norm": 0.14417792095901208, + "learning_rate": 3.868748127189397e-06, + "loss": 0.689, + "step": 2556 + }, + { + "epoch": 1.2685863224525258, + "grad_norm": 0.13628181036537815, + "learning_rate": 3.8679301704108176e-06, + "loss": 0.7269, + "step": 2557 + }, + { + "epoch": 1.2690827851557651, + "grad_norm": 0.1432671252492769, + "learning_rate": 3.8671120045660456e-06, + "loss": 0.7273, + "step": 2558 + }, + { + "epoch": 1.2695792478590047, + "grad_norm": 0.13729463835110303, + "learning_rate": 3.8662936297801235e-06, + "loss": 0.7118, + "step": 2559 + }, + { + "epoch": 1.270075710562244, + "grad_norm": 0.1458195417043056, + "learning_rate": 3.865475046178127e-06, + "loss": 0.7171, + "step": 2560 + }, + { + "epoch": 1.2705721732654833, + "grad_norm": 0.13880022855949167, + "learning_rate": 3.864656253885163e-06, + "loss": 0.6906, + "step": 2561 + }, + { + "epoch": 1.2710686359687229, + "grad_norm": 0.1349359231670376, + "learning_rate": 3.863837253026372e-06, + "loss": 0.7689, + "step": 2562 + }, + { + "epoch": 1.2715650986719622, + "grad_norm": 0.14164412974060797, + "learning_rate": 3.863018043726924e-06, + "loss": 0.6878, + "step": 2563 + }, + { + "epoch": 1.2720615613752018, + "grad_norm": 0.14323827765065936, + "learning_rate": 3.862198626112023e-06, + "loss": 0.7606, + "step": 2564 + }, + { + "epoch": 1.272558024078441, + "grad_norm": 0.1473424236959722, + "learning_rate": 3.861379000306902e-06, + "loss": 0.7605, + "step": 2565 + }, + { + "epoch": 1.2730544867816804, + "grad_norm": 0.1292191998705604, + "learning_rate": 3.8605591664368295e-06, + "loss": 0.7022, + "step": 2566 + }, + { + "epoch": 1.27355094948492, + "grad_norm": 0.13465698019970399, + "learning_rate": 3.859739124627103e-06, + "loss": 0.6971, + "step": 2567 + }, + { + "epoch": 1.2740474121881593, + "grad_norm": 0.13654253074049172, + "learning_rate": 3.858918875003053e-06, + "loss": 0.6928, + "step": 2568 + }, + { + "epoch": 1.2745438748913989, + "grad_norm": 0.1379719697081152, + "learning_rate": 3.858098417690042e-06, + "loss": 0.717, + "step": 2569 + }, + { + "epoch": 1.2750403375946382, + "grad_norm": 0.14175199937004399, + "learning_rate": 3.857277752813463e-06, + "loss": 0.7616, + "step": 2570 + }, + { + "epoch": 1.2755368002978775, + "grad_norm": 0.13443150257007153, + "learning_rate": 3.856456880498742e-06, + "loss": 0.6973, + "step": 2571 + }, + { + "epoch": 1.276033263001117, + "grad_norm": 0.14593077908065585, + "learning_rate": 3.855635800871335e-06, + "loss": 0.7569, + "step": 2572 + }, + { + "epoch": 1.2765297257043564, + "grad_norm": 0.15100942741983306, + "learning_rate": 3.854814514056734e-06, + "loss": 0.7642, + "step": 2573 + }, + { + "epoch": 1.277026188407596, + "grad_norm": 0.13642851129916278, + "learning_rate": 3.853993020180456e-06, + "loss": 0.7406, + "step": 2574 + }, + { + "epoch": 1.2775226511108353, + "grad_norm": 0.1400663992803376, + "learning_rate": 3.853171319368054e-06, + "loss": 0.7454, + "step": 2575 + }, + { + "epoch": 1.2780191138140746, + "grad_norm": 0.1319083373021114, + "learning_rate": 3.852349411745113e-06, + "loss": 0.74, + "step": 2576 + }, + { + "epoch": 1.2785155765173142, + "grad_norm": 0.14724655880988557, + "learning_rate": 3.851527297437247e-06, + "loss": 0.7528, + "step": 2577 + }, + { + "epoch": 1.2790120392205535, + "grad_norm": 0.13674211565816105, + "learning_rate": 3.8507049765701045e-06, + "loss": 0.6716, + "step": 2578 + }, + { + "epoch": 1.279508501923793, + "grad_norm": 0.1382868822761622, + "learning_rate": 3.849882449269363e-06, + "loss": 0.746, + "step": 2579 + }, + { + "epoch": 1.2800049646270324, + "grad_norm": 0.1390259790054548, + "learning_rate": 3.849059715660732e-06, + "loss": 0.7088, + "step": 2580 + }, + { + "epoch": 1.2805014273302717, + "grad_norm": 0.1369367923225582, + "learning_rate": 3.848236775869955e-06, + "loss": 0.7699, + "step": 2581 + }, + { + "epoch": 1.2809978900335113, + "grad_norm": 0.15971172103749032, + "learning_rate": 3.847413630022804e-06, + "loss": 0.746, + "step": 2582 + }, + { + "epoch": 1.2814943527367506, + "grad_norm": 0.14277522047325522, + "learning_rate": 3.846590278245083e-06, + "loss": 0.7567, + "step": 2583 + }, + { + "epoch": 1.2819908154399902, + "grad_norm": 0.14699081523439286, + "learning_rate": 3.8457667206626306e-06, + "loss": 0.7873, + "step": 2584 + }, + { + "epoch": 1.2824872781432295, + "grad_norm": 0.14366187562768565, + "learning_rate": 3.844942957401311e-06, + "loss": 0.6949, + "step": 2585 + }, + { + "epoch": 1.2829837408464688, + "grad_norm": 0.14181909585351757, + "learning_rate": 3.844118988587025e-06, + "loss": 0.7135, + "step": 2586 + }, + { + "epoch": 1.2834802035497084, + "grad_norm": 0.13109014026406526, + "learning_rate": 3.843294814345705e-06, + "loss": 0.6893, + "step": 2587 + }, + { + "epoch": 1.2839766662529477, + "grad_norm": 0.13263413654791315, + "learning_rate": 3.8424704348033084e-06, + "loss": 0.6963, + "step": 2588 + }, + { + "epoch": 1.2844731289561873, + "grad_norm": 0.13552375577647977, + "learning_rate": 3.841645850085831e-06, + "loss": 0.718, + "step": 2589 + }, + { + "epoch": 1.2849695916594266, + "grad_norm": 0.138899051483369, + "learning_rate": 3.840821060319298e-06, + "loss": 0.754, + "step": 2590 + }, + { + "epoch": 1.285466054362666, + "grad_norm": 0.14600374894252893, + "learning_rate": 3.839996065629764e-06, + "loss": 0.7421, + "step": 2591 + }, + { + "epoch": 1.2859625170659055, + "grad_norm": 0.14495501404251815, + "learning_rate": 3.839170866143317e-06, + "loss": 0.6984, + "step": 2592 + }, + { + "epoch": 1.2864589797691448, + "grad_norm": 0.13341183715508106, + "learning_rate": 3.838345461986074e-06, + "loss": 0.7205, + "step": 2593 + }, + { + "epoch": 1.2869554424723844, + "grad_norm": 0.13169443537805922, + "learning_rate": 3.837519853284186e-06, + "loss": 0.691, + "step": 2594 + }, + { + "epoch": 1.2874519051756237, + "grad_norm": 0.13790138352749554, + "learning_rate": 3.836694040163834e-06, + "loss": 0.6926, + "step": 2595 + }, + { + "epoch": 1.287948367878863, + "grad_norm": 0.1437529575160023, + "learning_rate": 3.835868022751231e-06, + "loss": 0.7934, + "step": 2596 + }, + { + "epoch": 1.2884448305821026, + "grad_norm": 0.13306894590543736, + "learning_rate": 3.835041801172619e-06, + "loss": 0.6854, + "step": 2597 + }, + { + "epoch": 1.288941293285342, + "grad_norm": 0.14395437206135459, + "learning_rate": 3.834215375554275e-06, + "loss": 0.7379, + "step": 2598 + }, + { + "epoch": 1.2894377559885815, + "grad_norm": 0.1378707356978414, + "learning_rate": 3.8333887460225015e-06, + "loss": 0.7178, + "step": 2599 + }, + { + "epoch": 1.2899342186918208, + "grad_norm": 0.13148423330705314, + "learning_rate": 3.832561912703638e-06, + "loss": 0.6924, + "step": 2600 + }, + { + "epoch": 1.2904306813950601, + "grad_norm": 0.13730143809759457, + "learning_rate": 3.831734875724052e-06, + "loss": 0.7503, + "step": 2601 + }, + { + "epoch": 1.2909271440982997, + "grad_norm": 0.13498579164231725, + "learning_rate": 3.830907635210143e-06, + "loss": 0.738, + "step": 2602 + }, + { + "epoch": 1.291423606801539, + "grad_norm": 0.13053407240787798, + "learning_rate": 3.830080191288342e-06, + "loss": 0.7039, + "step": 2603 + }, + { + "epoch": 1.2919200695047786, + "grad_norm": 0.13619974831779605, + "learning_rate": 3.82925254408511e-06, + "loss": 0.7462, + "step": 2604 + }, + { + "epoch": 1.2924165322080179, + "grad_norm": 0.1415670740958942, + "learning_rate": 3.828424693726939e-06, + "loss": 0.7302, + "step": 2605 + }, + { + "epoch": 1.2929129949112572, + "grad_norm": 0.14173779929791688, + "learning_rate": 3.827596640340353e-06, + "loss": 0.7732, + "step": 2606 + }, + { + "epoch": 1.2934094576144968, + "grad_norm": 0.1359458995344357, + "learning_rate": 3.826768384051907e-06, + "loss": 0.7372, + "step": 2607 + }, + { + "epoch": 1.293905920317736, + "grad_norm": 0.13416195193113473, + "learning_rate": 3.825939924988187e-06, + "loss": 0.7289, + "step": 2608 + }, + { + "epoch": 1.2944023830209757, + "grad_norm": 0.1337826798946644, + "learning_rate": 3.825111263275809e-06, + "loss": 0.6944, + "step": 2609 + }, + { + "epoch": 1.294898845724215, + "grad_norm": 0.13167566643050344, + "learning_rate": 3.824282399041421e-06, + "loss": 0.677, + "step": 2610 + }, + { + "epoch": 1.2953953084274543, + "grad_norm": 0.1432765949213179, + "learning_rate": 3.823453332411702e-06, + "loss": 0.7226, + "step": 2611 + }, + { + "epoch": 1.2958917711306939, + "grad_norm": 0.13409930276110446, + "learning_rate": 3.8226240635133615e-06, + "loss": 0.7633, + "step": 2612 + }, + { + "epoch": 1.2963882338339332, + "grad_norm": 0.14108249887977412, + "learning_rate": 3.8217945924731385e-06, + "loss": 0.7139, + "step": 2613 + }, + { + "epoch": 1.2968846965371728, + "grad_norm": 0.13605604217842482, + "learning_rate": 3.8209649194178065e-06, + "loss": 0.7486, + "step": 2614 + }, + { + "epoch": 1.297381159240412, + "grad_norm": 0.13562858110835124, + "learning_rate": 3.820135044474166e-06, + "loss": 0.7403, + "step": 2615 + }, + { + "epoch": 1.2978776219436514, + "grad_norm": 0.13982122188113263, + "learning_rate": 3.81930496776905e-06, + "loss": 0.7424, + "step": 2616 + }, + { + "epoch": 1.298374084646891, + "grad_norm": 0.13772514732208074, + "learning_rate": 3.818474689429324e-06, + "loss": 0.7531, + "step": 2617 + }, + { + "epoch": 1.2988705473501303, + "grad_norm": 0.13901119411689639, + "learning_rate": 3.817644209581881e-06, + "loss": 0.7395, + "step": 2618 + }, + { + "epoch": 1.2993670100533699, + "grad_norm": 0.1344248019109414, + "learning_rate": 3.8168135283536485e-06, + "loss": 0.732, + "step": 2619 + }, + { + "epoch": 1.2998634727566092, + "grad_norm": 0.13540832289022223, + "learning_rate": 3.815982645871582e-06, + "loss": 0.7063, + "step": 2620 + }, + { + "epoch": 1.3003599354598485, + "grad_norm": 0.13397167892750497, + "learning_rate": 3.815151562262666e-06, + "loss": 0.7302, + "step": 2621 + }, + { + "epoch": 1.300856398163088, + "grad_norm": 0.13514380521337327, + "learning_rate": 3.8143202776539224e-06, + "loss": 0.7267, + "step": 2622 + }, + { + "epoch": 1.3013528608663274, + "grad_norm": 0.1334082478010419, + "learning_rate": 3.8134887921723975e-06, + "loss": 0.7107, + "step": 2623 + }, + { + "epoch": 1.301849323569567, + "grad_norm": 0.1359740311554934, + "learning_rate": 3.812657105945171e-06, + "loss": 0.6509, + "step": 2624 + }, + { + "epoch": 1.3023457862728063, + "grad_norm": 0.1445356509888893, + "learning_rate": 3.8118252190993533e-06, + "loss": 0.7013, + "step": 2625 + }, + { + "epoch": 1.3028422489760456, + "grad_norm": 0.13788799234754298, + "learning_rate": 3.810993131762083e-06, + "loss": 0.7414, + "step": 2626 + }, + { + "epoch": 1.3033387116792852, + "grad_norm": 0.13253611567442633, + "learning_rate": 3.810160844060533e-06, + "loss": 0.7793, + "step": 2627 + }, + { + "epoch": 1.3038351743825245, + "grad_norm": 0.133375725717547, + "learning_rate": 3.8093283561219063e-06, + "loss": 0.7065, + "step": 2628 + }, + { + "epoch": 1.304331637085764, + "grad_norm": 0.14059616928874114, + "learning_rate": 3.808495668073432e-06, + "loss": 0.7164, + "step": 2629 + }, + { + "epoch": 1.3048280997890034, + "grad_norm": 0.1307516991478391, + "learning_rate": 3.807662780042376e-06, + "loss": 0.6775, + "step": 2630 + }, + { + "epoch": 1.3053245624922427, + "grad_norm": 0.14152325120856557, + "learning_rate": 3.806829692156031e-06, + "loss": 0.7759, + "step": 2631 + }, + { + "epoch": 1.3058210251954823, + "grad_norm": 0.12924559643345673, + "learning_rate": 3.8059964045417196e-06, + "loss": 0.6981, + "step": 2632 + }, + { + "epoch": 1.3063174878987216, + "grad_norm": 0.1370915470558916, + "learning_rate": 3.805162917326799e-06, + "loss": 0.7678, + "step": 2633 + }, + { + "epoch": 1.3068139506019611, + "grad_norm": 0.14767292411211091, + "learning_rate": 3.8043292306386528e-06, + "loss": 0.6807, + "step": 2634 + }, + { + "epoch": 1.3073104133052005, + "grad_norm": 0.13677942779385033, + "learning_rate": 3.8034953446046974e-06, + "loss": 0.7533, + "step": 2635 + }, + { + "epoch": 1.3078068760084398, + "grad_norm": 0.13632882033348248, + "learning_rate": 3.8026612593523795e-06, + "loss": 0.7246, + "step": 2636 + }, + { + "epoch": 1.3083033387116794, + "grad_norm": 0.13657887532594618, + "learning_rate": 3.801826975009173e-06, + "loss": 0.7122, + "step": 2637 + }, + { + "epoch": 1.3087998014149187, + "grad_norm": 0.13144574952305246, + "learning_rate": 3.8009924917025864e-06, + "loss": 0.727, + "step": 2638 + }, + { + "epoch": 1.3092962641181582, + "grad_norm": 0.13554336516404342, + "learning_rate": 3.8001578095601583e-06, + "loss": 0.6958, + "step": 2639 + }, + { + "epoch": 1.3097927268213976, + "grad_norm": 0.13859859321388554, + "learning_rate": 3.799322928709455e-06, + "loss": 0.7051, + "step": 2640 + }, + { + "epoch": 1.310289189524637, + "grad_norm": 0.1389305915074386, + "learning_rate": 3.7984878492780754e-06, + "loss": 0.7113, + "step": 2641 + }, + { + "epoch": 1.3107856522278765, + "grad_norm": 0.13940935351453346, + "learning_rate": 3.797652571393647e-06, + "loss": 0.7732, + "step": 2642 + }, + { + "epoch": 1.3112821149311158, + "grad_norm": 0.13336829577684228, + "learning_rate": 3.79681709518383e-06, + "loss": 0.7421, + "step": 2643 + }, + { + "epoch": 1.3117785776343553, + "grad_norm": 0.1338637968337137, + "learning_rate": 3.7959814207763134e-06, + "loss": 0.7025, + "step": 2644 + }, + { + "epoch": 1.3122750403375947, + "grad_norm": 0.1322361693764335, + "learning_rate": 3.7951455482988154e-06, + "loss": 0.6567, + "step": 2645 + }, + { + "epoch": 1.312771503040834, + "grad_norm": 0.13690317157186413, + "learning_rate": 3.7943094778790866e-06, + "loss": 0.7347, + "step": 2646 + }, + { + "epoch": 1.3132679657440733, + "grad_norm": 0.13886094412827465, + "learning_rate": 3.7934732096449066e-06, + "loss": 0.7281, + "step": 2647 + }, + { + "epoch": 1.313764428447313, + "grad_norm": 0.1433387530701494, + "learning_rate": 3.792636743724085e-06, + "loss": 0.7451, + "step": 2648 + }, + { + "epoch": 1.3142608911505524, + "grad_norm": 0.13856198601421107, + "learning_rate": 3.7918000802444644e-06, + "loss": 0.7467, + "step": 2649 + }, + { + "epoch": 1.3147573538537918, + "grad_norm": 0.13420789080751208, + "learning_rate": 3.790963219333913e-06, + "loss": 0.7276, + "step": 2650 + }, + { + "epoch": 1.315253816557031, + "grad_norm": 0.1291136871280018, + "learning_rate": 3.790126161120333e-06, + "loss": 0.6971, + "step": 2651 + }, + { + "epoch": 1.3157502792602704, + "grad_norm": 0.1290712942967362, + "learning_rate": 3.789288905731655e-06, + "loss": 0.7456, + "step": 2652 + }, + { + "epoch": 1.31624674196351, + "grad_norm": 0.13062887909116458, + "learning_rate": 3.78845145329584e-06, + "loss": 0.7069, + "step": 2653 + }, + { + "epoch": 1.3167432046667495, + "grad_norm": 0.13036053551686685, + "learning_rate": 3.7876138039408784e-06, + "loss": 0.7227, + "step": 2654 + }, + { + "epoch": 1.3172396673699889, + "grad_norm": 0.13222431704901577, + "learning_rate": 3.786775957794793e-06, + "loss": 0.7432, + "step": 2655 + }, + { + "epoch": 1.3177361300732282, + "grad_norm": 0.13104899861223418, + "learning_rate": 3.7859379149856335e-06, + "loss": 0.7501, + "step": 2656 + }, + { + "epoch": 1.3182325927764675, + "grad_norm": 0.13397270135551584, + "learning_rate": 3.7850996756414832e-06, + "loss": 0.7326, + "step": 2657 + }, + { + "epoch": 1.318729055479707, + "grad_norm": 0.13613150488476347, + "learning_rate": 3.7842612398904515e-06, + "loss": 0.7147, + "step": 2658 + }, + { + "epoch": 1.3192255181829466, + "grad_norm": 0.14339490468621394, + "learning_rate": 3.783422607860681e-06, + "loss": 0.7324, + "step": 2659 + }, + { + "epoch": 1.319721980886186, + "grad_norm": 0.13378917893428685, + "learning_rate": 3.7825837796803438e-06, + "loss": 0.7268, + "step": 2660 + }, + { + "epoch": 1.3202184435894253, + "grad_norm": 0.13360282147099303, + "learning_rate": 3.7817447554776397e-06, + "loss": 0.7304, + "step": 2661 + }, + { + "epoch": 1.3207149062926646, + "grad_norm": 0.13904280037330674, + "learning_rate": 3.780905535380801e-06, + "loss": 0.7572, + "step": 2662 + }, + { + "epoch": 1.3212113689959042, + "grad_norm": 0.13011808148888576, + "learning_rate": 3.78006611951809e-06, + "loss": 0.6836, + "step": 2663 + }, + { + "epoch": 1.3217078316991435, + "grad_norm": 0.13510584692748015, + "learning_rate": 3.779226508017796e-06, + "loss": 0.7061, + "step": 2664 + }, + { + "epoch": 1.322204294402383, + "grad_norm": 0.1371054013890361, + "learning_rate": 3.778386701008241e-06, + "loss": 0.751, + "step": 2665 + }, + { + "epoch": 1.3227007571056224, + "grad_norm": 0.13439056933714283, + "learning_rate": 3.7775466986177763e-06, + "loss": 0.7382, + "step": 2666 + }, + { + "epoch": 1.3231972198088617, + "grad_norm": 0.13676259742243488, + "learning_rate": 3.776706500974783e-06, + "loss": 0.7439, + "step": 2667 + }, + { + "epoch": 1.3236936825121013, + "grad_norm": 0.1384039615266504, + "learning_rate": 3.775866108207671e-06, + "loss": 0.7215, + "step": 2668 + }, + { + "epoch": 1.3241901452153406, + "grad_norm": 0.14031483696025457, + "learning_rate": 3.7750255204448817e-06, + "loss": 0.7596, + "step": 2669 + }, + { + "epoch": 1.3246866079185802, + "grad_norm": 0.13734515606529832, + "learning_rate": 3.7741847378148845e-06, + "loss": 0.7511, + "step": 2670 + }, + { + "epoch": 1.3251830706218195, + "grad_norm": 0.130610747808502, + "learning_rate": 3.7733437604461804e-06, + "loss": 0.675, + "step": 2671 + }, + { + "epoch": 1.3256795333250588, + "grad_norm": 0.13229432283223813, + "learning_rate": 3.7725025884672987e-06, + "loss": 0.7204, + "step": 2672 + }, + { + "epoch": 1.3261759960282984, + "grad_norm": 0.13459703835228942, + "learning_rate": 3.7716612220068004e-06, + "loss": 0.6899, + "step": 2673 + }, + { + "epoch": 1.3266724587315377, + "grad_norm": 0.13416054723437704, + "learning_rate": 3.770819661193273e-06, + "loss": 0.6853, + "step": 2674 + }, + { + "epoch": 1.3271689214347773, + "grad_norm": 0.1361667362155641, + "learning_rate": 3.7699779061553365e-06, + "loss": 0.7009, + "step": 2675 + }, + { + "epoch": 1.3276653841380166, + "grad_norm": 0.13873053615880002, + "learning_rate": 3.7691359570216404e-06, + "loss": 0.6775, + "step": 2676 + }, + { + "epoch": 1.328161846841256, + "grad_norm": 0.13977475275303883, + "learning_rate": 3.7682938139208615e-06, + "loss": 0.7638, + "step": 2677 + }, + { + "epoch": 1.3286583095444955, + "grad_norm": 0.13414664404632431, + "learning_rate": 3.7674514769817083e-06, + "loss": 0.7539, + "step": 2678 + }, + { + "epoch": 1.3291547722477348, + "grad_norm": 0.14091362715741365, + "learning_rate": 3.7666089463329196e-06, + "loss": 0.746, + "step": 2679 + }, + { + "epoch": 1.3296512349509744, + "grad_norm": 0.1470455702766219, + "learning_rate": 3.765766222103262e-06, + "loss": 0.7381, + "step": 2680 + }, + { + "epoch": 1.3301476976542137, + "grad_norm": 0.13534175218735514, + "learning_rate": 3.7649233044215314e-06, + "loss": 0.7117, + "step": 2681 + }, + { + "epoch": 1.330644160357453, + "grad_norm": 0.13478773220973986, + "learning_rate": 3.764080193416556e-06, + "loss": 0.7127, + "step": 2682 + }, + { + "epoch": 1.3311406230606926, + "grad_norm": 0.14594060460245392, + "learning_rate": 3.7632368892171916e-06, + "loss": 0.7218, + "step": 2683 + }, + { + "epoch": 1.331637085763932, + "grad_norm": 0.13227694228798864, + "learning_rate": 3.7623933919523226e-06, + "loss": 0.6883, + "step": 2684 + }, + { + "epoch": 1.3321335484671715, + "grad_norm": 0.1331372361004289, + "learning_rate": 3.761549701750865e-06, + "loss": 0.7251, + "step": 2685 + }, + { + "epoch": 1.3326300111704108, + "grad_norm": 0.13633213773374722, + "learning_rate": 3.7607058187417624e-06, + "loss": 0.6906, + "step": 2686 + }, + { + "epoch": 1.3331264738736501, + "grad_norm": 0.14075233741360202, + "learning_rate": 3.7598617430539886e-06, + "loss": 0.7518, + "step": 2687 + }, + { + "epoch": 1.3336229365768897, + "grad_norm": 0.14126613421524764, + "learning_rate": 3.7590174748165487e-06, + "loss": 0.7323, + "step": 2688 + }, + { + "epoch": 1.334119399280129, + "grad_norm": 0.13817784028859825, + "learning_rate": 3.758173014158475e-06, + "loss": 0.7186, + "step": 2689 + }, + { + "epoch": 1.3346158619833686, + "grad_norm": 0.14776049205061048, + "learning_rate": 3.757328361208828e-06, + "loss": 0.7606, + "step": 2690 + }, + { + "epoch": 1.335112324686608, + "grad_norm": 0.1335821510798814, + "learning_rate": 3.756483516096702e-06, + "loss": 0.7053, + "step": 2691 + }, + { + "epoch": 1.3356087873898472, + "grad_norm": 0.1383320848994581, + "learning_rate": 3.7556384789512156e-06, + "loss": 0.7501, + "step": 2692 + }, + { + "epoch": 1.3361052500930868, + "grad_norm": 0.14093892215142187, + "learning_rate": 3.754793249901521e-06, + "loss": 0.7381, + "step": 2693 + }, + { + "epoch": 1.3366017127963261, + "grad_norm": 0.13669736220023862, + "learning_rate": 3.753947829076797e-06, + "loss": 0.6916, + "step": 2694 + }, + { + "epoch": 1.3370981754995657, + "grad_norm": 0.13362283897482466, + "learning_rate": 3.7531022166062538e-06, + "loss": 0.7439, + "step": 2695 + }, + { + "epoch": 1.337594638202805, + "grad_norm": 0.13205258206921125, + "learning_rate": 3.7522564126191276e-06, + "loss": 0.6928, + "step": 2696 + }, + { + "epoch": 1.3380911009060443, + "grad_norm": 0.13796484578752577, + "learning_rate": 3.751410417244687e-06, + "loss": 0.8034, + "step": 2697 + }, + { + "epoch": 1.3385875636092839, + "grad_norm": 0.13414239398576636, + "learning_rate": 3.7505642306122293e-06, + "loss": 0.7181, + "step": 2698 + }, + { + "epoch": 1.3390840263125232, + "grad_norm": 0.14145403843670953, + "learning_rate": 3.7497178528510803e-06, + "loss": 0.691, + "step": 2699 + }, + { + "epoch": 1.3395804890157628, + "grad_norm": 0.1379866463002123, + "learning_rate": 3.7488712840905955e-06, + "loss": 0.7399, + "step": 2700 + }, + { + "epoch": 1.340076951719002, + "grad_norm": 0.13204289710931297, + "learning_rate": 3.7480245244601587e-06, + "loss": 0.7253, + "step": 2701 + }, + { + "epoch": 1.3405734144222414, + "grad_norm": 0.1397985609990718, + "learning_rate": 3.747177574089184e-06, + "loss": 0.7438, + "step": 2702 + }, + { + "epoch": 1.341069877125481, + "grad_norm": 0.14161488684867624, + "learning_rate": 3.746330433107114e-06, + "loss": 0.7537, + "step": 2703 + }, + { + "epoch": 1.3415663398287203, + "grad_norm": 0.14437747804639947, + "learning_rate": 3.7454831016434206e-06, + "loss": 0.7241, + "step": 2704 + }, + { + "epoch": 1.3420628025319599, + "grad_norm": 0.13861952176789427, + "learning_rate": 3.744635579827606e-06, + "loss": 0.7398, + "step": 2705 + }, + { + "epoch": 1.3425592652351992, + "grad_norm": 0.1399777307376627, + "learning_rate": 3.7437878677891977e-06, + "loss": 0.7055, + "step": 2706 + }, + { + "epoch": 1.3430557279384385, + "grad_norm": 0.1370426735312947, + "learning_rate": 3.742939965657757e-06, + "loss": 0.7354, + "step": 2707 + }, + { + "epoch": 1.343552190641678, + "grad_norm": 0.1457622692943864, + "learning_rate": 3.7420918735628714e-06, + "loss": 0.7592, + "step": 2708 + }, + { + "epoch": 1.3440486533449174, + "grad_norm": 0.1301536298141118, + "learning_rate": 3.741243591634159e-06, + "loss": 0.7435, + "step": 2709 + }, + { + "epoch": 1.344545116048157, + "grad_norm": 0.13359769838918073, + "learning_rate": 3.7403951200012645e-06, + "loss": 0.7271, + "step": 2710 + }, + { + "epoch": 1.3450415787513963, + "grad_norm": 0.1361918708494249, + "learning_rate": 3.7395464587938652e-06, + "loss": 0.7421, + "step": 2711 + }, + { + "epoch": 1.3455380414546356, + "grad_norm": 0.13334017361026135, + "learning_rate": 3.738697608141664e-06, + "loss": 0.6886, + "step": 2712 + }, + { + "epoch": 1.3460345041578752, + "grad_norm": 0.1387171887689253, + "learning_rate": 3.7378485681743935e-06, + "loss": 0.6873, + "step": 2713 + }, + { + "epoch": 1.3465309668611145, + "grad_norm": 0.14983174029057572, + "learning_rate": 3.7369993390218172e-06, + "loss": 0.7513, + "step": 2714 + }, + { + "epoch": 1.347027429564354, + "grad_norm": 0.1424875756746005, + "learning_rate": 3.736149920813726e-06, + "loss": 0.6993, + "step": 2715 + }, + { + "epoch": 1.3475238922675934, + "grad_norm": 0.13814327332815962, + "learning_rate": 3.7353003136799394e-06, + "loss": 0.7383, + "step": 2716 + }, + { + "epoch": 1.3480203549708327, + "grad_norm": 0.13291744742349632, + "learning_rate": 3.7344505177503064e-06, + "loss": 0.6902, + "step": 2717 + }, + { + "epoch": 1.3485168176740723, + "grad_norm": 0.1294966956204128, + "learning_rate": 3.733600533154705e-06, + "loss": 0.6753, + "step": 2718 + }, + { + "epoch": 1.3490132803773116, + "grad_norm": 0.13384401858376338, + "learning_rate": 3.732750360023041e-06, + "loss": 0.7194, + "step": 2719 + }, + { + "epoch": 1.3495097430805512, + "grad_norm": 0.1510726665603985, + "learning_rate": 3.73189999848525e-06, + "loss": 0.7286, + "step": 2720 + }, + { + "epoch": 1.3500062057837905, + "grad_norm": 0.17389508520864652, + "learning_rate": 3.7310494486712966e-06, + "loss": 0.7334, + "step": 2721 + }, + { + "epoch": 1.3505026684870298, + "grad_norm": 0.14457583333245747, + "learning_rate": 3.730198710711173e-06, + "loss": 0.7759, + "step": 2722 + }, + { + "epoch": 1.3509991311902694, + "grad_norm": 0.1304606464207145, + "learning_rate": 3.729347784734901e-06, + "loss": 0.7004, + "step": 2723 + }, + { + "epoch": 1.3514955938935087, + "grad_norm": 0.13664237040256322, + "learning_rate": 3.7284966708725316e-06, + "loss": 0.7148, + "step": 2724 + }, + { + "epoch": 1.3519920565967483, + "grad_norm": 0.1319353306336256, + "learning_rate": 3.727645369254144e-06, + "loss": 0.7099, + "step": 2725 + }, + { + "epoch": 1.3524885192999876, + "grad_norm": 0.13747258866291462, + "learning_rate": 3.7267938800098454e-06, + "loss": 0.7371, + "step": 2726 + }, + { + "epoch": 1.352984982003227, + "grad_norm": 0.13461034808749692, + "learning_rate": 3.7259422032697724e-06, + "loss": 0.7092, + "step": 2727 + }, + { + "epoch": 1.3534814447064665, + "grad_norm": 0.13212884182845225, + "learning_rate": 3.7250903391640893e-06, + "loss": 0.69, + "step": 2728 + }, + { + "epoch": 1.3539779074097058, + "grad_norm": 0.14143560310569692, + "learning_rate": 3.724238287822991e-06, + "loss": 0.7601, + "step": 2729 + }, + { + "epoch": 1.3544743701129454, + "grad_norm": 0.13608277738602498, + "learning_rate": 3.723386049376699e-06, + "loss": 0.7386, + "step": 2730 + }, + { + "epoch": 1.3549708328161847, + "grad_norm": 0.13308627959339595, + "learning_rate": 3.7225336239554655e-06, + "loss": 0.6834, + "step": 2731 + }, + { + "epoch": 1.355467295519424, + "grad_norm": 0.13859104882727302, + "learning_rate": 3.72168101168957e-06, + "loss": 0.7224, + "step": 2732 + }, + { + "epoch": 1.3559637582226636, + "grad_norm": 0.13567729073653412, + "learning_rate": 3.7208282127093197e-06, + "loss": 0.6887, + "step": 2733 + }, + { + "epoch": 1.356460220925903, + "grad_norm": 0.13365628178016073, + "learning_rate": 3.7199752271450514e-06, + "loss": 0.7077, + "step": 2734 + }, + { + "epoch": 1.3569566836291425, + "grad_norm": 0.13570402215298025, + "learning_rate": 3.71912205512713e-06, + "loss": 0.709, + "step": 2735 + }, + { + "epoch": 1.3574531463323818, + "grad_norm": 0.13867332477147382, + "learning_rate": 3.71826869678595e-06, + "loss": 0.7279, + "step": 2736 + }, + { + "epoch": 1.3579496090356211, + "grad_norm": 0.14117778740626497, + "learning_rate": 3.717415152251933e-06, + "loss": 0.7077, + "step": 2737 + }, + { + "epoch": 1.3584460717388607, + "grad_norm": 0.13314975615492863, + "learning_rate": 3.71656142165553e-06, + "loss": 0.7517, + "step": 2738 + }, + { + "epoch": 1.3589425344421, + "grad_norm": 0.13579612596374233, + "learning_rate": 3.7157075051272196e-06, + "loss": 0.7127, + "step": 2739 + }, + { + "epoch": 1.3594389971453396, + "grad_norm": 0.13878702089854905, + "learning_rate": 3.71485340279751e-06, + "loss": 0.7418, + "step": 2740 + }, + { + "epoch": 1.3599354598485789, + "grad_norm": 0.14239282194169628, + "learning_rate": 3.7139991147969363e-06, + "loss": 0.7374, + "step": 2741 + }, + { + "epoch": 1.3604319225518182, + "grad_norm": 0.14056420662523464, + "learning_rate": 3.7131446412560624e-06, + "loss": 0.7458, + "step": 2742 + }, + { + "epoch": 1.3609283852550578, + "grad_norm": 0.13228428795282451, + "learning_rate": 3.7122899823054815e-06, + "loss": 0.7236, + "step": 2743 + }, + { + "epoch": 1.361424847958297, + "grad_norm": 0.13281946270383405, + "learning_rate": 3.7114351380758145e-06, + "loss": 0.6975, + "step": 2744 + }, + { + "epoch": 1.3619213106615367, + "grad_norm": 0.13670672603836637, + "learning_rate": 3.71058010869771e-06, + "loss": 0.7332, + "step": 2745 + }, + { + "epoch": 1.362417773364776, + "grad_norm": 0.14071995024167427, + "learning_rate": 3.7097248943018467e-06, + "loss": 0.7082, + "step": 2746 + }, + { + "epoch": 1.3629142360680153, + "grad_norm": 0.13414480406046128, + "learning_rate": 3.7088694950189297e-06, + "loss": 0.7126, + "step": 2747 + }, + { + "epoch": 1.3634106987712549, + "grad_norm": 0.14516674208748181, + "learning_rate": 3.7080139109796933e-06, + "loss": 0.7322, + "step": 2748 + }, + { + "epoch": 1.3639071614744942, + "grad_norm": 0.13482497758832893, + "learning_rate": 3.7071581423148996e-06, + "loss": 0.7323, + "step": 2749 + }, + { + "epoch": 1.3644036241777338, + "grad_norm": 0.13236613491037902, + "learning_rate": 3.7063021891553384e-06, + "loss": 0.7301, + "step": 2750 + }, + { + "epoch": 1.364900086880973, + "grad_norm": 0.13863145388334375, + "learning_rate": 3.7054460516318302e-06, + "loss": 0.6891, + "step": 2751 + }, + { + "epoch": 1.3653965495842124, + "grad_norm": 0.1305802200578501, + "learning_rate": 3.7045897298752196e-06, + "loss": 0.754, + "step": 2752 + }, + { + "epoch": 1.365893012287452, + "grad_norm": 0.1320898504801369, + "learning_rate": 3.703733224016384e-06, + "loss": 0.7232, + "step": 2753 + }, + { + "epoch": 1.3663894749906913, + "grad_norm": 0.1382071992088785, + "learning_rate": 3.7028765341862256e-06, + "loss": 0.6633, + "step": 2754 + }, + { + "epoch": 1.3668859376939309, + "grad_norm": 0.13239411486453712, + "learning_rate": 3.702019660515675e-06, + "loss": 0.6814, + "step": 2755 + }, + { + "epoch": 1.3673824003971702, + "grad_norm": 0.13222359579600135, + "learning_rate": 3.7011626031356924e-06, + "loss": 0.7755, + "step": 2756 + }, + { + "epoch": 1.3678788631004095, + "grad_norm": 0.15096634032752865, + "learning_rate": 3.7003053621772655e-06, + "loss": 0.7288, + "step": 2757 + }, + { + "epoch": 1.368375325803649, + "grad_norm": 0.13367062633330973, + "learning_rate": 3.699447937771409e-06, + "loss": 0.7281, + "step": 2758 + }, + { + "epoch": 1.3688717885068884, + "grad_norm": 0.13930251553734874, + "learning_rate": 3.698590330049167e-06, + "loss": 0.7536, + "step": 2759 + }, + { + "epoch": 1.369368251210128, + "grad_norm": 0.1333947524333695, + "learning_rate": 3.697732539141611e-06, + "loss": 0.7267, + "step": 2760 + }, + { + "epoch": 1.3698647139133673, + "grad_norm": 0.13373105854598036, + "learning_rate": 3.6968745651798404e-06, + "loss": 0.7416, + "step": 2761 + }, + { + "epoch": 1.3703611766166066, + "grad_norm": 0.15126475875952416, + "learning_rate": 3.6960164082949827e-06, + "loss": 0.755, + "step": 2762 + }, + { + "epoch": 1.3708576393198462, + "grad_norm": 0.32186904582209985, + "learning_rate": 3.6951580686181944e-06, + "loss": 0.7316, + "step": 2763 + }, + { + "epoch": 1.3713541020230855, + "grad_norm": 0.1458070721124448, + "learning_rate": 3.6942995462806574e-06, + "loss": 0.7678, + "step": 2764 + }, + { + "epoch": 1.371850564726325, + "grad_norm": 0.13289186710198236, + "learning_rate": 3.693440841413585e-06, + "loss": 0.7255, + "step": 2765 + }, + { + "epoch": 1.3723470274295644, + "grad_norm": 0.14607683472265695, + "learning_rate": 3.6925819541482142e-06, + "loss": 0.7241, + "step": 2766 + }, + { + "epoch": 1.3728434901328037, + "grad_norm": 0.13565639692027404, + "learning_rate": 3.691722884615814e-06, + "loss": 0.7772, + "step": 2767 + }, + { + "epoch": 1.3733399528360433, + "grad_norm": 0.1395165105118244, + "learning_rate": 3.690863632947678e-06, + "loss": 0.7336, + "step": 2768 + }, + { + "epoch": 1.3738364155392826, + "grad_norm": 0.13691536096706877, + "learning_rate": 3.69000419927513e-06, + "loss": 0.7331, + "step": 2769 + }, + { + "epoch": 1.3743328782425221, + "grad_norm": 0.13120428601535297, + "learning_rate": 3.6891445837295215e-06, + "loss": 0.7361, + "step": 2770 + }, + { + "epoch": 1.3748293409457615, + "grad_norm": 0.13462822014791612, + "learning_rate": 3.6882847864422287e-06, + "loss": 0.7201, + "step": 2771 + }, + { + "epoch": 1.3753258036490008, + "grad_norm": 0.15260067191313678, + "learning_rate": 3.687424807544659e-06, + "loss": 0.7355, + "step": 2772 + }, + { + "epoch": 1.3758222663522404, + "grad_norm": 0.1356268994256984, + "learning_rate": 3.686564647168247e-06, + "loss": 0.6913, + "step": 2773 + }, + { + "epoch": 1.3763187290554797, + "grad_norm": 0.1317137965843151, + "learning_rate": 3.6857043054444534e-06, + "loss": 0.6801, + "step": 2774 + }, + { + "epoch": 1.3768151917587192, + "grad_norm": 0.13155836504147503, + "learning_rate": 3.6848437825047678e-06, + "loss": 0.701, + "step": 2775 + }, + { + "epoch": 1.3773116544619586, + "grad_norm": 0.1271126456023588, + "learning_rate": 3.6839830784807086e-06, + "loss": 0.6839, + "step": 2776 + }, + { + "epoch": 1.377808117165198, + "grad_norm": 0.14089310593605184, + "learning_rate": 3.6831221935038185e-06, + "loss": 0.7237, + "step": 2777 + }, + { + "epoch": 1.3783045798684375, + "grad_norm": 0.13973014329965902, + "learning_rate": 3.682261127705671e-06, + "loss": 0.6941, + "step": 2778 + }, + { + "epoch": 1.3788010425716768, + "grad_norm": 0.12910171833031905, + "learning_rate": 3.6813998812178665e-06, + "loss": 0.6799, + "step": 2779 + }, + { + "epoch": 1.3792975052749163, + "grad_norm": 0.13356773763189392, + "learning_rate": 3.680538454172033e-06, + "loss": 0.6942, + "step": 2780 + }, + { + "epoch": 1.3797939679781557, + "grad_norm": 0.14461559430138288, + "learning_rate": 3.6796768466998256e-06, + "loss": 0.728, + "step": 2781 + }, + { + "epoch": 1.380290430681395, + "grad_norm": 0.14044495895998144, + "learning_rate": 3.678815058932926e-06, + "loss": 0.7567, + "step": 2782 + }, + { + "epoch": 1.3807868933846346, + "grad_norm": 0.13731082360762506, + "learning_rate": 3.6779530910030455e-06, + "loss": 0.715, + "step": 2783 + }, + { + "epoch": 1.381283356087874, + "grad_norm": 0.1445938769341219, + "learning_rate": 3.6770909430419216e-06, + "loss": 0.7439, + "step": 2784 + }, + { + "epoch": 1.3817798187911134, + "grad_norm": 0.13607123879339897, + "learning_rate": 3.6762286151813207e-06, + "loss": 0.7089, + "step": 2785 + }, + { + "epoch": 1.3822762814943528, + "grad_norm": 0.1425234259681637, + "learning_rate": 3.6753661075530363e-06, + "loss": 0.7555, + "step": 2786 + }, + { + "epoch": 1.382772744197592, + "grad_norm": 0.13691972240341396, + "learning_rate": 3.6745034202888868e-06, + "loss": 0.6762, + "step": 2787 + }, + { + "epoch": 1.3832692069008314, + "grad_norm": 0.1359578557968317, + "learning_rate": 3.6736405535207215e-06, + "loss": 0.6988, + "step": 2788 + }, + { + "epoch": 1.383765669604071, + "grad_norm": 0.14251525701942244, + "learning_rate": 3.672777507380416e-06, + "loss": 0.8063, + "step": 2789 + }, + { + "epoch": 1.3842621323073105, + "grad_norm": 0.1361191613459699, + "learning_rate": 3.671914281999872e-06, + "loss": 0.7197, + "step": 2790 + }, + { + "epoch": 1.3847585950105499, + "grad_norm": 0.13112963174458994, + "learning_rate": 3.6710508775110204e-06, + "loss": 0.7006, + "step": 2791 + }, + { + "epoch": 1.3852550577137892, + "grad_norm": 0.13997501686830185, + "learning_rate": 3.670187294045819e-06, + "loss": 0.7215, + "step": 2792 + }, + { + "epoch": 1.3857515204170285, + "grad_norm": 0.13735338515895265, + "learning_rate": 3.6693235317362513e-06, + "loss": 0.6828, + "step": 2793 + }, + { + "epoch": 1.386247983120268, + "grad_norm": 0.13804648308059947, + "learning_rate": 3.6684595907143307e-06, + "loss": 0.6705, + "step": 2794 + }, + { + "epoch": 1.3867444458235076, + "grad_norm": 0.14859762749120925, + "learning_rate": 3.6675954711120964e-06, + "loss": 0.816, + "step": 2795 + }, + { + "epoch": 1.387240908526747, + "grad_norm": 0.13538015684912, + "learning_rate": 3.666731173061616e-06, + "loss": 0.7397, + "step": 2796 + }, + { + "epoch": 1.3877373712299863, + "grad_norm": 0.1310415765785277, + "learning_rate": 3.6658666966949823e-06, + "loss": 0.6973, + "step": 2797 + }, + { + "epoch": 1.3882338339332256, + "grad_norm": 0.13540679229302185, + "learning_rate": 3.665002042144318e-06, + "loss": 0.7459, + "step": 2798 + }, + { + "epoch": 1.3887302966364652, + "grad_norm": 0.14371652911821897, + "learning_rate": 3.6641372095417703e-06, + "loss": 0.7883, + "step": 2799 + }, + { + "epoch": 1.3892267593397047, + "grad_norm": 0.13703857411739195, + "learning_rate": 3.663272199019516e-06, + "loss": 0.6883, + "step": 2800 + }, + { + "epoch": 1.389723222042944, + "grad_norm": 0.13173661216385477, + "learning_rate": 3.662407010709757e-06, + "loss": 0.6969, + "step": 2801 + }, + { + "epoch": 1.3902196847461834, + "grad_norm": 0.12981624688845955, + "learning_rate": 3.661541644744725e-06, + "loss": 0.6822, + "step": 2802 + }, + { + "epoch": 1.3907161474494227, + "grad_norm": 0.13538269173616568, + "learning_rate": 3.660676101256676e-06, + "loss": 0.6703, + "step": 2803 + }, + { + "epoch": 1.3912126101526623, + "grad_norm": 0.13513544679451667, + "learning_rate": 3.659810380377895e-06, + "loss": 0.7126, + "step": 2804 + }, + { + "epoch": 1.3917090728559016, + "grad_norm": 0.13634117970671397, + "learning_rate": 3.6589444822406938e-06, + "loss": 0.7628, + "step": 2805 + }, + { + "epoch": 1.3922055355591412, + "grad_norm": 0.13469088579151892, + "learning_rate": 3.6580784069774104e-06, + "loss": 0.6827, + "step": 2806 + }, + { + "epoch": 1.3927019982623805, + "grad_norm": 0.14097471598925498, + "learning_rate": 3.657212154720411e-06, + "loss": 0.762, + "step": 2807 + }, + { + "epoch": 1.3931984609656198, + "grad_norm": 0.13688984690687683, + "learning_rate": 3.656345725602089e-06, + "loss": 0.723, + "step": 2808 + }, + { + "epoch": 1.3936949236688594, + "grad_norm": 0.13703297547170337, + "learning_rate": 3.6554791197548624e-06, + "loss": 0.7123, + "step": 2809 + }, + { + "epoch": 1.3941913863720987, + "grad_norm": 0.13555651922489106, + "learning_rate": 3.654612337311179e-06, + "loss": 0.7277, + "step": 2810 + }, + { + "epoch": 1.3946878490753383, + "grad_norm": 0.13430071149047185, + "learning_rate": 3.6537453784035133e-06, + "loss": 0.7369, + "step": 2811 + }, + { + "epoch": 1.3951843117785776, + "grad_norm": 0.1342596803538824, + "learning_rate": 3.6528782431643652e-06, + "loss": 0.7032, + "step": 2812 + }, + { + "epoch": 1.395680774481817, + "grad_norm": 0.14006820605549658, + "learning_rate": 3.6520109317262624e-06, + "loss": 0.711, + "step": 2813 + }, + { + "epoch": 1.3961772371850565, + "grad_norm": 0.13345080058461264, + "learning_rate": 3.65114344422176e-06, + "loss": 0.7336, + "step": 2814 + }, + { + "epoch": 1.3966736998882958, + "grad_norm": 0.13603896308716434, + "learning_rate": 3.6502757807834392e-06, + "loss": 0.7395, + "step": 2815 + }, + { + "epoch": 1.3971701625915354, + "grad_norm": 0.15103712063012376, + "learning_rate": 3.6494079415439087e-06, + "loss": 0.7604, + "step": 2816 + }, + { + "epoch": 1.3976666252947747, + "grad_norm": 0.14018161536599755, + "learning_rate": 3.6485399266358033e-06, + "loss": 0.748, + "step": 2817 + }, + { + "epoch": 1.398163087998014, + "grad_norm": 0.13994970770354836, + "learning_rate": 3.6476717361917867e-06, + "loss": 0.7203, + "step": 2818 + }, + { + "epoch": 1.3986595507012536, + "grad_norm": 0.1316527442596197, + "learning_rate": 3.6468033703445456e-06, + "loss": 0.7076, + "step": 2819 + }, + { + "epoch": 1.399156013404493, + "grad_norm": 0.1369640741315148, + "learning_rate": 3.645934829226797e-06, + "loss": 0.7325, + "step": 2820 + }, + { + "epoch": 1.3996524761077325, + "grad_norm": 0.12996187838882917, + "learning_rate": 3.6450661129712837e-06, + "loss": 0.7624, + "step": 2821 + }, + { + "epoch": 1.4001489388109718, + "grad_norm": 0.1481187395661545, + "learning_rate": 3.644197221710775e-06, + "loss": 0.7542, + "step": 2822 + }, + { + "epoch": 1.4006454015142111, + "grad_norm": 0.13864790575291003, + "learning_rate": 3.6433281555780666e-06, + "loss": 0.7592, + "step": 2823 + }, + { + "epoch": 1.4011418642174507, + "grad_norm": 0.13723959229827493, + "learning_rate": 3.6424589147059817e-06, + "loss": 0.7052, + "step": 2824 + }, + { + "epoch": 1.40163832692069, + "grad_norm": 0.14601293853858663, + "learning_rate": 3.641589499227369e-06, + "loss": 0.7408, + "step": 2825 + }, + { + "epoch": 1.4021347896239296, + "grad_norm": 0.1381724745946492, + "learning_rate": 3.6407199092751055e-06, + "loss": 0.7931, + "step": 2826 + }, + { + "epoch": 1.402631252327169, + "grad_norm": 0.14149071737598315, + "learning_rate": 3.6398501449820937e-06, + "loss": 0.6858, + "step": 2827 + }, + { + "epoch": 1.4031277150304082, + "grad_norm": 0.13297864017352057, + "learning_rate": 3.638980206481264e-06, + "loss": 0.7066, + "step": 2828 + }, + { + "epoch": 1.4036241777336478, + "grad_norm": 0.13269406137410125, + "learning_rate": 3.638110093905572e-06, + "loss": 0.7504, + "step": 2829 + }, + { + "epoch": 1.4041206404368871, + "grad_norm": 0.13910245211978362, + "learning_rate": 3.6372398073880006e-06, + "loss": 0.7451, + "step": 2830 + }, + { + "epoch": 1.4046171031401267, + "grad_norm": 0.13944456861859922, + "learning_rate": 3.636369347061558e-06, + "loss": 0.7568, + "step": 2831 + }, + { + "epoch": 1.405113565843366, + "grad_norm": 0.13566144602099214, + "learning_rate": 3.6354987130592814e-06, + "loss": 0.7047, + "step": 2832 + }, + { + "epoch": 1.4056100285466053, + "grad_norm": 0.13220561100847655, + "learning_rate": 3.634627905514232e-06, + "loss": 0.7132, + "step": 2833 + }, + { + "epoch": 1.4061064912498449, + "grad_norm": 0.13332119323744976, + "learning_rate": 3.6337569245595007e-06, + "loss": 0.6862, + "step": 2834 + }, + { + "epoch": 1.4066029539530842, + "grad_norm": 0.14000698566512068, + "learning_rate": 3.632885770328202e-06, + "loss": 0.7582, + "step": 2835 + }, + { + "epoch": 1.4070994166563238, + "grad_norm": 0.1366951831127539, + "learning_rate": 3.6320144429534764e-06, + "loss": 0.7625, + "step": 2836 + }, + { + "epoch": 1.407595879359563, + "grad_norm": 0.13797275616542223, + "learning_rate": 3.631142942568495e-06, + "loss": 0.715, + "step": 2837 + }, + { + "epoch": 1.4080923420628024, + "grad_norm": 0.13373290920899406, + "learning_rate": 3.630271269306451e-06, + "loss": 0.6955, + "step": 2838 + }, + { + "epoch": 1.408588804766042, + "grad_norm": 0.13580255325284915, + "learning_rate": 3.629399423300566e-06, + "loss": 0.7029, + "step": 2839 + }, + { + "epoch": 1.4090852674692813, + "grad_norm": 0.13617036873069288, + "learning_rate": 3.628527404684088e-06, + "loss": 0.7225, + "step": 2840 + }, + { + "epoch": 1.4095817301725209, + "grad_norm": 0.1303825802583835, + "learning_rate": 3.6276552135902897e-06, + "loss": 0.7193, + "step": 2841 + }, + { + "epoch": 1.4100781928757602, + "grad_norm": 0.13392854436162474, + "learning_rate": 3.626782850152473e-06, + "loss": 0.6943, + "step": 2842 + }, + { + "epoch": 1.4105746555789995, + "grad_norm": 0.13806544562439935, + "learning_rate": 3.625910314503965e-06, + "loss": 0.7319, + "step": 2843 + }, + { + "epoch": 1.411071118282239, + "grad_norm": 0.14112019047589364, + "learning_rate": 3.625037606778117e-06, + "loss": 0.7376, + "step": 2844 + }, + { + "epoch": 1.4115675809854784, + "grad_norm": 0.1293965423295043, + "learning_rate": 3.62416472710831e-06, + "loss": 0.6752, + "step": 2845 + }, + { + "epoch": 1.412064043688718, + "grad_norm": 0.13458946982919687, + "learning_rate": 3.6232916756279497e-06, + "loss": 0.7362, + "step": 2846 + }, + { + "epoch": 1.4125605063919573, + "grad_norm": 0.13524959671009265, + "learning_rate": 3.6224184524704665e-06, + "loss": 0.7239, + "step": 2847 + }, + { + "epoch": 1.4130569690951966, + "grad_norm": 0.13614099418088066, + "learning_rate": 3.6215450577693196e-06, + "loss": 0.6924, + "step": 2848 + }, + { + "epoch": 1.4135534317984362, + "grad_norm": 0.1392998440162914, + "learning_rate": 3.6206714916579925e-06, + "loss": 0.7753, + "step": 2849 + }, + { + "epoch": 1.4140498945016755, + "grad_norm": 0.1391830902646412, + "learning_rate": 3.6197977542699974e-06, + "loss": 0.7125, + "step": 2850 + }, + { + "epoch": 1.414546357204915, + "grad_norm": 0.13496515125842937, + "learning_rate": 3.6189238457388704e-06, + "loss": 0.7242, + "step": 2851 + }, + { + "epoch": 1.4150428199081544, + "grad_norm": 0.13540843902758287, + "learning_rate": 3.6180497661981733e-06, + "loss": 0.7189, + "step": 2852 + }, + { + "epoch": 1.4155392826113937, + "grad_norm": 0.1387877546863783, + "learning_rate": 3.617175515781497e-06, + "loss": 0.7011, + "step": 2853 + }, + { + "epoch": 1.4160357453146333, + "grad_norm": 0.132229382194975, + "learning_rate": 3.6163010946224552e-06, + "loss": 0.7071, + "step": 2854 + }, + { + "epoch": 1.4165322080178726, + "grad_norm": 0.13602711318145774, + "learning_rate": 3.615426502854689e-06, + "loss": 0.6891, + "step": 2855 + }, + { + "epoch": 1.4170286707211122, + "grad_norm": 0.13600373841927585, + "learning_rate": 3.6145517406118673e-06, + "loss": 0.7299, + "step": 2856 + }, + { + "epoch": 1.4175251334243515, + "grad_norm": 0.1324714432015326, + "learning_rate": 3.613676808027682e-06, + "loss": 0.715, + "step": 2857 + }, + { + "epoch": 1.4180215961275908, + "grad_norm": 0.132452376428356, + "learning_rate": 3.6128017052358535e-06, + "loss": 0.7385, + "step": 2858 + }, + { + "epoch": 1.4185180588308304, + "grad_norm": 0.13456443333348478, + "learning_rate": 3.6119264323701257e-06, + "loss": 0.7005, + "step": 2859 + }, + { + "epoch": 1.4190145215340697, + "grad_norm": 0.13156032063720458, + "learning_rate": 3.611050989564272e-06, + "loss": 0.7155, + "step": 2860 + }, + { + "epoch": 1.4195109842373093, + "grad_norm": 0.1342914875531962, + "learning_rate": 3.6101753769520885e-06, + "loss": 0.7072, + "step": 2861 + }, + { + "epoch": 1.4200074469405486, + "grad_norm": 0.13305702646687323, + "learning_rate": 3.6092995946673996e-06, + "loss": 0.7399, + "step": 2862 + }, + { + "epoch": 1.420503909643788, + "grad_norm": 0.1429132617790171, + "learning_rate": 3.608423642844053e-06, + "loss": 0.7405, + "step": 2863 + }, + { + "epoch": 1.4210003723470275, + "grad_norm": 0.14321697351882565, + "learning_rate": 3.607547521615926e-06, + "loss": 0.7462, + "step": 2864 + }, + { + "epoch": 1.4214968350502668, + "grad_norm": 0.13562736080521035, + "learning_rate": 3.6066712311169173e-06, + "loss": 0.7409, + "step": 2865 + }, + { + "epoch": 1.4219932977535064, + "grad_norm": 0.1411773380972186, + "learning_rate": 3.6057947714809555e-06, + "loss": 0.7401, + "step": 2866 + }, + { + "epoch": 1.4224897604567457, + "grad_norm": 0.1339410402597349, + "learning_rate": 3.6049181428419935e-06, + "loss": 0.7163, + "step": 2867 + }, + { + "epoch": 1.422986223159985, + "grad_norm": 0.13416362073643048, + "learning_rate": 3.6040413453340085e-06, + "loss": 0.6948, + "step": 2868 + }, + { + "epoch": 1.4234826858632246, + "grad_norm": 0.13676566176317306, + "learning_rate": 3.6031643790910066e-06, + "loss": 0.6875, + "step": 2869 + }, + { + "epoch": 1.423979148566464, + "grad_norm": 0.1313168534405185, + "learning_rate": 3.602287244247017e-06, + "loss": 0.6881, + "step": 2870 + }, + { + "epoch": 1.4244756112697035, + "grad_norm": 0.13677303706245855, + "learning_rate": 3.6014099409360955e-06, + "loss": 0.7393, + "step": 2871 + }, + { + "epoch": 1.4249720739729428, + "grad_norm": 0.14114326056602117, + "learning_rate": 3.6005324692923242e-06, + "loss": 0.6949, + "step": 2872 + }, + { + "epoch": 1.4254685366761821, + "grad_norm": 0.13688796725400607, + "learning_rate": 3.5996548294498113e-06, + "loss": 0.7323, + "step": 2873 + }, + { + "epoch": 1.4259649993794217, + "grad_norm": 0.13491954912768386, + "learning_rate": 3.598777021542689e-06, + "loss": 0.7144, + "step": 2874 + }, + { + "epoch": 1.426461462082661, + "grad_norm": 0.13401282593444483, + "learning_rate": 3.5978990457051165e-06, + "loss": 0.6912, + "step": 2875 + }, + { + "epoch": 1.4269579247859006, + "grad_norm": 0.1402341000199709, + "learning_rate": 3.597020902071278e-06, + "loss": 0.7434, + "step": 2876 + }, + { + "epoch": 1.4274543874891399, + "grad_norm": 0.14610852875970756, + "learning_rate": 3.596142590775385e-06, + "loss": 0.7442, + "step": 2877 + }, + { + "epoch": 1.4279508501923792, + "grad_norm": 0.1395377838118118, + "learning_rate": 3.5952641119516725e-06, + "loss": 0.7266, + "step": 2878 + }, + { + "epoch": 1.4284473128956188, + "grad_norm": 0.13617309441688394, + "learning_rate": 3.594385465734401e-06, + "loss": 0.7328, + "step": 2879 + }, + { + "epoch": 1.428943775598858, + "grad_norm": 0.14331534212615055, + "learning_rate": 3.5935066522578576e-06, + "loss": 0.7336, + "step": 2880 + }, + { + "epoch": 1.4294402383020977, + "grad_norm": 0.13574693488844447, + "learning_rate": 3.592627671656356e-06, + "loss": 0.6804, + "step": 2881 + }, + { + "epoch": 1.429936701005337, + "grad_norm": 0.14742713291670548, + "learning_rate": 3.5917485240642336e-06, + "loss": 0.7212, + "step": 2882 + }, + { + "epoch": 1.4304331637085763, + "grad_norm": 0.14811510498423192, + "learning_rate": 3.590869209615854e-06, + "loss": 0.7231, + "step": 2883 + }, + { + "epoch": 1.4309296264118159, + "grad_norm": 0.13396810414200716, + "learning_rate": 3.589989728445607e-06, + "loss": 0.7543, + "step": 2884 + }, + { + "epoch": 1.4314260891150552, + "grad_norm": 0.13821113307135227, + "learning_rate": 3.589110080687907e-06, + "loss": 0.7313, + "step": 2885 + }, + { + "epoch": 1.4319225518182948, + "grad_norm": 0.1403140258389397, + "learning_rate": 3.588230266477193e-06, + "loss": 0.7028, + "step": 2886 + }, + { + "epoch": 1.432419014521534, + "grad_norm": 0.13675009572376795, + "learning_rate": 3.5873502859479316e-06, + "loss": 0.7236, + "step": 2887 + }, + { + "epoch": 1.4329154772247734, + "grad_norm": 0.129493586039917, + "learning_rate": 3.5864701392346125e-06, + "loss": 0.7098, + "step": 2888 + }, + { + "epoch": 1.433411939928013, + "grad_norm": 0.13454419198703996, + "learning_rate": 3.5855898264717535e-06, + "loss": 0.6686, + "step": 2889 + }, + { + "epoch": 1.4339084026312523, + "grad_norm": 0.1347116060515282, + "learning_rate": 3.5847093477938955e-06, + "loss": 0.7205, + "step": 2890 + }, + { + "epoch": 1.4344048653344919, + "grad_norm": 0.13371886505328962, + "learning_rate": 3.583828703335606e-06, + "loss": 0.7308, + "step": 2891 + }, + { + "epoch": 1.4349013280377312, + "grad_norm": 0.1312328405818565, + "learning_rate": 3.5829478932314763e-06, + "loss": 0.7005, + "step": 2892 + }, + { + "epoch": 1.4353977907409705, + "grad_norm": 0.13178342567206527, + "learning_rate": 3.582066917616126e-06, + "loss": 0.7287, + "step": 2893 + }, + { + "epoch": 1.43589425344421, + "grad_norm": 0.13275451628062787, + "learning_rate": 3.5811857766241966e-06, + "loss": 0.6984, + "step": 2894 + }, + { + "epoch": 1.4363907161474494, + "grad_norm": 0.1341420790665092, + "learning_rate": 3.5803044703903566e-06, + "loss": 0.6854, + "step": 2895 + }, + { + "epoch": 1.436887178850689, + "grad_norm": 0.13503560969815714, + "learning_rate": 3.579422999049299e-06, + "loss": 0.7089, + "step": 2896 + }, + { + "epoch": 1.4373836415539283, + "grad_norm": 0.14368925529103932, + "learning_rate": 3.578541362735744e-06, + "loss": 0.7332, + "step": 2897 + }, + { + "epoch": 1.4378801042571676, + "grad_norm": 0.13993715166267706, + "learning_rate": 3.5776595615844343e-06, + "loss": 0.7407, + "step": 2898 + }, + { + "epoch": 1.4383765669604072, + "grad_norm": 0.13262872626301542, + "learning_rate": 3.5767775957301402e-06, + "loss": 0.7408, + "step": 2899 + }, + { + "epoch": 1.4388730296636465, + "grad_norm": 0.12963084057116644, + "learning_rate": 3.575895465307655e-06, + "loss": 0.7321, + "step": 2900 + }, + { + "epoch": 1.439369492366886, + "grad_norm": 0.13223114216083978, + "learning_rate": 3.5750131704517987e-06, + "loss": 0.7259, + "step": 2901 + }, + { + "epoch": 1.4398659550701254, + "grad_norm": 0.13545769906603244, + "learning_rate": 3.574130711297416e-06, + "loss": 0.6985, + "step": 2902 + }, + { + "epoch": 1.4403624177733647, + "grad_norm": 0.14269902147683536, + "learning_rate": 3.5732480879793763e-06, + "loss": 0.7081, + "step": 2903 + }, + { + "epoch": 1.4408588804766043, + "grad_norm": 0.12939717891557706, + "learning_rate": 3.572365300632574e-06, + "loss": 0.7087, + "step": 2904 + }, + { + "epoch": 1.4413553431798436, + "grad_norm": 0.13198711044968622, + "learning_rate": 3.5714823493919305e-06, + "loss": 0.6999, + "step": 2905 + }, + { + "epoch": 1.4418518058830831, + "grad_norm": 0.14765136086907135, + "learning_rate": 3.570599234392389e-06, + "loss": 0.7115, + "step": 2906 + }, + { + "epoch": 1.4423482685863225, + "grad_norm": 0.13571957777801233, + "learning_rate": 3.569715955768921e-06, + "loss": 0.6962, + "step": 2907 + }, + { + "epoch": 1.4428447312895618, + "grad_norm": 0.13574730889325312, + "learning_rate": 3.56883251365652e-06, + "loss": 0.7506, + "step": 2908 + }, + { + "epoch": 1.4433411939928014, + "grad_norm": 0.13592878384629192, + "learning_rate": 3.5679489081902073e-06, + "loss": 0.7426, + "step": 2909 + }, + { + "epoch": 1.4438376566960407, + "grad_norm": 0.12847140272775823, + "learning_rate": 3.5670651395050273e-06, + "loss": 0.6982, + "step": 2910 + }, + { + "epoch": 1.4443341193992802, + "grad_norm": 0.13102794138380155, + "learning_rate": 3.5661812077360496e-06, + "loss": 0.7805, + "step": 2911 + }, + { + "epoch": 1.4448305821025196, + "grad_norm": 0.13544839042143206, + "learning_rate": 3.5652971130183696e-06, + "loss": 0.7568, + "step": 2912 + }, + { + "epoch": 1.445327044805759, + "grad_norm": 0.12813227644476155, + "learning_rate": 3.564412855487106e-06, + "loss": 0.7082, + "step": 2913 + }, + { + "epoch": 1.4458235075089985, + "grad_norm": 0.14071968051452866, + "learning_rate": 3.5635284352774035e-06, + "loss": 0.789, + "step": 2914 + }, + { + "epoch": 1.4463199702122378, + "grad_norm": 0.13520452750443218, + "learning_rate": 3.5626438525244335e-06, + "loss": 0.7401, + "step": 2915 + }, + { + "epoch": 1.4468164329154773, + "grad_norm": 0.142978764949799, + "learning_rate": 3.5617591073633877e-06, + "loss": 0.7518, + "step": 2916 + }, + { + "epoch": 1.4473128956187167, + "grad_norm": 0.13847892621534574, + "learning_rate": 3.560874199929487e-06, + "loss": 0.7487, + "step": 2917 + }, + { + "epoch": 1.447809358321956, + "grad_norm": 0.13593385078802972, + "learning_rate": 3.5599891303579747e-06, + "loss": 0.6684, + "step": 2918 + }, + { + "epoch": 1.4483058210251956, + "grad_norm": 0.13672639592332414, + "learning_rate": 3.559103898784119e-06, + "loss": 0.7517, + "step": 2919 + }, + { + "epoch": 1.448802283728435, + "grad_norm": 0.13336311824180638, + "learning_rate": 3.5582185053432137e-06, + "loss": 0.7068, + "step": 2920 + }, + { + "epoch": 1.4492987464316744, + "grad_norm": 0.13281209912714664, + "learning_rate": 3.5573329501705777e-06, + "loss": 0.7424, + "step": 2921 + }, + { + "epoch": 1.4497952091349138, + "grad_norm": 0.13043603194523082, + "learning_rate": 3.556447233401553e-06, + "loss": 0.7195, + "step": 2922 + }, + { + "epoch": 1.450291671838153, + "grad_norm": 0.138625753926843, + "learning_rate": 3.5555613551715072e-06, + "loss": 0.7468, + "step": 2923 + }, + { + "epoch": 1.4507881345413927, + "grad_norm": 0.14020220328471075, + "learning_rate": 3.554675315615833e-06, + "loss": 0.731, + "step": 2924 + }, + { + "epoch": 1.451284597244632, + "grad_norm": 0.13627514864142962, + "learning_rate": 3.5537891148699476e-06, + "loss": 0.7456, + "step": 2925 + }, + { + "epoch": 1.4517810599478715, + "grad_norm": 0.12926303570402706, + "learning_rate": 3.552902753069293e-06, + "loss": 0.7147, + "step": 2926 + }, + { + "epoch": 1.4522775226511109, + "grad_norm": 0.14055188911284178, + "learning_rate": 3.552016230349334e-06, + "loss": 0.7487, + "step": 2927 + }, + { + "epoch": 1.4527739853543502, + "grad_norm": 0.14061422002541768, + "learning_rate": 3.551129546845561e-06, + "loss": 0.7029, + "step": 2928 + }, + { + "epoch": 1.4532704480575895, + "grad_norm": 0.13754407040631086, + "learning_rate": 3.550242702693491e-06, + "loss": 0.7129, + "step": 2929 + }, + { + "epoch": 1.453766910760829, + "grad_norm": 0.1326488941997002, + "learning_rate": 3.549355698028663e-06, + "loss": 0.7334, + "step": 2930 + }, + { + "epoch": 1.4542633734640686, + "grad_norm": 0.14133250168355752, + "learning_rate": 3.5484685329866424e-06, + "loss": 0.7082, + "step": 2931 + }, + { + "epoch": 1.454759836167308, + "grad_norm": 0.13403821729170973, + "learning_rate": 3.547581207703017e-06, + "loss": 0.7646, + "step": 2932 + }, + { + "epoch": 1.4552562988705473, + "grad_norm": 0.13637800149605267, + "learning_rate": 3.5466937223134007e-06, + "loss": 0.7302, + "step": 2933 + }, + { + "epoch": 1.4557527615737866, + "grad_norm": 0.13875702141124502, + "learning_rate": 3.5458060769534317e-06, + "loss": 0.6984, + "step": 2934 + }, + { + "epoch": 1.4562492242770262, + "grad_norm": 0.13654925956282576, + "learning_rate": 3.5449182717587717e-06, + "loss": 0.7609, + "step": 2935 + }, + { + "epoch": 1.4567456869802657, + "grad_norm": 0.13693125562691924, + "learning_rate": 3.5440303068651077e-06, + "loss": 0.7749, + "step": 2936 + }, + { + "epoch": 1.457242149683505, + "grad_norm": 0.13851101958423012, + "learning_rate": 3.5431421824081512e-06, + "loss": 0.7404, + "step": 2937 + }, + { + "epoch": 1.4577386123867444, + "grad_norm": 0.1439809260358722, + "learning_rate": 3.542253898523638e-06, + "loss": 0.7923, + "step": 2938 + }, + { + "epoch": 1.4582350750899837, + "grad_norm": 0.13805419183906129, + "learning_rate": 3.5413654553473274e-06, + "loss": 0.7017, + "step": 2939 + }, + { + "epoch": 1.4587315377932233, + "grad_norm": 0.13835157639154178, + "learning_rate": 3.5404768530150035e-06, + "loss": 0.7551, + "step": 2940 + }, + { + "epoch": 1.4592280004964628, + "grad_norm": 0.13287184038076372, + "learning_rate": 3.539588091662476e-06, + "loss": 0.712, + "step": 2941 + }, + { + "epoch": 1.4597244631997022, + "grad_norm": 0.1452095077228309, + "learning_rate": 3.5386991714255775e-06, + "loss": 0.7842, + "step": 2942 + }, + { + "epoch": 1.4602209259029415, + "grad_norm": 0.1314235216478171, + "learning_rate": 3.537810092440165e-06, + "loss": 0.6679, + "step": 2943 + }, + { + "epoch": 1.4607173886061808, + "grad_norm": 0.13906999072318538, + "learning_rate": 3.536920854842119e-06, + "loss": 0.7524, + "step": 2944 + }, + { + "epoch": 1.4612138513094204, + "grad_norm": 0.13429900107538834, + "learning_rate": 3.5360314587673463e-06, + "loss": 0.7309, + "step": 2945 + }, + { + "epoch": 1.4617103140126597, + "grad_norm": 0.141994678409815, + "learning_rate": 3.5351419043517764e-06, + "loss": 0.7338, + "step": 2946 + }, + { + "epoch": 1.4622067767158993, + "grad_norm": 0.13636727486493982, + "learning_rate": 3.5342521917313643e-06, + "loss": 0.6874, + "step": 2947 + }, + { + "epoch": 1.4627032394191386, + "grad_norm": 0.14177928006977158, + "learning_rate": 3.5333623210420877e-06, + "loss": 0.7831, + "step": 2948 + }, + { + "epoch": 1.463199702122378, + "grad_norm": 0.13586560334697423, + "learning_rate": 3.532472292419949e-06, + "loss": 0.7399, + "step": 2949 + }, + { + "epoch": 1.4636961648256175, + "grad_norm": 0.1395384541698917, + "learning_rate": 3.531582106000975e-06, + "loss": 0.7093, + "step": 2950 + }, + { + "epoch": 1.4641926275288568, + "grad_norm": 0.1302253949626591, + "learning_rate": 3.5306917619212157e-06, + "loss": 0.7005, + "step": 2951 + }, + { + "epoch": 1.4646890902320964, + "grad_norm": 0.13335922133133257, + "learning_rate": 3.5298012603167463e-06, + "loss": 0.7291, + "step": 2952 + }, + { + "epoch": 1.4651855529353357, + "grad_norm": 0.1377977708057461, + "learning_rate": 3.528910601323666e-06, + "loss": 0.7324, + "step": 2953 + }, + { + "epoch": 1.465682015638575, + "grad_norm": 0.13572427845757098, + "learning_rate": 3.5280197850780986e-06, + "loss": 0.723, + "step": 2954 + }, + { + "epoch": 1.4661784783418146, + "grad_norm": 0.13536174132092055, + "learning_rate": 3.527128811716189e-06, + "loss": 0.7521, + "step": 2955 + }, + { + "epoch": 1.466674941045054, + "grad_norm": 0.13617464941765656, + "learning_rate": 3.5262376813741095e-06, + "loss": 0.6934, + "step": 2956 + }, + { + "epoch": 1.4671714037482935, + "grad_norm": 0.12812292562730554, + "learning_rate": 3.525346394188055e-06, + "loss": 0.7028, + "step": 2957 + }, + { + "epoch": 1.4676678664515328, + "grad_norm": 0.13368438695645662, + "learning_rate": 3.524454950294244e-06, + "loss": 0.6785, + "step": 2958 + }, + { + "epoch": 1.4681643291547721, + "grad_norm": 0.14014375660987655, + "learning_rate": 3.523563349828921e-06, + "loss": 0.7574, + "step": 2959 + }, + { + "epoch": 1.4686607918580117, + "grad_norm": 0.13236384889227554, + "learning_rate": 3.5226715929283507e-06, + "loss": 0.7025, + "step": 2960 + }, + { + "epoch": 1.469157254561251, + "grad_norm": 0.13657143100492303, + "learning_rate": 3.521779679728824e-06, + "loss": 0.7412, + "step": 2961 + }, + { + "epoch": 1.4696537172644906, + "grad_norm": 0.14598345092076667, + "learning_rate": 3.5208876103666566e-06, + "loss": 0.7279, + "step": 2962 + }, + { + "epoch": 1.47015017996773, + "grad_norm": 0.13362251541331926, + "learning_rate": 3.519995384978187e-06, + "loss": 0.7256, + "step": 2963 + }, + { + "epoch": 1.4706466426709692, + "grad_norm": 0.1405944370949059, + "learning_rate": 3.5191030036997774e-06, + "loss": 0.7497, + "step": 2964 + }, + { + "epoch": 1.4711431053742088, + "grad_norm": 0.13130473785305005, + "learning_rate": 3.5182104666678136e-06, + "loss": 0.7074, + "step": 2965 + }, + { + "epoch": 1.4716395680774481, + "grad_norm": 0.1382341947573506, + "learning_rate": 3.517317774018706e-06, + "loss": 0.7273, + "step": 2966 + }, + { + "epoch": 1.4721360307806877, + "grad_norm": 0.1398909288694717, + "learning_rate": 3.516424925888887e-06, + "loss": 0.7298, + "step": 2967 + }, + { + "epoch": 1.472632493483927, + "grad_norm": 0.13083337585357746, + "learning_rate": 3.515531922414816e-06, + "loss": 0.6801, + "step": 2968 + }, + { + "epoch": 1.4731289561871663, + "grad_norm": 0.13314911288815096, + "learning_rate": 3.514638763732974e-06, + "loss": 0.6965, + "step": 2969 + }, + { + "epoch": 1.4736254188904059, + "grad_norm": 0.14032556414048164, + "learning_rate": 3.5137454499798646e-06, + "loss": 0.7974, + "step": 2970 + }, + { + "epoch": 1.4741218815936452, + "grad_norm": 0.13813913057905594, + "learning_rate": 3.5128519812920176e-06, + "loss": 0.7583, + "step": 2971 + }, + { + "epoch": 1.4746183442968848, + "grad_norm": 0.13702265237609512, + "learning_rate": 3.5119583578059845e-06, + "loss": 0.703, + "step": 2972 + }, + { + "epoch": 1.475114807000124, + "grad_norm": 0.13982120405929221, + "learning_rate": 3.5110645796583425e-06, + "loss": 0.7253, + "step": 2973 + }, + { + "epoch": 1.4756112697033634, + "grad_norm": 0.1364392260235214, + "learning_rate": 3.5101706469856913e-06, + "loss": 0.748, + "step": 2974 + }, + { + "epoch": 1.476107732406603, + "grad_norm": 0.140027409645103, + "learning_rate": 3.509276559924653e-06, + "loss": 0.7313, + "step": 2975 + }, + { + "epoch": 1.4766041951098423, + "grad_norm": 0.1320770359244831, + "learning_rate": 3.5083823186118748e-06, + "loss": 0.6947, + "step": 2976 + }, + { + "epoch": 1.4771006578130819, + "grad_norm": 0.12822200062993674, + "learning_rate": 3.5074879231840274e-06, + "loss": 0.6715, + "step": 2977 + }, + { + "epoch": 1.4775971205163212, + "grad_norm": 0.13286535524254892, + "learning_rate": 3.5065933737778054e-06, + "loss": 0.7157, + "step": 2978 + }, + { + "epoch": 1.4780935832195605, + "grad_norm": 0.13858890088438325, + "learning_rate": 3.505698670529925e-06, + "loss": 0.7208, + "step": 2979 + }, + { + "epoch": 1.4785900459228, + "grad_norm": 0.13027070434072718, + "learning_rate": 3.5048038135771285e-06, + "loss": 0.7217, + "step": 2980 + }, + { + "epoch": 1.4790865086260394, + "grad_norm": 0.1413449153673528, + "learning_rate": 3.5039088030561798e-06, + "loss": 0.7565, + "step": 2981 + }, + { + "epoch": 1.479582971329279, + "grad_norm": 0.1400056680073501, + "learning_rate": 3.503013639103867e-06, + "loss": 0.7495, + "step": 2982 + }, + { + "epoch": 1.4800794340325183, + "grad_norm": 0.13165330464949065, + "learning_rate": 3.502118321857001e-06, + "loss": 0.7036, + "step": 2983 + }, + { + "epoch": 1.4805758967357576, + "grad_norm": 0.13774638206096473, + "learning_rate": 3.5012228514524177e-06, + "loss": 0.7311, + "step": 2984 + }, + { + "epoch": 1.4810723594389972, + "grad_norm": 0.13581207084716526, + "learning_rate": 3.5003272280269745e-06, + "loss": 0.682, + "step": 2985 + }, + { + "epoch": 1.4815688221422365, + "grad_norm": 0.13429634165776128, + "learning_rate": 3.4994314517175544e-06, + "loss": 0.6948, + "step": 2986 + }, + { + "epoch": 1.482065284845476, + "grad_norm": 0.13400596652450936, + "learning_rate": 3.4985355226610613e-06, + "loss": 0.7803, + "step": 2987 + }, + { + "epoch": 1.4825617475487154, + "grad_norm": 0.13658283943813354, + "learning_rate": 3.4976394409944236e-06, + "loss": 0.6891, + "step": 2988 + }, + { + "epoch": 1.4830582102519547, + "grad_norm": 0.1434351594908228, + "learning_rate": 3.4967432068545933e-06, + "loss": 0.706, + "step": 2989 + }, + { + "epoch": 1.4835546729551943, + "grad_norm": 0.14046517002064254, + "learning_rate": 3.4958468203785454e-06, + "loss": 0.7323, + "step": 2990 + }, + { + "epoch": 1.4840511356584336, + "grad_norm": 0.14214435255887042, + "learning_rate": 3.4949502817032787e-06, + "loss": 0.7598, + "step": 2991 + }, + { + "epoch": 1.4845475983616732, + "grad_norm": 0.13754977790132406, + "learning_rate": 3.4940535909658134e-06, + "loss": 0.7143, + "step": 2992 + }, + { + "epoch": 1.4850440610649125, + "grad_norm": 0.14453237890736265, + "learning_rate": 3.493156748303196e-06, + "loss": 0.7702, + "step": 2993 + }, + { + "epoch": 1.4855405237681518, + "grad_norm": 0.1373786250860733, + "learning_rate": 3.4922597538524925e-06, + "loss": 0.732, + "step": 2994 + }, + { + "epoch": 1.4860369864713914, + "grad_norm": 0.1325135135773225, + "learning_rate": 3.491362607750796e-06, + "loss": 0.6744, + "step": 2995 + }, + { + "epoch": 1.4865334491746307, + "grad_norm": 0.13625614276317374, + "learning_rate": 3.4904653101352204e-06, + "loss": 0.7523, + "step": 2996 + }, + { + "epoch": 1.4870299118778703, + "grad_norm": 0.13518539082417194, + "learning_rate": 3.4895678611429027e-06, + "loss": 0.7093, + "step": 2997 + }, + { + "epoch": 1.4875263745811096, + "grad_norm": 0.13657072146153515, + "learning_rate": 3.4886702609110045e-06, + "loss": 0.717, + "step": 2998 + }, + { + "epoch": 1.488022837284349, + "grad_norm": 0.13163907731177207, + "learning_rate": 3.487772509576709e-06, + "loss": 0.7427, + "step": 2999 + }, + { + "epoch": 1.4885192999875885, + "grad_norm": 0.1352649941460766, + "learning_rate": 3.4868746072772224e-06, + "loss": 0.7275, + "step": 3000 + }, + { + "epoch": 1.4890157626908278, + "grad_norm": 0.13558793748009995, + "learning_rate": 3.4859765541497758e-06, + "loss": 0.7283, + "step": 3001 + }, + { + "epoch": 1.4895122253940674, + "grad_norm": 0.1364033885783727, + "learning_rate": 3.485078350331622e-06, + "loss": 0.7014, + "step": 3002 + }, + { + "epoch": 1.4900086880973067, + "grad_norm": 0.13933690940861296, + "learning_rate": 3.4841799959600364e-06, + "loss": 0.744, + "step": 3003 + }, + { + "epoch": 1.490505150800546, + "grad_norm": 0.14140035289104919, + "learning_rate": 3.4832814911723187e-06, + "loss": 0.7211, + "step": 3004 + }, + { + "epoch": 1.4910016135037856, + "grad_norm": 0.13562997010154254, + "learning_rate": 3.482382836105791e-06, + "loss": 0.7335, + "step": 3005 + }, + { + "epoch": 1.491498076207025, + "grad_norm": 0.13945900902072886, + "learning_rate": 3.481484030897798e-06, + "loss": 0.8175, + "step": 3006 + }, + { + "epoch": 1.4919945389102645, + "grad_norm": 0.13803655176116866, + "learning_rate": 3.4805850756857083e-06, + "loss": 0.7692, + "step": 3007 + }, + { + "epoch": 1.4924910016135038, + "grad_norm": 0.14038571658779134, + "learning_rate": 3.479685970606912e-06, + "loss": 0.7348, + "step": 3008 + }, + { + "epoch": 1.4929874643167431, + "grad_norm": 0.13668694894227656, + "learning_rate": 3.478786715798823e-06, + "loss": 0.7535, + "step": 3009 + }, + { + "epoch": 1.4934839270199827, + "grad_norm": 0.12831253400925277, + "learning_rate": 3.4778873113988776e-06, + "loss": 0.7013, + "step": 3010 + }, + { + "epoch": 1.493980389723222, + "grad_norm": 0.13510790471247616, + "learning_rate": 3.4769877575445366e-06, + "loss": 0.7007, + "step": 3011 + }, + { + "epoch": 1.4944768524264616, + "grad_norm": 0.13836794462260757, + "learning_rate": 3.4760880543732816e-06, + "loss": 0.6941, + "step": 3012 + }, + { + "epoch": 1.4949733151297009, + "grad_norm": 0.13929710909188023, + "learning_rate": 3.4751882020226174e-06, + "loss": 0.7835, + "step": 3013 + }, + { + "epoch": 1.4954697778329402, + "grad_norm": 0.13277890430638015, + "learning_rate": 3.4742882006300734e-06, + "loss": 0.6666, + "step": 3014 + }, + { + "epoch": 1.4959662405361798, + "grad_norm": 0.13390371855032038, + "learning_rate": 3.4733880503331983e-06, + "loss": 0.7052, + "step": 3015 + }, + { + "epoch": 1.496462703239419, + "grad_norm": 0.13663110970590517, + "learning_rate": 3.4724877512695677e-06, + "loss": 0.7179, + "step": 3016 + }, + { + "epoch": 1.4969591659426587, + "grad_norm": 0.13773347718243625, + "learning_rate": 3.4715873035767766e-06, + "loss": 0.7086, + "step": 3017 + }, + { + "epoch": 1.497455628645898, + "grad_norm": 0.1363564450553434, + "learning_rate": 3.4706867073924446e-06, + "loss": 0.7173, + "step": 3018 + }, + { + "epoch": 1.4979520913491373, + "grad_norm": 0.13186824374836328, + "learning_rate": 3.469785962854213e-06, + "loss": 0.7346, + "step": 3019 + }, + { + "epoch": 1.4984485540523769, + "grad_norm": 0.13838548862045663, + "learning_rate": 3.4688850700997455e-06, + "loss": 0.723, + "step": 3020 + }, + { + "epoch": 1.4989450167556162, + "grad_norm": 0.13121107819663974, + "learning_rate": 3.467984029266731e-06, + "loss": 0.7178, + "step": 3021 + }, + { + "epoch": 1.4994414794588558, + "grad_norm": 0.1359428642141061, + "learning_rate": 3.467082840492878e-06, + "loss": 0.731, + "step": 3022 + }, + { + "epoch": 1.499937942162095, + "grad_norm": 0.13804903069761093, + "learning_rate": 3.4661815039159186e-06, + "loss": 0.6986, + "step": 3023 + }, + { + "epoch": 1.5004344048653344, + "grad_norm": 0.12815687067739065, + "learning_rate": 3.465280019673608e-06, + "loss": 0.7029, + "step": 3024 + }, + { + "epoch": 1.5004344048653344, + "eval_loss": 0.7363704442977905, + "eval_runtime": 135.8597, + "eval_samples_per_second": 223.414, + "eval_steps_per_second": 27.933, + "step": 3024 + }, + { + "epoch": 1.5009308675685737, + "grad_norm": 0.1470067433495751, + "learning_rate": 3.4643783879037235e-06, + "loss": 0.7213, + "step": 3025 + }, + { + "epoch": 1.5014273302718133, + "grad_norm": 0.21715999444549708, + "learning_rate": 3.4634766087440645e-06, + "loss": 0.7743, + "step": 3026 + }, + { + "epoch": 1.5019237929750529, + "grad_norm": 0.13601624552016667, + "learning_rate": 3.4625746823324545e-06, + "loss": 0.7205, + "step": 3027 + }, + { + "epoch": 1.5024202556782922, + "grad_norm": 0.1323109309363181, + "learning_rate": 3.4616726088067383e-06, + "loss": 0.7679, + "step": 3028 + }, + { + "epoch": 1.5029167183815315, + "grad_norm": 0.13051111527820897, + "learning_rate": 3.460770388304782e-06, + "loss": 0.7117, + "step": 3029 + }, + { + "epoch": 1.5034131810847708, + "grad_norm": 0.1354078149158313, + "learning_rate": 3.459868020964478e-06, + "loss": 0.7352, + "step": 3030 + }, + { + "epoch": 1.5039096437880104, + "grad_norm": 0.14053113861350988, + "learning_rate": 3.4589655069237367e-06, + "loss": 0.7575, + "step": 3031 + }, + { + "epoch": 1.50440610649125, + "grad_norm": 0.13146810822091834, + "learning_rate": 3.4580628463204936e-06, + "loss": 0.7172, + "step": 3032 + }, + { + "epoch": 1.5049025691944893, + "grad_norm": 0.13001154124097078, + "learning_rate": 3.457160039292705e-06, + "loss": 0.6829, + "step": 3033 + }, + { + "epoch": 1.5053990318977286, + "grad_norm": 0.13574315079485202, + "learning_rate": 3.456257085978352e-06, + "loss": 0.7401, + "step": 3034 + }, + { + "epoch": 1.505895494600968, + "grad_norm": 0.13467407064352518, + "learning_rate": 3.4553539865154362e-06, + "loss": 0.7276, + "step": 3035 + }, + { + "epoch": 1.5063919573042075, + "grad_norm": 0.16132459867477508, + "learning_rate": 3.4544507410419807e-06, + "loss": 0.7567, + "step": 3036 + }, + { + "epoch": 1.506888420007447, + "grad_norm": 0.13133038324487403, + "learning_rate": 3.453547349696033e-06, + "loss": 0.7089, + "step": 3037 + }, + { + "epoch": 1.5073848827106864, + "grad_norm": 0.13434120206254171, + "learning_rate": 3.4526438126156624e-06, + "loss": 0.7183, + "step": 3038 + }, + { + "epoch": 1.5078813454139257, + "grad_norm": 0.133739491071614, + "learning_rate": 3.45174012993896e-06, + "loss": 0.6883, + "step": 3039 + }, + { + "epoch": 1.508377808117165, + "grad_norm": 0.13292833480778746, + "learning_rate": 3.450836301804038e-06, + "loss": 0.7354, + "step": 3040 + }, + { + "epoch": 1.5088742708204046, + "grad_norm": 0.1310771402662206, + "learning_rate": 3.449932328349033e-06, + "loss": 0.7078, + "step": 3041 + }, + { + "epoch": 1.5093707335236441, + "grad_norm": 0.13464111175584303, + "learning_rate": 3.449028209712102e-06, + "loss": 0.6777, + "step": 3042 + }, + { + "epoch": 1.5098671962268835, + "grad_norm": 0.13372159063346198, + "learning_rate": 3.4481239460314252e-06, + "loss": 0.7078, + "step": 3043 + }, + { + "epoch": 1.5103636589301228, + "grad_norm": 0.14328077784905502, + "learning_rate": 3.4472195374452067e-06, + "loss": 0.7541, + "step": 3044 + }, + { + "epoch": 1.5108601216333621, + "grad_norm": 0.14108614619902352, + "learning_rate": 3.446314984091669e-06, + "loss": 0.7537, + "step": 3045 + }, + { + "epoch": 1.5113565843366017, + "grad_norm": 0.1387413397672071, + "learning_rate": 3.445410286109059e-06, + "loss": 0.7313, + "step": 3046 + }, + { + "epoch": 1.5118530470398412, + "grad_norm": 0.1437298602735984, + "learning_rate": 3.444505443635645e-06, + "loss": 0.7198, + "step": 3047 + }, + { + "epoch": 1.5123495097430806, + "grad_norm": 0.14561460770522325, + "learning_rate": 3.4436004568097177e-06, + "loss": 0.7763, + "step": 3048 + }, + { + "epoch": 1.51284597244632, + "grad_norm": 0.1437698294167037, + "learning_rate": 3.4426953257695904e-06, + "loss": 0.7282, + "step": 3049 + }, + { + "epoch": 1.5133424351495592, + "grad_norm": 0.13602577897519388, + "learning_rate": 3.4417900506535977e-06, + "loss": 0.7059, + "step": 3050 + }, + { + "epoch": 1.5138388978527988, + "grad_norm": 0.13627196775475553, + "learning_rate": 3.440884631600096e-06, + "loss": 0.6885, + "step": 3051 + }, + { + "epoch": 1.5143353605560383, + "grad_norm": 0.13016712398288627, + "learning_rate": 3.4399790687474638e-06, + "loss": 0.7099, + "step": 3052 + }, + { + "epoch": 1.5148318232592777, + "grad_norm": 0.1420534555060904, + "learning_rate": 3.4390733622341025e-06, + "loss": 0.7624, + "step": 3053 + }, + { + "epoch": 1.515328285962517, + "grad_norm": 0.13710013215332392, + "learning_rate": 3.438167512198436e-06, + "loss": 0.7549, + "step": 3054 + }, + { + "epoch": 1.5158247486657563, + "grad_norm": 0.14905485277450392, + "learning_rate": 3.437261518778906e-06, + "loss": 0.7153, + "step": 3055 + }, + { + "epoch": 1.516321211368996, + "grad_norm": 0.13714440652108204, + "learning_rate": 3.4363553821139822e-06, + "loss": 0.7061, + "step": 3056 + }, + { + "epoch": 1.5168176740722354, + "grad_norm": 0.1448142948924188, + "learning_rate": 3.4354491023421503e-06, + "loss": 0.7118, + "step": 3057 + }, + { + "epoch": 1.5173141367754748, + "grad_norm": 0.13221928060116708, + "learning_rate": 3.434542679601922e-06, + "loss": 0.6995, + "step": 3058 + }, + { + "epoch": 1.517810599478714, + "grad_norm": 0.1406755984922586, + "learning_rate": 3.4336361140318298e-06, + "loss": 0.7159, + "step": 3059 + }, + { + "epoch": 1.5183070621819534, + "grad_norm": 0.13490595450975446, + "learning_rate": 3.432729405770427e-06, + "loss": 0.7348, + "step": 3060 + }, + { + "epoch": 1.518803524885193, + "grad_norm": 0.1331655447314572, + "learning_rate": 3.4318225549562907e-06, + "loss": 0.6946, + "step": 3061 + }, + { + "epoch": 1.5192999875884325, + "grad_norm": 0.1376254608523503, + "learning_rate": 3.4309155617280164e-06, + "loss": 0.7276, + "step": 3062 + }, + { + "epoch": 1.5197964502916719, + "grad_norm": 0.131419562336429, + "learning_rate": 3.430008426224225e-06, + "loss": 0.7282, + "step": 3063 + }, + { + "epoch": 1.5202929129949112, + "grad_norm": 0.13982511067051417, + "learning_rate": 3.429101148583557e-06, + "loss": 0.7345, + "step": 3064 + }, + { + "epoch": 1.5207893756981505, + "grad_norm": 0.13655012597581628, + "learning_rate": 3.4281937289446753e-06, + "loss": 0.7052, + "step": 3065 + }, + { + "epoch": 1.52128583840139, + "grad_norm": 0.13360080661036083, + "learning_rate": 3.4272861674462653e-06, + "loss": 0.6868, + "step": 3066 + }, + { + "epoch": 1.5217823011046296, + "grad_norm": 0.12748341437271118, + "learning_rate": 3.426378464227032e-06, + "loss": 0.6866, + "step": 3067 + }, + { + "epoch": 1.522278763807869, + "grad_norm": 0.13955206652734053, + "learning_rate": 3.425470619425704e-06, + "loss": 0.723, + "step": 3068 + }, + { + "epoch": 1.5227752265111083, + "grad_norm": 0.1419812477049388, + "learning_rate": 3.42456263318103e-06, + "loss": 0.7752, + "step": 3069 + }, + { + "epoch": 1.5232716892143476, + "grad_norm": 0.1301543894648006, + "learning_rate": 3.4236545056317828e-06, + "loss": 0.7152, + "step": 3070 + }, + { + "epoch": 1.5237681519175872, + "grad_norm": 0.13531435292463873, + "learning_rate": 3.422746236916753e-06, + "loss": 0.7006, + "step": 3071 + }, + { + "epoch": 1.5242646146208267, + "grad_norm": 0.1316850500436868, + "learning_rate": 3.4218378271747566e-06, + "loss": 0.706, + "step": 3072 + }, + { + "epoch": 1.524761077324066, + "grad_norm": 0.13593788426927872, + "learning_rate": 3.420929276544629e-06, + "loss": 0.6914, + "step": 3073 + }, + { + "epoch": 1.5252575400273054, + "grad_norm": 0.13157963484148716, + "learning_rate": 3.420020585165227e-06, + "loss": 0.6814, + "step": 3074 + }, + { + "epoch": 1.5257540027305447, + "grad_norm": 0.13002115412677262, + "learning_rate": 3.4191117531754296e-06, + "loss": 0.7078, + "step": 3075 + }, + { + "epoch": 1.5262504654337843, + "grad_norm": 0.13527130246956548, + "learning_rate": 3.418202780714138e-06, + "loss": 0.7036, + "step": 3076 + }, + { + "epoch": 1.5267469281370238, + "grad_norm": 0.13262041773411234, + "learning_rate": 3.4172936679202745e-06, + "loss": 0.7757, + "step": 3077 + }, + { + "epoch": 1.5272433908402632, + "grad_norm": 0.13085661822705708, + "learning_rate": 3.4163844149327807e-06, + "loss": 0.737, + "step": 3078 + }, + { + "epoch": 1.5277398535435025, + "grad_norm": 0.13709896836317348, + "learning_rate": 3.4154750218906226e-06, + "loss": 0.7503, + "step": 3079 + }, + { + "epoch": 1.5282363162467418, + "grad_norm": 0.15213123399903336, + "learning_rate": 3.414565488932785e-06, + "loss": 0.6988, + "step": 3080 + }, + { + "epoch": 1.5287327789499814, + "grad_norm": 0.13278188283664377, + "learning_rate": 3.4136558161982767e-06, + "loss": 0.6908, + "step": 3081 + }, + { + "epoch": 1.529229241653221, + "grad_norm": 0.1414124845528321, + "learning_rate": 3.4127460038261274e-06, + "loss": 0.7579, + "step": 3082 + }, + { + "epoch": 1.5297257043564603, + "grad_norm": 0.1327347365759449, + "learning_rate": 3.411836051955385e-06, + "loss": 0.7004, + "step": 3083 + }, + { + "epoch": 1.5302221670596996, + "grad_norm": 0.13080333502954117, + "learning_rate": 3.410925960725123e-06, + "loss": 0.6939, + "step": 3084 + }, + { + "epoch": 1.530718629762939, + "grad_norm": 0.13257078672902534, + "learning_rate": 3.4100157302744324e-06, + "loss": 0.6948, + "step": 3085 + }, + { + "epoch": 1.5312150924661785, + "grad_norm": 0.1322459345068561, + "learning_rate": 3.4091053607424295e-06, + "loss": 0.7182, + "step": 3086 + }, + { + "epoch": 1.531711555169418, + "grad_norm": 0.14457430144423755, + "learning_rate": 3.408194852268248e-06, + "loss": 0.7577, + "step": 3087 + }, + { + "epoch": 1.5322080178726574, + "grad_norm": 0.13598763735719102, + "learning_rate": 3.4072842049910458e-06, + "loss": 0.7082, + "step": 3088 + }, + { + "epoch": 1.5327044805758967, + "grad_norm": 0.13563192470184163, + "learning_rate": 3.406373419049999e-06, + "loss": 0.7036, + "step": 3089 + }, + { + "epoch": 1.533200943279136, + "grad_norm": 0.13408648564933517, + "learning_rate": 3.4054624945843083e-06, + "loss": 0.7111, + "step": 3090 + }, + { + "epoch": 1.5336974059823756, + "grad_norm": 0.13291448319089896, + "learning_rate": 3.404551431733193e-06, + "loss": 0.692, + "step": 3091 + }, + { + "epoch": 1.5341938686856151, + "grad_norm": 0.1357471952977748, + "learning_rate": 3.403640230635895e-06, + "loss": 0.7497, + "step": 3092 + }, + { + "epoch": 1.5346903313888545, + "grad_norm": 0.13026474673855643, + "learning_rate": 3.402728891431677e-06, + "loss": 0.7322, + "step": 3093 + }, + { + "epoch": 1.5351867940920938, + "grad_norm": 0.1344691776866103, + "learning_rate": 3.4018174142598215e-06, + "loss": 0.6748, + "step": 3094 + }, + { + "epoch": 1.5356832567953331, + "grad_norm": 0.1275077838340116, + "learning_rate": 3.400905799259634e-06, + "loss": 0.6254, + "step": 3095 + }, + { + "epoch": 1.5361797194985727, + "grad_norm": 0.13405200474821982, + "learning_rate": 3.3999940465704397e-06, + "loss": 0.7132, + "step": 3096 + }, + { + "epoch": 1.5366761822018122, + "grad_norm": 0.137409082156887, + "learning_rate": 3.3990821563315857e-06, + "loss": 0.7572, + "step": 3097 + }, + { + "epoch": 1.5371726449050516, + "grad_norm": 0.13337458486706275, + "learning_rate": 3.3981701286824396e-06, + "loss": 0.7658, + "step": 3098 + }, + { + "epoch": 1.537669107608291, + "grad_norm": 0.14360779977581084, + "learning_rate": 3.3972579637623916e-06, + "loss": 0.7088, + "step": 3099 + }, + { + "epoch": 1.5381655703115302, + "grad_norm": 0.1298485707521375, + "learning_rate": 3.396345661710849e-06, + "loss": 0.6769, + "step": 3100 + }, + { + "epoch": 1.5386620330147698, + "grad_norm": 0.13722224767161476, + "learning_rate": 3.3954332226672444e-06, + "loss": 0.7531, + "step": 3101 + }, + { + "epoch": 1.5391584957180093, + "grad_norm": 0.13819578335401508, + "learning_rate": 3.394520646771029e-06, + "loss": 0.764, + "step": 3102 + }, + { + "epoch": 1.5396549584212487, + "grad_norm": 0.13810759011008472, + "learning_rate": 3.3936079341616746e-06, + "loss": 0.7278, + "step": 3103 + }, + { + "epoch": 1.540151421124488, + "grad_norm": 0.13089937223200337, + "learning_rate": 3.392695084978677e-06, + "loss": 0.7161, + "step": 3104 + }, + { + "epoch": 1.5406478838277273, + "grad_norm": 0.13279609037952766, + "learning_rate": 3.391782099361547e-06, + "loss": 0.7043, + "step": 3105 + }, + { + "epoch": 1.5411443465309669, + "grad_norm": 0.1398313704093573, + "learning_rate": 3.390868977449822e-06, + "loss": 0.7615, + "step": 3106 + }, + { + "epoch": 1.5416408092342064, + "grad_norm": 0.13318449528960383, + "learning_rate": 3.3899557193830585e-06, + "loss": 0.6915, + "step": 3107 + }, + { + "epoch": 1.5421372719374458, + "grad_norm": 0.13626189356645965, + "learning_rate": 3.389042325300832e-06, + "loss": 0.7317, + "step": 3108 + }, + { + "epoch": 1.542633734640685, + "grad_norm": 0.1354355510287649, + "learning_rate": 3.3881287953427423e-06, + "loss": 0.684, + "step": 3109 + }, + { + "epoch": 1.5431301973439244, + "grad_norm": 0.13700779510437183, + "learning_rate": 3.387215129648405e-06, + "loss": 0.7707, + "step": 3110 + }, + { + "epoch": 1.543626660047164, + "grad_norm": 0.13797047616141508, + "learning_rate": 3.386301328357461e-06, + "loss": 0.7168, + "step": 3111 + }, + { + "epoch": 1.5441231227504035, + "grad_norm": 0.14189987866803816, + "learning_rate": 3.3853873916095693e-06, + "loss": 0.6903, + "step": 3112 + }, + { + "epoch": 1.5446195854536429, + "grad_norm": 0.1373537496866223, + "learning_rate": 3.3844733195444108e-06, + "loss": 0.7168, + "step": 3113 + }, + { + "epoch": 1.5451160481568822, + "grad_norm": 0.13354628494592605, + "learning_rate": 3.383559112301687e-06, + "loss": 0.7073, + "step": 3114 + }, + { + "epoch": 1.5456125108601215, + "grad_norm": 0.134865493027683, + "learning_rate": 3.38264477002112e-06, + "loss": 0.7345, + "step": 3115 + }, + { + "epoch": 1.546108973563361, + "grad_norm": 0.13796959049690674, + "learning_rate": 3.3817302928424517e-06, + "loss": 0.7315, + "step": 3116 + }, + { + "epoch": 1.5466054362666006, + "grad_norm": 0.1321638322323823, + "learning_rate": 3.380815680905446e-06, + "loss": 0.6957, + "step": 3117 + }, + { + "epoch": 1.54710189896984, + "grad_norm": 0.13809644723238076, + "learning_rate": 3.3799009343498863e-06, + "loss": 0.7913, + "step": 3118 + }, + { + "epoch": 1.5475983616730793, + "grad_norm": 0.14219819418095606, + "learning_rate": 3.3789860533155764e-06, + "loss": 0.738, + "step": 3119 + }, + { + "epoch": 1.5480948243763186, + "grad_norm": 0.1405366068876724, + "learning_rate": 3.3780710379423425e-06, + "loss": 0.7318, + "step": 3120 + }, + { + "epoch": 1.5485912870795582, + "grad_norm": 0.12933097234296537, + "learning_rate": 3.3771558883700286e-06, + "loss": 0.7257, + "step": 3121 + }, + { + "epoch": 1.5490877497827977, + "grad_norm": 0.13488996517159482, + "learning_rate": 3.3762406047385012e-06, + "loss": 0.6776, + "step": 3122 + }, + { + "epoch": 1.549584212486037, + "grad_norm": 0.1393952179684061, + "learning_rate": 3.3753251871876467e-06, + "loss": 0.7573, + "step": 3123 + }, + { + "epoch": 1.5500806751892764, + "grad_norm": 0.1386055398625771, + "learning_rate": 3.3744096358573724e-06, + "loss": 0.7472, + "step": 3124 + }, + { + "epoch": 1.5505771378925157, + "grad_norm": 0.1360908123346939, + "learning_rate": 3.3734939508876057e-06, + "loss": 0.7527, + "step": 3125 + }, + { + "epoch": 1.5510736005957553, + "grad_norm": 0.13138200688447002, + "learning_rate": 3.3725781324182945e-06, + "loss": 0.6892, + "step": 3126 + }, + { + "epoch": 1.5515700632989948, + "grad_norm": 0.13489013837525454, + "learning_rate": 3.3716621805894056e-06, + "loss": 0.7711, + "step": 3127 + }, + { + "epoch": 1.5520665260022342, + "grad_norm": 0.13712844460423768, + "learning_rate": 3.370746095540928e-06, + "loss": 0.7255, + "step": 3128 + }, + { + "epoch": 1.5525629887054735, + "grad_norm": 0.13677616322194255, + "learning_rate": 3.369829877412871e-06, + "loss": 0.7349, + "step": 3129 + }, + { + "epoch": 1.5530594514087128, + "grad_norm": 0.13423740883277105, + "learning_rate": 3.368913526345265e-06, + "loss": 0.7294, + "step": 3130 + }, + { + "epoch": 1.5535559141119524, + "grad_norm": 0.1328411152620118, + "learning_rate": 3.3679970424781584e-06, + "loss": 0.709, + "step": 3131 + }, + { + "epoch": 1.5540523768151917, + "grad_norm": 0.14012966529834792, + "learning_rate": 3.3670804259516203e-06, + "loss": 0.7004, + "step": 3132 + }, + { + "epoch": 1.5545488395184313, + "grad_norm": 0.13291614080398312, + "learning_rate": 3.366163676905742e-06, + "loss": 0.7144, + "step": 3133 + }, + { + "epoch": 1.5550453022216706, + "grad_norm": 0.13260952957552452, + "learning_rate": 3.365246795480634e-06, + "loss": 0.7659, + "step": 3134 + }, + { + "epoch": 1.55554176492491, + "grad_norm": 0.135098720690184, + "learning_rate": 3.3643297818164263e-06, + "loss": 0.6968, + "step": 3135 + }, + { + "epoch": 1.5560382276281495, + "grad_norm": 0.1337151472755321, + "learning_rate": 3.3634126360532694e-06, + "loss": 0.7579, + "step": 3136 + }, + { + "epoch": 1.5565346903313888, + "grad_norm": 0.13823809792997804, + "learning_rate": 3.362495358331336e-06, + "loss": 0.7366, + "step": 3137 + }, + { + "epoch": 1.5570311530346284, + "grad_norm": 0.13540112581495553, + "learning_rate": 3.3615779487908147e-06, + "loss": 0.778, + "step": 3138 + }, + { + "epoch": 1.5575276157378677, + "grad_norm": 0.12755996566178962, + "learning_rate": 3.3606604075719187e-06, + "loss": 0.6915, + "step": 3139 + }, + { + "epoch": 1.558024078441107, + "grad_norm": 0.131901346948097, + "learning_rate": 3.359742734814879e-06, + "loss": 0.6949, + "step": 3140 + }, + { + "epoch": 1.5585205411443466, + "grad_norm": 0.13117046874293734, + "learning_rate": 3.358824930659948e-06, + "loss": 0.6904, + "step": 3141 + }, + { + "epoch": 1.559017003847586, + "grad_norm": 0.13765163137556422, + "learning_rate": 3.3579069952473964e-06, + "loss": 0.7074, + "step": 3142 + }, + { + "epoch": 1.5595134665508255, + "grad_norm": 0.143697911811937, + "learning_rate": 3.3569889287175155e-06, + "loss": 0.71, + "step": 3143 + }, + { + "epoch": 1.5600099292540648, + "grad_norm": 0.13255298712174185, + "learning_rate": 3.356070731210618e-06, + "loss": 0.7144, + "step": 3144 + }, + { + "epoch": 1.5605063919573041, + "grad_norm": 0.1390079209743311, + "learning_rate": 3.3551524028670348e-06, + "loss": 0.7387, + "step": 3145 + }, + { + "epoch": 1.5610028546605437, + "grad_norm": 0.15837144226586045, + "learning_rate": 3.3542339438271184e-06, + "loss": 0.8018, + "step": 3146 + }, + { + "epoch": 1.561499317363783, + "grad_norm": 0.13346525456733854, + "learning_rate": 3.35331535423124e-06, + "loss": 0.7318, + "step": 3147 + }, + { + "epoch": 1.5619957800670226, + "grad_norm": 0.1303144364718177, + "learning_rate": 3.352396634219792e-06, + "loss": 0.691, + "step": 3148 + }, + { + "epoch": 1.5624922427702619, + "grad_norm": 0.13463555403818014, + "learning_rate": 3.351477783933186e-06, + "loss": 0.7917, + "step": 3149 + }, + { + "epoch": 1.5629887054735012, + "grad_norm": 0.13535091861217688, + "learning_rate": 3.3505588035118517e-06, + "loss": 0.7196, + "step": 3150 + }, + { + "epoch": 1.5634851681767408, + "grad_norm": 0.1263572778973972, + "learning_rate": 3.3496396930962437e-06, + "loss": 0.6855, + "step": 3151 + }, + { + "epoch": 1.56398163087998, + "grad_norm": 0.18384206339467457, + "learning_rate": 3.3487204528268302e-06, + "loss": 0.7055, + "step": 3152 + }, + { + "epoch": 1.5644780935832197, + "grad_norm": 0.14081009970447547, + "learning_rate": 3.347801082844105e-06, + "loss": 0.7067, + "step": 3153 + }, + { + "epoch": 1.564974556286459, + "grad_norm": 0.13263091004679445, + "learning_rate": 3.3468815832885772e-06, + "loss": 0.6618, + "step": 3154 + }, + { + "epoch": 1.5654710189896983, + "grad_norm": 0.1440081195073848, + "learning_rate": 3.3459619543007772e-06, + "loss": 0.7223, + "step": 3155 + }, + { + "epoch": 1.5659674816929379, + "grad_norm": 0.13271016860764132, + "learning_rate": 3.345042196021257e-06, + "loss": 0.7639, + "step": 3156 + }, + { + "epoch": 1.5664639443961772, + "grad_norm": 0.13353122154660912, + "learning_rate": 3.3441223085905873e-06, + "loss": 0.7087, + "step": 3157 + }, + { + "epoch": 1.5669604070994168, + "grad_norm": 0.13011358405310236, + "learning_rate": 3.3432022921493555e-06, + "loss": 0.723, + "step": 3158 + }, + { + "epoch": 1.567456869802656, + "grad_norm": 0.13401305719243067, + "learning_rate": 3.342282146838175e-06, + "loss": 0.6941, + "step": 3159 + }, + { + "epoch": 1.5679533325058954, + "grad_norm": 0.13781349363786358, + "learning_rate": 3.3413618727976718e-06, + "loss": 0.7399, + "step": 3160 + }, + { + "epoch": 1.568449795209135, + "grad_norm": 0.1374012136941711, + "learning_rate": 3.3404414701684966e-06, + "loss": 0.6933, + "step": 3161 + }, + { + "epoch": 1.5689462579123743, + "grad_norm": 0.1387539946595465, + "learning_rate": 3.3395209390913184e-06, + "loss": 0.7569, + "step": 3162 + }, + { + "epoch": 1.5694427206156139, + "grad_norm": 0.13114107541423803, + "learning_rate": 3.338600279706826e-06, + "loss": 0.7204, + "step": 3163 + }, + { + "epoch": 1.5699391833188532, + "grad_norm": 0.14071169984330964, + "learning_rate": 3.3376794921557266e-06, + "loss": 0.7178, + "step": 3164 + }, + { + "epoch": 1.5704356460220925, + "grad_norm": 0.13592040074130213, + "learning_rate": 3.3367585765787476e-06, + "loss": 0.688, + "step": 3165 + }, + { + "epoch": 1.5709321087253318, + "grad_norm": 0.14138854126933367, + "learning_rate": 3.3358375331166364e-06, + "loss": 0.7637, + "step": 3166 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.13989023590057628, + "learning_rate": 3.3349163619101606e-06, + "loss": 0.7293, + "step": 3167 + }, + { + "epoch": 1.571925034131811, + "grad_norm": 0.14193624544675174, + "learning_rate": 3.333995063100105e-06, + "loss": 0.6964, + "step": 3168 + }, + { + "epoch": 1.5724214968350503, + "grad_norm": 0.12940528095135467, + "learning_rate": 3.333073636827277e-06, + "loss": 0.7047, + "step": 3169 + }, + { + "epoch": 1.5729179595382896, + "grad_norm": 0.13154882534410767, + "learning_rate": 3.3321520832325e-06, + "loss": 0.7167, + "step": 3170 + }, + { + "epoch": 1.573414422241529, + "grad_norm": 0.13169987947643613, + "learning_rate": 3.33123040245662e-06, + "loss": 0.6984, + "step": 3171 + }, + { + "epoch": 1.5739108849447685, + "grad_norm": 0.12901406346373048, + "learning_rate": 3.3303085946405006e-06, + "loss": 0.7369, + "step": 3172 + }, + { + "epoch": 1.574407347648008, + "grad_norm": 0.1390711912220589, + "learning_rate": 3.329386659925025e-06, + "loss": 0.7334, + "step": 3173 + }, + { + "epoch": 1.5749038103512474, + "grad_norm": 0.13614690824410902, + "learning_rate": 3.3284645984510977e-06, + "loss": 0.7191, + "step": 3174 + }, + { + "epoch": 1.5754002730544867, + "grad_norm": 0.1354970816221801, + "learning_rate": 3.32754241035964e-06, + "loss": 0.7187, + "step": 3175 + }, + { + "epoch": 1.575896735757726, + "grad_norm": 0.13457360527562395, + "learning_rate": 3.3266200957915927e-06, + "loss": 0.713, + "step": 3176 + }, + { + "epoch": 1.5763931984609656, + "grad_norm": 0.1348370258972551, + "learning_rate": 3.3256976548879183e-06, + "loss": 0.7231, + "step": 3177 + }, + { + "epoch": 1.5768896611642051, + "grad_norm": 0.13666530116217873, + "learning_rate": 3.3247750877895955e-06, + "loss": 0.7021, + "step": 3178 + }, + { + "epoch": 1.5773861238674445, + "grad_norm": 0.13309005504337973, + "learning_rate": 3.3238523946376256e-06, + "loss": 0.6904, + "step": 3179 + }, + { + "epoch": 1.5778825865706838, + "grad_norm": 0.14080349679221404, + "learning_rate": 3.322929575573026e-06, + "loss": 0.7235, + "step": 3180 + }, + { + "epoch": 1.5783790492739231, + "grad_norm": 0.13251312434856302, + "learning_rate": 3.322006630736836e-06, + "loss": 0.701, + "step": 3181 + }, + { + "epoch": 1.5788755119771627, + "grad_norm": 0.13612624607173604, + "learning_rate": 3.321083560270112e-06, + "loss": 0.7359, + "step": 3182 + }, + { + "epoch": 1.5793719746804022, + "grad_norm": 0.13678666091572295, + "learning_rate": 3.3201603643139314e-06, + "loss": 0.7353, + "step": 3183 + }, + { + "epoch": 1.5798684373836416, + "grad_norm": 0.1369846073186433, + "learning_rate": 3.319237043009389e-06, + "loss": 0.7178, + "step": 3184 + }, + { + "epoch": 1.580364900086881, + "grad_norm": 0.13713704995068668, + "learning_rate": 3.3183135964976003e-06, + "loss": 0.7221, + "step": 3185 + }, + { + "epoch": 1.5808613627901202, + "grad_norm": 0.13486305893810832, + "learning_rate": 3.3173900249196986e-06, + "loss": 0.7516, + "step": 3186 + }, + { + "epoch": 1.5813578254933598, + "grad_norm": 0.13246788290751252, + "learning_rate": 3.3164663284168382e-06, + "loss": 0.6897, + "step": 3187 + }, + { + "epoch": 1.5818542881965993, + "grad_norm": 0.1338514865785925, + "learning_rate": 3.3155425071301894e-06, + "loss": 0.7198, + "step": 3188 + }, + { + "epoch": 1.5823507508998387, + "grad_norm": 0.1408007323666459, + "learning_rate": 3.3146185612009453e-06, + "loss": 0.695, + "step": 3189 + }, + { + "epoch": 1.582847213603078, + "grad_norm": 0.131260440618678, + "learning_rate": 3.313694490770316e-06, + "loss": 0.7185, + "step": 3190 + }, + { + "epoch": 1.5833436763063173, + "grad_norm": 0.13685270748076497, + "learning_rate": 3.31277029597953e-06, + "loss": 0.7291, + "step": 3191 + }, + { + "epoch": 1.583840139009557, + "grad_norm": 0.13649871749420334, + "learning_rate": 3.311845976969836e-06, + "loss": 0.7442, + "step": 3192 + }, + { + "epoch": 1.5843366017127964, + "grad_norm": 0.14178510091564836, + "learning_rate": 3.3109215338825008e-06, + "loss": 0.7721, + "step": 3193 + }, + { + "epoch": 1.5848330644160358, + "grad_norm": 0.13415549660130519, + "learning_rate": 3.3099969668588117e-06, + "loss": 0.7347, + "step": 3194 + }, + { + "epoch": 1.585329527119275, + "grad_norm": 0.1391423380776298, + "learning_rate": 3.309072276040074e-06, + "loss": 0.7648, + "step": 3195 + }, + { + "epoch": 1.5858259898225144, + "grad_norm": 0.13047621822203795, + "learning_rate": 3.3081474615676106e-06, + "loss": 0.7229, + "step": 3196 + }, + { + "epoch": 1.586322452525754, + "grad_norm": 0.14349775839749804, + "learning_rate": 3.307222523582766e-06, + "loss": 0.7207, + "step": 3197 + }, + { + "epoch": 1.5868189152289935, + "grad_norm": 0.1333296259536943, + "learning_rate": 3.3062974622269006e-06, + "loss": 0.7344, + "step": 3198 + }, + { + "epoch": 1.5873153779322329, + "grad_norm": 0.1336681573376393, + "learning_rate": 3.3053722776413978e-06, + "loss": 0.6781, + "step": 3199 + }, + { + "epoch": 1.5878118406354722, + "grad_norm": 0.13336081958471946, + "learning_rate": 3.304446969967654e-06, + "loss": 0.6875, + "step": 3200 + }, + { + "epoch": 1.5883083033387115, + "grad_norm": 0.1301759528310653, + "learning_rate": 3.3035215393470896e-06, + "loss": 0.6865, + "step": 3201 + }, + { + "epoch": 1.588804766041951, + "grad_norm": 0.13281805769994326, + "learning_rate": 3.3025959859211416e-06, + "loss": 0.7184, + "step": 3202 + }, + { + "epoch": 1.5893012287451906, + "grad_norm": 0.1350945524620567, + "learning_rate": 3.3016703098312653e-06, + "loss": 0.7637, + "step": 3203 + }, + { + "epoch": 1.58979769144843, + "grad_norm": 0.13780227036057782, + "learning_rate": 3.3007445112189362e-06, + "loss": 0.6973, + "step": 3204 + }, + { + "epoch": 1.5902941541516693, + "grad_norm": 0.1367630069339855, + "learning_rate": 3.2998185902256475e-06, + "loss": 0.724, + "step": 3205 + }, + { + "epoch": 1.5907906168549086, + "grad_norm": 0.13924121360508623, + "learning_rate": 3.298892546992912e-06, + "loss": 0.7636, + "step": 3206 + }, + { + "epoch": 1.5912870795581482, + "grad_norm": 0.13353110373070412, + "learning_rate": 3.29796638166226e-06, + "loss": 0.7253, + "step": 3207 + }, + { + "epoch": 1.5917835422613877, + "grad_norm": 0.13363257815429952, + "learning_rate": 3.29704009437524e-06, + "loss": 0.709, + "step": 3208 + }, + { + "epoch": 1.592280004964627, + "grad_norm": 0.1323634598863784, + "learning_rate": 3.2961136852734215e-06, + "loss": 0.7379, + "step": 3209 + }, + { + "epoch": 1.5927764676678664, + "grad_norm": 0.1397078632401545, + "learning_rate": 3.295187154498391e-06, + "loss": 0.7352, + "step": 3210 + }, + { + "epoch": 1.5932729303711057, + "grad_norm": 0.13431457529442403, + "learning_rate": 3.2942605021917535e-06, + "loss": 0.7324, + "step": 3211 + }, + { + "epoch": 1.5937693930743453, + "grad_norm": 0.1365499136783772, + "learning_rate": 3.2933337284951338e-06, + "loss": 0.7463, + "step": 3212 + }, + { + "epoch": 1.5942658557775848, + "grad_norm": 0.13865059264043936, + "learning_rate": 3.2924068335501734e-06, + "loss": 0.7572, + "step": 3213 + }, + { + "epoch": 1.5947623184808242, + "grad_norm": 0.13306898436599435, + "learning_rate": 3.291479817498534e-06, + "loss": 0.6667, + "step": 3214 + }, + { + "epoch": 1.5952587811840635, + "grad_norm": 0.12929290122854256, + "learning_rate": 3.2905526804818954e-06, + "loss": 0.7137, + "step": 3215 + }, + { + "epoch": 1.5957552438873028, + "grad_norm": 0.1372438954972776, + "learning_rate": 3.2896254226419543e-06, + "loss": 0.7392, + "step": 3216 + }, + { + "epoch": 1.5962517065905424, + "grad_norm": 0.13554630669851628, + "learning_rate": 3.2886980441204287e-06, + "loss": 0.7278, + "step": 3217 + }, + { + "epoch": 1.596748169293782, + "grad_norm": 0.1304930159730209, + "learning_rate": 3.2877705450590525e-06, + "loss": 0.6764, + "step": 3218 + }, + { + "epoch": 1.5972446319970213, + "grad_norm": 0.13425983285516158, + "learning_rate": 3.2868429255995792e-06, + "loss": 0.7731, + "step": 3219 + }, + { + "epoch": 1.5977410947002606, + "grad_norm": 0.13109350786854748, + "learning_rate": 3.2859151858837807e-06, + "loss": 0.7339, + "step": 3220 + }, + { + "epoch": 1.5982375574035, + "grad_norm": 0.13818419724327707, + "learning_rate": 3.2849873260534472e-06, + "loss": 0.7421, + "step": 3221 + }, + { + "epoch": 1.5987340201067395, + "grad_norm": 0.13351674846911626, + "learning_rate": 3.2840593462503878e-06, + "loss": 0.7282, + "step": 3222 + }, + { + "epoch": 1.599230482809979, + "grad_norm": 0.1375585476076764, + "learning_rate": 3.283131246616429e-06, + "loss": 0.6996, + "step": 3223 + }, + { + "epoch": 1.5997269455132184, + "grad_norm": 0.13392787493113698, + "learning_rate": 3.2822030272934154e-06, + "loss": 0.7567, + "step": 3224 + }, + { + "epoch": 1.6002234082164577, + "grad_norm": 0.1319504916436644, + "learning_rate": 3.2812746884232106e-06, + "loss": 0.6982, + "step": 3225 + }, + { + "epoch": 1.600719870919697, + "grad_norm": 0.13878346494318192, + "learning_rate": 3.2803462301476962e-06, + "loss": 0.7176, + "step": 3226 + }, + { + "epoch": 1.6012163336229366, + "grad_norm": 0.13949329751448514, + "learning_rate": 3.2794176526087723e-06, + "loss": 0.7316, + "step": 3227 + }, + { + "epoch": 1.6017127963261761, + "grad_norm": 0.13219100886136206, + "learning_rate": 3.278488955948358e-06, + "loss": 0.6974, + "step": 3228 + }, + { + "epoch": 1.6022092590294155, + "grad_norm": 0.12840250780955081, + "learning_rate": 3.2775601403083886e-06, + "loss": 0.7009, + "step": 3229 + }, + { + "epoch": 1.6027057217326548, + "grad_norm": 0.13490025484599139, + "learning_rate": 3.276631205830818e-06, + "loss": 0.6955, + "step": 3230 + }, + { + "epoch": 1.6032021844358941, + "grad_norm": 0.1433135689367546, + "learning_rate": 3.2757021526576216e-06, + "loss": 0.7346, + "step": 3231 + }, + { + "epoch": 1.6036986471391337, + "grad_norm": 0.1594064448899915, + "learning_rate": 3.2747729809307878e-06, + "loss": 0.7767, + "step": 3232 + }, + { + "epoch": 1.6041951098423732, + "grad_norm": 0.13512124092823508, + "learning_rate": 3.2738436907923267e-06, + "loss": 0.732, + "step": 3233 + }, + { + "epoch": 1.6046915725456126, + "grad_norm": 0.13435293487480612, + "learning_rate": 3.2729142823842645e-06, + "loss": 0.7113, + "step": 3234 + }, + { + "epoch": 1.605188035248852, + "grad_norm": 0.1319079246448511, + "learning_rate": 3.2719847558486474e-06, + "loss": 0.7377, + "step": 3235 + }, + { + "epoch": 1.6056844979520912, + "grad_norm": 0.1279027378230854, + "learning_rate": 3.271055111327538e-06, + "loss": 0.7124, + "step": 3236 + }, + { + "epoch": 1.6061809606553308, + "grad_norm": 0.13463254886388712, + "learning_rate": 3.2701253489630175e-06, + "loss": 0.699, + "step": 3237 + }, + { + "epoch": 1.6066774233585703, + "grad_norm": 0.1366952695028082, + "learning_rate": 3.2691954688971862e-06, + "loss": 0.7035, + "step": 3238 + }, + { + "epoch": 1.6071738860618097, + "grad_norm": 0.13669325319793554, + "learning_rate": 3.2682654712721605e-06, + "loss": 0.7521, + "step": 3239 + }, + { + "epoch": 1.607670348765049, + "grad_norm": 0.13083810989888964, + "learning_rate": 3.2673353562300753e-06, + "loss": 0.7283, + "step": 3240 + }, + { + "epoch": 1.6081668114682883, + "grad_norm": 0.14043885513177584, + "learning_rate": 3.266405123913084e-06, + "loss": 0.7589, + "step": 3241 + }, + { + "epoch": 1.6086632741715279, + "grad_norm": 0.13900658899664461, + "learning_rate": 3.265474774463358e-06, + "loss": 0.7308, + "step": 3242 + }, + { + "epoch": 1.6091597368747674, + "grad_norm": 0.14121156080517872, + "learning_rate": 3.264544308023086e-06, + "loss": 0.7011, + "step": 3243 + }, + { + "epoch": 1.6096561995780068, + "grad_norm": 0.13303533153675115, + "learning_rate": 3.2636137247344756e-06, + "loss": 0.6789, + "step": 3244 + }, + { + "epoch": 1.610152662281246, + "grad_norm": 0.1345546966620587, + "learning_rate": 3.2626830247397507e-06, + "loss": 0.7209, + "step": 3245 + }, + { + "epoch": 1.6106491249844854, + "grad_norm": 0.13372939850048826, + "learning_rate": 3.261752208181154e-06, + "loss": 0.7131, + "step": 3246 + }, + { + "epoch": 1.611145587687725, + "grad_norm": 0.13637214131020378, + "learning_rate": 3.260821275200947e-06, + "loss": 0.7006, + "step": 3247 + }, + { + "epoch": 1.6116420503909645, + "grad_norm": 0.133959325485992, + "learning_rate": 3.2598902259414055e-06, + "loss": 0.71, + "step": 3248 + }, + { + "epoch": 1.6121385130942039, + "grad_norm": 0.1317569091674975, + "learning_rate": 3.258959060544828e-06, + "loss": 0.6934, + "step": 3249 + }, + { + "epoch": 1.6126349757974432, + "grad_norm": 0.13549906434028006, + "learning_rate": 3.258027779153527e-06, + "loss": 0.7516, + "step": 3250 + }, + { + "epoch": 1.6131314385006825, + "grad_norm": 0.13646477270544233, + "learning_rate": 3.2570963819098333e-06, + "loss": 0.7309, + "step": 3251 + }, + { + "epoch": 1.613627901203922, + "grad_norm": 0.13531221765549428, + "learning_rate": 3.2561648689560972e-06, + "loss": 0.7276, + "step": 3252 + }, + { + "epoch": 1.6141243639071616, + "grad_norm": 0.13706437632515092, + "learning_rate": 3.2552332404346847e-06, + "loss": 0.7446, + "step": 3253 + }, + { + "epoch": 1.614620826610401, + "grad_norm": 0.12982692323683104, + "learning_rate": 3.2543014964879814e-06, + "loss": 0.6706, + "step": 3254 + }, + { + "epoch": 1.6151172893136403, + "grad_norm": 0.1417142856105007, + "learning_rate": 3.2533696372583886e-06, + "loss": 0.7691, + "step": 3255 + }, + { + "epoch": 1.6156137520168796, + "grad_norm": 0.1338299722054298, + "learning_rate": 3.2524376628883254e-06, + "loss": 0.7318, + "step": 3256 + }, + { + "epoch": 1.6161102147201192, + "grad_norm": 0.13244784626065542, + "learning_rate": 3.25150557352023e-06, + "loss": 0.6839, + "step": 3257 + }, + { + "epoch": 1.6166066774233587, + "grad_norm": 0.1373652541161793, + "learning_rate": 3.250573369296557e-06, + "loss": 0.7263, + "step": 3258 + }, + { + "epoch": 1.617103140126598, + "grad_norm": 0.13733039620980356, + "learning_rate": 3.249641050359779e-06, + "loss": 0.7403, + "step": 3259 + }, + { + "epoch": 1.6175996028298374, + "grad_norm": 0.14578749260154306, + "learning_rate": 3.248708616852387e-06, + "loss": 0.789, + "step": 3260 + }, + { + "epoch": 1.6180960655330767, + "grad_norm": 0.137192295052107, + "learning_rate": 3.247776068916887e-06, + "loss": 0.7158, + "step": 3261 + }, + { + "epoch": 1.6185925282363163, + "grad_norm": 0.1343518895157817, + "learning_rate": 3.246843406695804e-06, + "loss": 0.6879, + "step": 3262 + }, + { + "epoch": 1.6190889909395558, + "grad_norm": 0.13371425918634516, + "learning_rate": 3.245910630331682e-06, + "loss": 0.7908, + "step": 3263 + }, + { + "epoch": 1.6195854536427952, + "grad_norm": 0.13354797872889265, + "learning_rate": 3.2449777399670788e-06, + "loss": 0.6948, + "step": 3264 + }, + { + "epoch": 1.6200819163460345, + "grad_norm": 0.13151382649489246, + "learning_rate": 3.2440447357445737e-06, + "loss": 0.7567, + "step": 3265 + }, + { + "epoch": 1.6205783790492738, + "grad_norm": 0.13595368534380006, + "learning_rate": 3.243111617806761e-06, + "loss": 0.7589, + "step": 3266 + }, + { + "epoch": 1.6210748417525134, + "grad_norm": 0.1365032790773031, + "learning_rate": 3.2421783862962513e-06, + "loss": 0.7405, + "step": 3267 + }, + { + "epoch": 1.621571304455753, + "grad_norm": 0.13435638512861106, + "learning_rate": 3.2412450413556753e-06, + "loss": 0.7066, + "step": 3268 + }, + { + "epoch": 1.6220677671589923, + "grad_norm": 0.13947202661774769, + "learning_rate": 3.24031158312768e-06, + "loss": 0.7538, + "step": 3269 + }, + { + "epoch": 1.6225642298622316, + "grad_norm": 0.13439183598377127, + "learning_rate": 3.2393780117549294e-06, + "loss": 0.7072, + "step": 3270 + }, + { + "epoch": 1.623060692565471, + "grad_norm": 0.13125548459747396, + "learning_rate": 3.238444327380105e-06, + "loss": 0.7033, + "step": 3271 + }, + { + "epoch": 1.6235571552687105, + "grad_norm": 0.13335877452614855, + "learning_rate": 3.2375105301459046e-06, + "loss": 0.7374, + "step": 3272 + }, + { + "epoch": 1.6240536179719498, + "grad_norm": 0.13797996283169242, + "learning_rate": 3.236576620195045e-06, + "loss": 0.7282, + "step": 3273 + }, + { + "epoch": 1.6245500806751894, + "grad_norm": 0.1346903208785757, + "learning_rate": 3.2356425976702587e-06, + "loss": 0.7005, + "step": 3274 + }, + { + "epoch": 1.6250465433784287, + "grad_norm": 0.13164913588291022, + "learning_rate": 3.2347084627142967e-06, + "loss": 0.712, + "step": 3275 + }, + { + "epoch": 1.625543006081668, + "grad_norm": 0.13395069471794036, + "learning_rate": 3.233774215469927e-06, + "loss": 0.6553, + "step": 3276 + }, + { + "epoch": 1.6260394687849076, + "grad_norm": 0.12946118362950973, + "learning_rate": 3.2328398560799327e-06, + "loss": 0.7068, + "step": 3277 + }, + { + "epoch": 1.626535931488147, + "grad_norm": 0.13651890789151064, + "learning_rate": 3.231905384687117e-06, + "loss": 0.7263, + "step": 3278 + }, + { + "epoch": 1.6270323941913865, + "grad_norm": 0.13320408589478813, + "learning_rate": 3.2309708014342987e-06, + "loss": 0.659, + "step": 3279 + }, + { + "epoch": 1.6275288568946258, + "grad_norm": 0.13232211632916635, + "learning_rate": 3.2300361064643133e-06, + "loss": 0.695, + "step": 3280 + }, + { + "epoch": 1.6280253195978651, + "grad_norm": 0.13567576909886284, + "learning_rate": 3.229101299920014e-06, + "loss": 0.7111, + "step": 3281 + }, + { + "epoch": 1.6285217823011047, + "grad_norm": 0.13518794018443017, + "learning_rate": 3.228166381944272e-06, + "loss": 0.7415, + "step": 3282 + }, + { + "epoch": 1.629018245004344, + "grad_norm": 0.13083493674868937, + "learning_rate": 3.227231352679973e-06, + "loss": 0.7146, + "step": 3283 + }, + { + "epoch": 1.6295147077075836, + "grad_norm": 0.14297032801648918, + "learning_rate": 3.2262962122700224e-06, + "loss": 0.7503, + "step": 3284 + }, + { + "epoch": 1.6300111704108229, + "grad_norm": 0.13219562864852205, + "learning_rate": 3.2253609608573412e-06, + "loss": 0.7098, + "step": 3285 + }, + { + "epoch": 1.6305076331140622, + "grad_norm": 0.1356006220123195, + "learning_rate": 3.2244255985848664e-06, + "loss": 0.7158, + "step": 3286 + }, + { + "epoch": 1.6310040958173018, + "grad_norm": 0.13516073020936664, + "learning_rate": 3.2234901255955554e-06, + "loss": 0.7443, + "step": 3287 + }, + { + "epoch": 1.631500558520541, + "grad_norm": 0.13223315868550364, + "learning_rate": 3.222554542032379e-06, + "loss": 0.7013, + "step": 3288 + }, + { + "epoch": 1.6319970212237807, + "grad_norm": 0.1291224602102518, + "learning_rate": 3.221618848038326e-06, + "loss": 0.725, + "step": 3289 + }, + { + "epoch": 1.63249348392702, + "grad_norm": 0.13959827562469576, + "learning_rate": 3.220683043756402e-06, + "loss": 0.7296, + "step": 3290 + }, + { + "epoch": 1.6329899466302593, + "grad_norm": 0.14159509303723317, + "learning_rate": 3.2197471293296297e-06, + "loss": 0.7486, + "step": 3291 + }, + { + "epoch": 1.6334864093334989, + "grad_norm": 0.13293711515989248, + "learning_rate": 3.2188111049010496e-06, + "loss": 0.7258, + "step": 3292 + }, + { + "epoch": 1.6339828720367382, + "grad_norm": 0.1399399476886841, + "learning_rate": 3.217874970613717e-06, + "loss": 0.7762, + "step": 3293 + }, + { + "epoch": 1.6344793347399778, + "grad_norm": 0.13066912497887148, + "learning_rate": 3.216938726610705e-06, + "loss": 0.682, + "step": 3294 + }, + { + "epoch": 1.634975797443217, + "grad_norm": 0.13206917222232278, + "learning_rate": 3.2160023730351047e-06, + "loss": 0.6876, + "step": 3295 + }, + { + "epoch": 1.6354722601464564, + "grad_norm": 0.12881195914477797, + "learning_rate": 3.2150659100300215e-06, + "loss": 0.7009, + "step": 3296 + }, + { + "epoch": 1.635968722849696, + "grad_norm": 0.1354978102327594, + "learning_rate": 3.214129337738579e-06, + "loss": 0.7216, + "step": 3297 + }, + { + "epoch": 1.6364651855529353, + "grad_norm": 0.13272278700868193, + "learning_rate": 3.2131926563039184e-06, + "loss": 0.7671, + "step": 3298 + }, + { + "epoch": 1.6369616482561749, + "grad_norm": 0.13268702786871522, + "learning_rate": 3.2122558658691943e-06, + "loss": 0.7306, + "step": 3299 + }, + { + "epoch": 1.6374581109594142, + "grad_norm": 0.14693293257842704, + "learning_rate": 3.2113189665775812e-06, + "loss": 0.7262, + "step": 3300 + }, + { + "epoch": 1.6379545736626535, + "grad_norm": 0.13647387480975237, + "learning_rate": 3.2103819585722696e-06, + "loss": 0.7228, + "step": 3301 + }, + { + "epoch": 1.638451036365893, + "grad_norm": 0.13489368002148763, + "learning_rate": 3.2094448419964657e-06, + "loss": 0.72, + "step": 3302 + }, + { + "epoch": 1.6389474990691324, + "grad_norm": 0.1334332663332396, + "learning_rate": 3.208507616993393e-06, + "loss": 0.6933, + "step": 3303 + }, + { + "epoch": 1.639443961772372, + "grad_norm": 0.13159985839475669, + "learning_rate": 3.2075702837062915e-06, + "loss": 0.6915, + "step": 3304 + }, + { + "epoch": 1.6399404244756113, + "grad_norm": 0.13257761470120857, + "learning_rate": 3.2066328422784166e-06, + "loss": 0.733, + "step": 3305 + }, + { + "epoch": 1.6404368871788506, + "grad_norm": 0.13649241457619013, + "learning_rate": 3.2056952928530417e-06, + "loss": 0.7765, + "step": 3306 + }, + { + "epoch": 1.64093334988209, + "grad_norm": 0.12816321098365577, + "learning_rate": 3.204757635573456e-06, + "loss": 0.6676, + "step": 3307 + }, + { + "epoch": 1.6414298125853295, + "grad_norm": 0.1357189860674346, + "learning_rate": 3.2038198705829658e-06, + "loss": 0.7106, + "step": 3308 + }, + { + "epoch": 1.641926275288569, + "grad_norm": 0.12905046251866562, + "learning_rate": 3.202881998024894e-06, + "loss": 0.6678, + "step": 3309 + }, + { + "epoch": 1.6424227379918084, + "grad_norm": 0.13424725400492285, + "learning_rate": 3.2019440180425774e-06, + "loss": 0.6833, + "step": 3310 + }, + { + "epoch": 1.6429192006950477, + "grad_norm": 0.1310598204136392, + "learning_rate": 3.201005930779374e-06, + "loss": 0.7341, + "step": 3311 + }, + { + "epoch": 1.643415663398287, + "grad_norm": 0.14192929179095823, + "learning_rate": 3.2000677363786525e-06, + "loss": 0.6982, + "step": 3312 + }, + { + "epoch": 1.6439121261015266, + "grad_norm": 0.12782211475807093, + "learning_rate": 3.199129434983803e-06, + "loss": 0.6782, + "step": 3313 + }, + { + "epoch": 1.6444085888047661, + "grad_norm": 0.13517571601842548, + "learning_rate": 3.1981910267382294e-06, + "loss": 0.7526, + "step": 3314 + }, + { + "epoch": 1.6449050515080055, + "grad_norm": 0.13283442693929196, + "learning_rate": 3.197252511785351e-06, + "loss": 0.7385, + "step": 3315 + }, + { + "epoch": 1.6454015142112448, + "grad_norm": 0.129952438407659, + "learning_rate": 3.196313890268606e-06, + "loss": 0.6833, + "step": 3316 + }, + { + "epoch": 1.6458979769144841, + "grad_norm": 0.13333153615133936, + "learning_rate": 3.1953751623314476e-06, + "loss": 0.7165, + "step": 3317 + }, + { + "epoch": 1.6463944396177237, + "grad_norm": 0.139666486016696, + "learning_rate": 3.1944363281173445e-06, + "loss": 0.7806, + "step": 3318 + }, + { + "epoch": 1.6468909023209632, + "grad_norm": 0.13280138772226624, + "learning_rate": 3.1934973877697843e-06, + "loss": 0.7155, + "step": 3319 + }, + { + "epoch": 1.6473873650242026, + "grad_norm": 0.13553005489923173, + "learning_rate": 3.1925583414322677e-06, + "loss": 0.7518, + "step": 3320 + }, + { + "epoch": 1.647883827727442, + "grad_norm": 0.13507802161305021, + "learning_rate": 3.191619189248312e-06, + "loss": 0.7639, + "step": 3321 + }, + { + "epoch": 1.6483802904306812, + "grad_norm": 0.13564619513742512, + "learning_rate": 3.1906799313614526e-06, + "loss": 0.7453, + "step": 3322 + }, + { + "epoch": 1.6488767531339208, + "grad_norm": 0.13874442942300055, + "learning_rate": 3.1897405679152403e-06, + "loss": 0.6741, + "step": 3323 + }, + { + "epoch": 1.6493732158371603, + "grad_norm": 0.14111170898428813, + "learning_rate": 3.1888010990532412e-06, + "loss": 0.7146, + "step": 3324 + }, + { + "epoch": 1.6498696785403997, + "grad_norm": 0.1284924018943679, + "learning_rate": 3.1878615249190386e-06, + "loss": 0.6811, + "step": 3325 + }, + { + "epoch": 1.650366141243639, + "grad_norm": 0.14995165389634305, + "learning_rate": 3.18692184565623e-06, + "loss": 0.6953, + "step": 3326 + }, + { + "epoch": 1.6508626039468783, + "grad_norm": 0.13386502577631518, + "learning_rate": 3.1859820614084326e-06, + "loss": 0.7294, + "step": 3327 + }, + { + "epoch": 1.651359066650118, + "grad_norm": 0.13753075751861904, + "learning_rate": 3.1850421723192748e-06, + "loss": 0.7097, + "step": 3328 + }, + { + "epoch": 1.6518555293533574, + "grad_norm": 0.1372523640506848, + "learning_rate": 3.1841021785324057e-06, + "loss": 0.7533, + "step": 3329 + }, + { + "epoch": 1.6523519920565968, + "grad_norm": 0.13132741612801554, + "learning_rate": 3.183162080191487e-06, + "loss": 0.7468, + "step": 3330 + }, + { + "epoch": 1.652848454759836, + "grad_norm": 0.14093059894911422, + "learning_rate": 3.1822218774401974e-06, + "loss": 0.7054, + "step": 3331 + }, + { + "epoch": 1.6533449174630754, + "grad_norm": 0.13503393548333384, + "learning_rate": 3.1812815704222326e-06, + "loss": 0.7092, + "step": 3332 + }, + { + "epoch": 1.653841380166315, + "grad_norm": 0.13795156477927883, + "learning_rate": 3.1803411592813036e-06, + "loss": 0.7342, + "step": 3333 + }, + { + "epoch": 1.6543378428695545, + "grad_norm": 0.14715484138992083, + "learning_rate": 3.179400644161137e-06, + "loss": 0.7639, + "step": 3334 + }, + { + "epoch": 1.6548343055727939, + "grad_norm": 0.13505583893408066, + "learning_rate": 3.1784600252054752e-06, + "loss": 0.7639, + "step": 3335 + }, + { + "epoch": 1.6553307682760332, + "grad_norm": 0.13398674733013766, + "learning_rate": 3.1775193025580774e-06, + "loss": 0.7302, + "step": 3336 + }, + { + "epoch": 1.6558272309792725, + "grad_norm": 0.13122340668034163, + "learning_rate": 3.176578476362716e-06, + "loss": 0.6994, + "step": 3337 + }, + { + "epoch": 1.656323693682512, + "grad_norm": 0.1392905054739376, + "learning_rate": 3.1756375467631832e-06, + "loss": 0.743, + "step": 3338 + }, + { + "epoch": 1.6568201563857516, + "grad_norm": 0.13237002846348273, + "learning_rate": 3.1746965139032846e-06, + "loss": 0.7088, + "step": 3339 + }, + { + "epoch": 1.657316619088991, + "grad_norm": 0.1342680889751068, + "learning_rate": 3.1737553779268415e-06, + "loss": 0.742, + "step": 3340 + }, + { + "epoch": 1.6578130817922303, + "grad_norm": 0.13437620364569894, + "learning_rate": 3.172814138977692e-06, + "loss": 0.7017, + "step": 3341 + }, + { + "epoch": 1.6583095444954696, + "grad_norm": 0.13152990564641498, + "learning_rate": 3.1718727971996893e-06, + "loss": 0.6658, + "step": 3342 + }, + { + "epoch": 1.6588060071987092, + "grad_norm": 0.13578569658737435, + "learning_rate": 3.1709313527367025e-06, + "loss": 0.7434, + "step": 3343 + }, + { + "epoch": 1.6593024699019487, + "grad_norm": 0.13282327497087398, + "learning_rate": 3.1699898057326156e-06, + "loss": 0.684, + "step": 3344 + }, + { + "epoch": 1.659798932605188, + "grad_norm": 0.1266967797135504, + "learning_rate": 3.169048156331329e-06, + "loss": 0.7131, + "step": 3345 + }, + { + "epoch": 1.6602953953084274, + "grad_norm": 0.14302577618653753, + "learning_rate": 3.168106404676759e-06, + "loss": 0.7456, + "step": 3346 + }, + { + "epoch": 1.6607918580116667, + "grad_norm": 0.1366996921041571, + "learning_rate": 3.167164550912838e-06, + "loss": 0.7763, + "step": 3347 + }, + { + "epoch": 1.6612883207149063, + "grad_norm": 0.13073297302259157, + "learning_rate": 3.1662225951835124e-06, + "loss": 0.7044, + "step": 3348 + }, + { + "epoch": 1.6617847834181458, + "grad_norm": 0.12656041564587794, + "learning_rate": 3.1652805376327446e-06, + "loss": 0.6735, + "step": 3349 + }, + { + "epoch": 1.6622812461213852, + "grad_norm": 0.13902637524276387, + "learning_rate": 3.164338378404514e-06, + "loss": 0.7789, + "step": 3350 + }, + { + "epoch": 1.6627777088246245, + "grad_norm": 0.13433714640026365, + "learning_rate": 3.163396117642814e-06, + "loss": 0.7237, + "step": 3351 + }, + { + "epoch": 1.6632741715278638, + "grad_norm": 0.13306259899090125, + "learning_rate": 3.162453755491655e-06, + "loss": 0.6549, + "step": 3352 + }, + { + "epoch": 1.6637706342311034, + "grad_norm": 0.14000124931833177, + "learning_rate": 3.1615112920950604e-06, + "loss": 0.7011, + "step": 3353 + }, + { + "epoch": 1.664267096934343, + "grad_norm": 0.13277684994304378, + "learning_rate": 3.160568727597071e-06, + "loss": 0.7056, + "step": 3354 + }, + { + "epoch": 1.6647635596375823, + "grad_norm": 0.13574274974818243, + "learning_rate": 3.159626062141743e-06, + "loss": 0.6961, + "step": 3355 + }, + { + "epoch": 1.6652600223408216, + "grad_norm": 0.13515486910318286, + "learning_rate": 3.158683295873148e-06, + "loss": 0.7505, + "step": 3356 + }, + { + "epoch": 1.665756485044061, + "grad_norm": 0.13117466652194615, + "learning_rate": 3.157740428935373e-06, + "loss": 0.6759, + "step": 3357 + }, + { + "epoch": 1.6662529477473005, + "grad_norm": 0.13734801985031647, + "learning_rate": 3.156797461472518e-06, + "loss": 0.7327, + "step": 3358 + }, + { + "epoch": 1.66674941045054, + "grad_norm": 0.12921730687145758, + "learning_rate": 3.155854393628704e-06, + "loss": 0.7142, + "step": 3359 + }, + { + "epoch": 1.6672458731537794, + "grad_norm": 0.13410715617644764, + "learning_rate": 3.15491122554806e-06, + "loss": 0.6928, + "step": 3360 + }, + { + "epoch": 1.6677423358570187, + "grad_norm": 0.13115756337495957, + "learning_rate": 3.1539679573747366e-06, + "loss": 0.7292, + "step": 3361 + }, + { + "epoch": 1.668238798560258, + "grad_norm": 0.13270364362867004, + "learning_rate": 3.1530245892528964e-06, + "loss": 0.6981, + "step": 3362 + }, + { + "epoch": 1.6687352612634976, + "grad_norm": 0.1312122707358505, + "learning_rate": 3.1520811213267187e-06, + "loss": 0.7039, + "step": 3363 + }, + { + "epoch": 1.6692317239667371, + "grad_norm": 0.13612808788926317, + "learning_rate": 3.151137553740396e-06, + "loss": 0.7128, + "step": 3364 + }, + { + "epoch": 1.6697281866699765, + "grad_norm": 0.1356721052844853, + "learning_rate": 3.150193886638139e-06, + "loss": 0.7021, + "step": 3365 + }, + { + "epoch": 1.6702246493732158, + "grad_norm": 0.1334498879831998, + "learning_rate": 3.149250120164171e-06, + "loss": 0.6959, + "step": 3366 + }, + { + "epoch": 1.6707211120764551, + "grad_norm": 0.12993420392929775, + "learning_rate": 3.148306254462733e-06, + "loss": 0.7035, + "step": 3367 + }, + { + "epoch": 1.6712175747796947, + "grad_norm": 0.12953639610818973, + "learning_rate": 3.1473622896780787e-06, + "loss": 0.6884, + "step": 3368 + }, + { + "epoch": 1.6717140374829342, + "grad_norm": 0.13434437759235057, + "learning_rate": 3.1464182259544774e-06, + "loss": 0.7411, + "step": 3369 + }, + { + "epoch": 1.6722105001861736, + "grad_norm": 0.1351221908451605, + "learning_rate": 3.1454740634362146e-06, + "loss": 0.7518, + "step": 3370 + }, + { + "epoch": 1.672706962889413, + "grad_norm": 0.13151338326848652, + "learning_rate": 3.144529802267591e-06, + "loss": 0.7119, + "step": 3371 + }, + { + "epoch": 1.6732034255926522, + "grad_norm": 0.1330736698831485, + "learning_rate": 3.1435854425929207e-06, + "loss": 0.7364, + "step": 3372 + }, + { + "epoch": 1.6736998882958918, + "grad_norm": 0.13120786063832604, + "learning_rate": 3.142640984556536e-06, + "loss": 0.6817, + "step": 3373 + }, + { + "epoch": 1.6741963509991313, + "grad_norm": 0.13199790092665806, + "learning_rate": 3.1416964283027796e-06, + "loss": 0.6751, + "step": 3374 + }, + { + "epoch": 1.6746928137023707, + "grad_norm": 0.13247768384510822, + "learning_rate": 3.1407517739760135e-06, + "loss": 0.7246, + "step": 3375 + }, + { + "epoch": 1.67518927640561, + "grad_norm": 0.15212250506043967, + "learning_rate": 3.139807021720613e-06, + "loss": 0.7784, + "step": 3376 + }, + { + "epoch": 1.6756857391088493, + "grad_norm": 0.13118756683065375, + "learning_rate": 3.1388621716809664e-06, + "loss": 0.6729, + "step": 3377 + }, + { + "epoch": 1.6761822018120889, + "grad_norm": 0.1288392412087169, + "learning_rate": 3.137917224001481e-06, + "loss": 0.6989, + "step": 3378 + }, + { + "epoch": 1.6766786645153284, + "grad_norm": 0.1315769406124471, + "learning_rate": 3.136972178826576e-06, + "loss": 0.6975, + "step": 3379 + }, + { + "epoch": 1.6771751272185678, + "grad_norm": 0.13147429747221198, + "learning_rate": 3.136027036300687e-06, + "loss": 0.733, + "step": 3380 + }, + { + "epoch": 1.677671589921807, + "grad_norm": 0.12989489332738152, + "learning_rate": 3.1350817965682624e-06, + "loss": 0.7395, + "step": 3381 + }, + { + "epoch": 1.6781680526250464, + "grad_norm": 0.1352912050224614, + "learning_rate": 3.1341364597737684e-06, + "loss": 0.7523, + "step": 3382 + }, + { + "epoch": 1.678664515328286, + "grad_norm": 0.13840497131463406, + "learning_rate": 3.1331910260616845e-06, + "loss": 0.7422, + "step": 3383 + }, + { + "epoch": 1.6791609780315255, + "grad_norm": 0.132998592069353, + "learning_rate": 3.132245495576505e-06, + "loss": 0.7312, + "step": 3384 + }, + { + "epoch": 1.6796574407347649, + "grad_norm": 0.12905667482053407, + "learning_rate": 3.1312998684627383e-06, + "loss": 0.721, + "step": 3385 + }, + { + "epoch": 1.6801539034380042, + "grad_norm": 0.1399236151235144, + "learning_rate": 3.130354144864909e-06, + "loss": 0.733, + "step": 3386 + }, + { + "epoch": 1.6806503661412435, + "grad_norm": 0.13157662254402422, + "learning_rate": 3.1294083249275546e-06, + "loss": 0.727, + "step": 3387 + }, + { + "epoch": 1.681146828844483, + "grad_norm": 0.13230069517151366, + "learning_rate": 3.1284624087952307e-06, + "loss": 0.7103, + "step": 3388 + }, + { + "epoch": 1.6816432915477226, + "grad_norm": 0.134244822451022, + "learning_rate": 3.1275163966125042e-06, + "loss": 0.7265, + "step": 3389 + }, + { + "epoch": 1.682139754250962, + "grad_norm": 0.134846550069988, + "learning_rate": 3.1265702885239573e-06, + "loss": 0.6948, + "step": 3390 + }, + { + "epoch": 1.6826362169542013, + "grad_norm": 0.1312183196880194, + "learning_rate": 3.125624084674188e-06, + "loss": 0.7065, + "step": 3391 + }, + { + "epoch": 1.6831326796574406, + "grad_norm": 0.13279036282716666, + "learning_rate": 3.12467778520781e-06, + "loss": 0.7109, + "step": 3392 + }, + { + "epoch": 1.6836291423606802, + "grad_norm": 0.13146686432062818, + "learning_rate": 3.1237313902694467e-06, + "loss": 0.6808, + "step": 3393 + }, + { + "epoch": 1.6841256050639197, + "grad_norm": 0.13576130928931848, + "learning_rate": 3.122784900003742e-06, + "loss": 0.7479, + "step": 3394 + }, + { + "epoch": 1.684622067767159, + "grad_norm": 0.1337831193034485, + "learning_rate": 3.121838314555351e-06, + "loss": 0.8127, + "step": 3395 + }, + { + "epoch": 1.6851185304703984, + "grad_norm": 0.1388930824488273, + "learning_rate": 3.120891634068944e-06, + "loss": 0.7343, + "step": 3396 + }, + { + "epoch": 1.6856149931736377, + "grad_norm": 0.13614435991184165, + "learning_rate": 3.1199448586892056e-06, + "loss": 0.7719, + "step": 3397 + }, + { + "epoch": 1.6861114558768773, + "grad_norm": 0.1361801580234787, + "learning_rate": 3.1189979885608358e-06, + "loss": 0.7374, + "step": 3398 + }, + { + "epoch": 1.6866079185801168, + "grad_norm": 0.13142547998982623, + "learning_rate": 3.118051023828548e-06, + "loss": 0.738, + "step": 3399 + }, + { + "epoch": 1.6871043812833562, + "grad_norm": 0.1357277571483877, + "learning_rate": 3.1171039646370714e-06, + "loss": 0.7191, + "step": 3400 + }, + { + "epoch": 1.6876008439865955, + "grad_norm": 0.12800485874343553, + "learning_rate": 3.1161568111311487e-06, + "loss": 0.7144, + "step": 3401 + }, + { + "epoch": 1.6880973066898348, + "grad_norm": 0.12514438054459623, + "learning_rate": 3.115209563455536e-06, + "loss": 0.6717, + "step": 3402 + }, + { + "epoch": 1.6885937693930744, + "grad_norm": 0.13334147860764942, + "learning_rate": 3.1142622217550054e-06, + "loss": 0.7519, + "step": 3403 + }, + { + "epoch": 1.689090232096314, + "grad_norm": 0.13393223913614705, + "learning_rate": 3.1133147861743435e-06, + "loss": 0.6909, + "step": 3404 + }, + { + "epoch": 1.6895866947995533, + "grad_norm": 0.1430945896299099, + "learning_rate": 3.112367256858351e-06, + "loss": 0.7288, + "step": 3405 + }, + { + "epoch": 1.6900831575027926, + "grad_norm": 0.13874258590546748, + "learning_rate": 3.111419633951841e-06, + "loss": 0.7006, + "step": 3406 + }, + { + "epoch": 1.690579620206032, + "grad_norm": 0.12943087282029672, + "learning_rate": 3.110471917599643e-06, + "loss": 0.6984, + "step": 3407 + }, + { + "epoch": 1.6910760829092715, + "grad_norm": 0.12968650358300085, + "learning_rate": 3.109524107946602e-06, + "loss": 0.6686, + "step": 3408 + }, + { + "epoch": 1.691572545612511, + "grad_norm": 0.1273962713358661, + "learning_rate": 3.1085762051375727e-06, + "loss": 0.6858, + "step": 3409 + }, + { + "epoch": 1.6920690083157504, + "grad_norm": 0.13601298863571853, + "learning_rate": 3.107628209317429e-06, + "loss": 0.7531, + "step": 3410 + }, + { + "epoch": 1.6925654710189897, + "grad_norm": 0.13179660551493383, + "learning_rate": 3.1066801206310558e-06, + "loss": 0.7169, + "step": 3411 + }, + { + "epoch": 1.693061933722229, + "grad_norm": 0.1262797410728671, + "learning_rate": 3.105731939223354e-06, + "loss": 0.6924, + "step": 3412 + }, + { + "epoch": 1.6935583964254686, + "grad_norm": 0.12881341815960923, + "learning_rate": 3.1047836652392367e-06, + "loss": 0.704, + "step": 3413 + }, + { + "epoch": 1.694054859128708, + "grad_norm": 0.13333244805830557, + "learning_rate": 3.103835298823633e-06, + "loss": 0.7076, + "step": 3414 + }, + { + "epoch": 1.6945513218319475, + "grad_norm": 0.13343352452172727, + "learning_rate": 3.1028868401214862e-06, + "loss": 0.7251, + "step": 3415 + }, + { + "epoch": 1.6950477845351868, + "grad_norm": 0.1310649697161129, + "learning_rate": 3.101938289277753e-06, + "loss": 0.7048, + "step": 3416 + }, + { + "epoch": 1.6955442472384261, + "grad_norm": 0.13399637642806747, + "learning_rate": 3.1009896464374027e-06, + "loss": 0.7047, + "step": 3417 + }, + { + "epoch": 1.6960407099416657, + "grad_norm": 0.14811556928768213, + "learning_rate": 3.100040911745421e-06, + "loss": 0.7407, + "step": 3418 + }, + { + "epoch": 1.696537172644905, + "grad_norm": 0.13730816941238383, + "learning_rate": 3.0990920853468076e-06, + "loss": 0.7355, + "step": 3419 + }, + { + "epoch": 1.6970336353481446, + "grad_norm": 0.13592416626522716, + "learning_rate": 3.098143167386574e-06, + "loss": 0.672, + "step": 3420 + }, + { + "epoch": 1.6975300980513839, + "grad_norm": 0.13560020957046776, + "learning_rate": 3.0971941580097476e-06, + "loss": 0.7311, + "step": 3421 + }, + { + "epoch": 1.6980265607546232, + "grad_norm": 0.12963432619444307, + "learning_rate": 3.0962450573613705e-06, + "loss": 0.7721, + "step": 3422 + }, + { + "epoch": 1.6985230234578628, + "grad_norm": 0.13067867842531036, + "learning_rate": 3.0952958655864957e-06, + "loss": 0.7049, + "step": 3423 + }, + { + "epoch": 1.699019486161102, + "grad_norm": 0.13236415877160046, + "learning_rate": 3.0943465828301935e-06, + "loss": 0.7503, + "step": 3424 + }, + { + "epoch": 1.6995159488643417, + "grad_norm": 0.1347895104957468, + "learning_rate": 3.0933972092375447e-06, + "loss": 0.7277, + "step": 3425 + }, + { + "epoch": 1.700012411567581, + "grad_norm": 0.13020223927968744, + "learning_rate": 3.0924477449536467e-06, + "loss": 0.6961, + "step": 3426 + }, + { + "epoch": 1.7005088742708203, + "grad_norm": 0.12869798181832637, + "learning_rate": 3.0914981901236113e-06, + "loss": 0.7275, + "step": 3427 + }, + { + "epoch": 1.7010053369740599, + "grad_norm": 0.1285881035433599, + "learning_rate": 3.0905485448925603e-06, + "loss": 0.6905, + "step": 3428 + }, + { + "epoch": 1.7015017996772992, + "grad_norm": 0.13590269430618224, + "learning_rate": 3.0895988094056333e-06, + "loss": 0.7392, + "step": 3429 + }, + { + "epoch": 1.7019982623805388, + "grad_norm": 0.1321536527714624, + "learning_rate": 3.088648983807982e-06, + "loss": 0.7341, + "step": 3430 + }, + { + "epoch": 1.702494725083778, + "grad_norm": 0.13823633479008848, + "learning_rate": 3.087699068244771e-06, + "loss": 0.7409, + "step": 3431 + }, + { + "epoch": 1.7029911877870174, + "grad_norm": 0.1342374430839832, + "learning_rate": 3.0867490628611817e-06, + "loss": 0.7252, + "step": 3432 + }, + { + "epoch": 1.703487650490257, + "grad_norm": 0.13031168815094007, + "learning_rate": 3.0857989678024057e-06, + "loss": 0.7556, + "step": 3433 + }, + { + "epoch": 1.7039841131934963, + "grad_norm": 0.132695870959402, + "learning_rate": 3.08484878321365e-06, + "loss": 0.7524, + "step": 3434 + }, + { + "epoch": 1.7044805758967358, + "grad_norm": 0.13160792019629425, + "learning_rate": 3.083898509240134e-06, + "loss": 0.6997, + "step": 3435 + }, + { + "epoch": 1.7049770385999752, + "grad_norm": 0.13528254474745396, + "learning_rate": 3.0829481460270937e-06, + "loss": 0.7547, + "step": 3436 + }, + { + "epoch": 1.7054735013032145, + "grad_norm": 0.12969198957971417, + "learning_rate": 3.0819976937197767e-06, + "loss": 0.6953, + "step": 3437 + }, + { + "epoch": 1.705969964006454, + "grad_norm": 0.13427548997210034, + "learning_rate": 3.0810471524634432e-06, + "loss": 0.7189, + "step": 3438 + }, + { + "epoch": 1.7064664267096934, + "grad_norm": 0.1380792625189435, + "learning_rate": 3.080096522403369e-06, + "loss": 0.757, + "step": 3439 + }, + { + "epoch": 1.706962889412933, + "grad_norm": 0.12798109709025035, + "learning_rate": 3.079145803684843e-06, + "loss": 0.731, + "step": 3440 + }, + { + "epoch": 1.7074593521161723, + "grad_norm": 0.13775152334069862, + "learning_rate": 3.078194996453166e-06, + "loss": 0.7227, + "step": 3441 + }, + { + "epoch": 1.7079558148194116, + "grad_norm": 0.15389555123946358, + "learning_rate": 3.0772441008536545e-06, + "loss": 0.7067, + "step": 3442 + }, + { + "epoch": 1.7084522775226512, + "grad_norm": 0.13139328016325508, + "learning_rate": 3.0762931170316384e-06, + "loss": 0.7064, + "step": 3443 + }, + { + "epoch": 1.7089487402258905, + "grad_norm": 0.14093931277261854, + "learning_rate": 3.075342045132459e-06, + "loss": 0.7493, + "step": 3444 + }, + { + "epoch": 1.70944520292913, + "grad_norm": 0.1339425299662374, + "learning_rate": 3.0743908853014726e-06, + "loss": 0.7333, + "step": 3445 + }, + { + "epoch": 1.7099416656323694, + "grad_norm": 0.13334111715937227, + "learning_rate": 3.073439637684049e-06, + "loss": 0.6911, + "step": 3446 + }, + { + "epoch": 1.7104381283356087, + "grad_norm": 0.13485035304461898, + "learning_rate": 3.0724883024255713e-06, + "loss": 0.7105, + "step": 3447 + }, + { + "epoch": 1.710934591038848, + "grad_norm": 0.13875744459588266, + "learning_rate": 3.071536879671436e-06, + "loss": 0.6832, + "step": 3448 + }, + { + "epoch": 1.7114310537420876, + "grad_norm": 0.12935014733112105, + "learning_rate": 3.0705853695670535e-06, + "loss": 0.6948, + "step": 3449 + }, + { + "epoch": 1.7119275164453271, + "grad_norm": 0.13892908729449366, + "learning_rate": 3.0696337722578444e-06, + "loss": 0.7187, + "step": 3450 + }, + { + "epoch": 1.7124239791485665, + "grad_norm": 0.1311918745816494, + "learning_rate": 3.0686820878892472e-06, + "loss": 0.698, + "step": 3451 + }, + { + "epoch": 1.7129204418518058, + "grad_norm": 0.13310007177839275, + "learning_rate": 3.067730316606711e-06, + "loss": 0.7454, + "step": 3452 + }, + { + "epoch": 1.7134169045550451, + "grad_norm": 0.1407962461572666, + "learning_rate": 3.066778458555698e-06, + "loss": 0.7591, + "step": 3453 + }, + { + "epoch": 1.7139133672582847, + "grad_norm": 0.13091646981705454, + "learning_rate": 3.065826513881686e-06, + "loss": 0.7167, + "step": 3454 + }, + { + "epoch": 1.7144098299615242, + "grad_norm": 0.13923461390544403, + "learning_rate": 3.0648744827301636e-06, + "loss": 0.7779, + "step": 3455 + }, + { + "epoch": 1.7149062926647636, + "grad_norm": 0.1344779145205169, + "learning_rate": 3.0639223652466337e-06, + "loss": 0.7465, + "step": 3456 + }, + { + "epoch": 1.715402755368003, + "grad_norm": 0.12825800482719335, + "learning_rate": 3.0629701615766116e-06, + "loss": 0.6914, + "step": 3457 + }, + { + "epoch": 1.7158992180712422, + "grad_norm": 0.13450044556444432, + "learning_rate": 3.0620178718656272e-06, + "loss": 0.7268, + "step": 3458 + }, + { + "epoch": 1.7163956807744818, + "grad_norm": 0.13374849327239205, + "learning_rate": 3.061065496259222e-06, + "loss": 0.6687, + "step": 3459 + }, + { + "epoch": 1.7168921434777213, + "grad_norm": 0.1350801681751466, + "learning_rate": 3.060113034902952e-06, + "loss": 0.6999, + "step": 3460 + }, + { + "epoch": 1.7173886061809607, + "grad_norm": 0.13712402938215143, + "learning_rate": 3.0591604879423858e-06, + "loss": 0.7099, + "step": 3461 + }, + { + "epoch": 1.7178850688842, + "grad_norm": 0.1305628583173919, + "learning_rate": 3.058207855523104e-06, + "loss": 0.6846, + "step": 3462 + }, + { + "epoch": 1.7183815315874393, + "grad_norm": 0.13419917510098936, + "learning_rate": 3.057255137790701e-06, + "loss": 0.7718, + "step": 3463 + }, + { + "epoch": 1.718877994290679, + "grad_norm": 0.13680810835604765, + "learning_rate": 3.056302334890786e-06, + "loss": 0.7589, + "step": 3464 + }, + { + "epoch": 1.7193744569939184, + "grad_norm": 0.1336240292540668, + "learning_rate": 3.0553494469689792e-06, + "loss": 0.6608, + "step": 3465 + }, + { + "epoch": 1.7198709196971578, + "grad_norm": 0.1396188299452781, + "learning_rate": 3.054396474170913e-06, + "loss": 0.749, + "step": 3466 + }, + { + "epoch": 1.720367382400397, + "grad_norm": 0.13274972442712532, + "learning_rate": 3.0534434166422346e-06, + "loss": 0.7062, + "step": 3467 + }, + { + "epoch": 1.7208638451036364, + "grad_norm": 0.1381067587560184, + "learning_rate": 3.052490274528604e-06, + "loss": 0.7976, + "step": 3468 + }, + { + "epoch": 1.721360307806876, + "grad_norm": 0.1415613666037084, + "learning_rate": 3.0515370479756936e-06, + "loss": 0.6759, + "step": 3469 + }, + { + "epoch": 1.7218567705101155, + "grad_norm": 0.136114676951742, + "learning_rate": 3.0505837371291885e-06, + "loss": 0.6921, + "step": 3470 + }, + { + "epoch": 1.7223532332133549, + "grad_norm": 0.1356442686679228, + "learning_rate": 3.0496303421347872e-06, + "loss": 0.6642, + "step": 3471 + }, + { + "epoch": 1.7228496959165942, + "grad_norm": 0.1376358359049435, + "learning_rate": 3.0486768631382012e-06, + "loss": 0.7658, + "step": 3472 + }, + { + "epoch": 1.7233461586198335, + "grad_norm": 0.1392338301585101, + "learning_rate": 3.047723300285153e-06, + "loss": 0.7241, + "step": 3473 + }, + { + "epoch": 1.723842621323073, + "grad_norm": 0.1366470624963273, + "learning_rate": 3.046769653721381e-06, + "loss": 0.6675, + "step": 3474 + }, + { + "epoch": 1.7243390840263126, + "grad_norm": 0.13307292368981993, + "learning_rate": 3.045815923592634e-06, + "loss": 0.6673, + "step": 3475 + }, + { + "epoch": 1.724835546729552, + "grad_norm": 0.1357966319706256, + "learning_rate": 3.0448621100446753e-06, + "loss": 0.7633, + "step": 3476 + }, + { + "epoch": 1.7253320094327913, + "grad_norm": 0.14547912410392153, + "learning_rate": 3.0439082132232785e-06, + "loss": 0.7257, + "step": 3477 + }, + { + "epoch": 1.7258284721360306, + "grad_norm": 0.13325165665610728, + "learning_rate": 3.0429542332742322e-06, + "loss": 0.7046, + "step": 3478 + }, + { + "epoch": 1.7263249348392702, + "grad_norm": 0.1455990009531011, + "learning_rate": 3.042000170343337e-06, + "loss": 0.7195, + "step": 3479 + }, + { + "epoch": 1.7268213975425097, + "grad_norm": 0.13234995456293744, + "learning_rate": 3.0410460245764066e-06, + "loss": 0.6786, + "step": 3480 + }, + { + "epoch": 1.727317860245749, + "grad_norm": 0.13558320267676813, + "learning_rate": 3.0400917961192667e-06, + "loss": 0.7187, + "step": 3481 + }, + { + "epoch": 1.7278143229489884, + "grad_norm": 0.1309367275550789, + "learning_rate": 3.0391374851177547e-06, + "loss": 0.724, + "step": 3482 + }, + { + "epoch": 1.7283107856522277, + "grad_norm": 0.12849728447833228, + "learning_rate": 3.0381830917177225e-06, + "loss": 0.6885, + "step": 3483 + }, + { + "epoch": 1.7288072483554673, + "grad_norm": 0.13668857277562277, + "learning_rate": 3.0372286160650334e-06, + "loss": 0.7577, + "step": 3484 + }, + { + "epoch": 1.7293037110587068, + "grad_norm": 0.13650280495720618, + "learning_rate": 3.036274058305565e-06, + "loss": 0.7419, + "step": 3485 + }, + { + "epoch": 1.7298001737619462, + "grad_norm": 0.1294860482203652, + "learning_rate": 3.0353194185852052e-06, + "loss": 0.7442, + "step": 3486 + }, + { + "epoch": 1.7302966364651855, + "grad_norm": 0.13841166664415844, + "learning_rate": 3.0343646970498554e-06, + "loss": 0.7568, + "step": 3487 + }, + { + "epoch": 1.7307930991684248, + "grad_norm": 0.1293180189453792, + "learning_rate": 3.0334098938454293e-06, + "loss": 0.7047, + "step": 3488 + }, + { + "epoch": 1.7312895618716644, + "grad_norm": 0.13258057280123206, + "learning_rate": 3.0324550091178536e-06, + "loss": 0.7315, + "step": 3489 + }, + { + "epoch": 1.731786024574904, + "grad_norm": 0.13503628372669735, + "learning_rate": 3.031500043013067e-06, + "loss": 0.7302, + "step": 3490 + }, + { + "epoch": 1.7322824872781433, + "grad_norm": 0.13290187674269438, + "learning_rate": 3.0305449956770206e-06, + "loss": 0.7343, + "step": 3491 + }, + { + "epoch": 1.7327789499813826, + "grad_norm": 0.13540856252868388, + "learning_rate": 3.0295898672556785e-06, + "loss": 0.7448, + "step": 3492 + }, + { + "epoch": 1.733275412684622, + "grad_norm": 0.13564514354413926, + "learning_rate": 3.0286346578950165e-06, + "loss": 0.703, + "step": 3493 + }, + { + "epoch": 1.7337718753878615, + "grad_norm": 0.1377806848578677, + "learning_rate": 3.0276793677410226e-06, + "loss": 0.7445, + "step": 3494 + }, + { + "epoch": 1.734268338091101, + "grad_norm": 0.1357491981149147, + "learning_rate": 3.0267239969396983e-06, + "loss": 0.7689, + "step": 3495 + }, + { + "epoch": 1.7347648007943404, + "grad_norm": 0.1278410172706573, + "learning_rate": 3.0257685456370573e-06, + "loss": 0.7004, + "step": 3496 + }, + { + "epoch": 1.7352612634975797, + "grad_norm": 0.1314164181234785, + "learning_rate": 3.024813013979123e-06, + "loss": 0.6591, + "step": 3497 + }, + { + "epoch": 1.735757726200819, + "grad_norm": 0.1455338423625045, + "learning_rate": 3.0238574021119356e-06, + "loss": 0.74, + "step": 3498 + }, + { + "epoch": 1.7362541889040586, + "grad_norm": 0.1334665037915578, + "learning_rate": 3.0229017101815424e-06, + "loss": 0.7069, + "step": 3499 + }, + { + "epoch": 1.7367506516072981, + "grad_norm": 0.13544177303837263, + "learning_rate": 3.0219459383340073e-06, + "loss": 0.7632, + "step": 3500 + }, + { + "epoch": 1.7372471143105375, + "grad_norm": 0.13969483295131455, + "learning_rate": 3.0209900867154042e-06, + "loss": 0.7542, + "step": 3501 + }, + { + "epoch": 1.7377435770137768, + "grad_norm": 0.13416029469080804, + "learning_rate": 3.02003415547182e-06, + "loss": 0.6914, + "step": 3502 + }, + { + "epoch": 1.7382400397170161, + "grad_norm": 0.1342512889947578, + "learning_rate": 3.0190781447493526e-06, + "loss": 0.7125, + "step": 3503 + }, + { + "epoch": 1.7387365024202557, + "grad_norm": 0.14318589298841014, + "learning_rate": 3.018122054694115e-06, + "loss": 0.7625, + "step": 3504 + }, + { + "epoch": 1.7392329651234952, + "grad_norm": 0.13889004826208431, + "learning_rate": 3.0171658854522274e-06, + "loss": 0.7114, + "step": 3505 + }, + { + "epoch": 1.7397294278267346, + "grad_norm": 0.13605943240627727, + "learning_rate": 3.016209637169827e-06, + "loss": 0.7414, + "step": 3506 + }, + { + "epoch": 1.740225890529974, + "grad_norm": 0.13235701067519334, + "learning_rate": 3.01525330999306e-06, + "loss": 0.7208, + "step": 3507 + }, + { + "epoch": 1.7407223532332132, + "grad_norm": 0.135326876994932, + "learning_rate": 3.0142969040680865e-06, + "loss": 0.7344, + "step": 3508 + }, + { + "epoch": 1.7412188159364528, + "grad_norm": 0.13477736579977354, + "learning_rate": 3.0133404195410764e-06, + "loss": 0.7326, + "step": 3509 + }, + { + "epoch": 1.7417152786396923, + "grad_norm": 0.13284231516066178, + "learning_rate": 3.0123838565582147e-06, + "loss": 0.7119, + "step": 3510 + }, + { + "epoch": 1.7422117413429317, + "grad_norm": 0.13238135868096979, + "learning_rate": 3.011427215265696e-06, + "loss": 0.7018, + "step": 3511 + }, + { + "epoch": 1.742708204046171, + "grad_norm": 0.13474513462013873, + "learning_rate": 3.0104704958097275e-06, + "loss": 0.7405, + "step": 3512 + }, + { + "epoch": 1.7432046667494103, + "grad_norm": 0.1302603109994265, + "learning_rate": 3.009513698336529e-06, + "loss": 0.7194, + "step": 3513 + }, + { + "epoch": 1.7437011294526499, + "grad_norm": 0.13946924173551373, + "learning_rate": 3.00855682299233e-06, + "loss": 0.7454, + "step": 3514 + }, + { + "epoch": 1.7441975921558894, + "grad_norm": 0.14022806898060922, + "learning_rate": 3.0075998699233757e-06, + "loss": 0.7258, + "step": 3515 + }, + { + "epoch": 1.7446940548591288, + "grad_norm": 0.14276302517939646, + "learning_rate": 3.006642839275919e-06, + "loss": 0.7353, + "step": 3516 + }, + { + "epoch": 1.745190517562368, + "grad_norm": 0.13440824467755944, + "learning_rate": 3.0056857311962286e-06, + "loss": 0.6975, + "step": 3517 + }, + { + "epoch": 1.7456869802656074, + "grad_norm": 0.13069918155058813, + "learning_rate": 3.0047285458305818e-06, + "loss": 0.7307, + "step": 3518 + }, + { + "epoch": 1.746183442968847, + "grad_norm": 0.13013866848398856, + "learning_rate": 3.0037712833252696e-06, + "loss": 0.7075, + "step": 3519 + }, + { + "epoch": 1.7466799056720865, + "grad_norm": 0.13588085761716664, + "learning_rate": 3.0028139438265946e-06, + "loss": 0.7028, + "step": 3520 + }, + { + "epoch": 1.7471763683753259, + "grad_norm": 0.1379722426469276, + "learning_rate": 3.00185652748087e-06, + "loss": 0.7558, + "step": 3521 + }, + { + "epoch": 1.7476728310785652, + "grad_norm": 0.13440744873614507, + "learning_rate": 3.000899034434421e-06, + "loss": 0.6893, + "step": 3522 + }, + { + "epoch": 1.7481692937818045, + "grad_norm": 0.13573972148872435, + "learning_rate": 2.9999414648335866e-06, + "loss": 0.7284, + "step": 3523 + }, + { + "epoch": 1.748665756485044, + "grad_norm": 0.12921771068329435, + "learning_rate": 2.9989838188247157e-06, + "loss": 0.7153, + "step": 3524 + }, + { + "epoch": 1.7491622191882836, + "grad_norm": 0.128812429516214, + "learning_rate": 2.998026096554168e-06, + "loss": 0.7018, + "step": 3525 + }, + { + "epoch": 1.749658681891523, + "grad_norm": 0.13351254077010827, + "learning_rate": 2.9970682981683165e-06, + "loss": 0.7398, + "step": 3526 + }, + { + "epoch": 1.7501551445947623, + "grad_norm": 0.13254352974014932, + "learning_rate": 2.9961104238135457e-06, + "loss": 0.7104, + "step": 3527 + }, + { + "epoch": 1.7506516072980016, + "grad_norm": 0.13040410881238124, + "learning_rate": 2.9951524736362513e-06, + "loss": 0.6791, + "step": 3528 + }, + { + "epoch": 1.7506516072980016, + "eval_loss": 0.732168436050415, + "eval_runtime": 135.7964, + "eval_samples_per_second": 223.519, + "eval_steps_per_second": 27.946, + "step": 3528 + }, + { + "epoch": 1.7511480700012412, + "grad_norm": 0.1454158991301188, + "learning_rate": 2.9941944477828405e-06, + "loss": 0.729, + "step": 3529 + }, + { + "epoch": 1.7516445327044807, + "grad_norm": 0.14264344607881654, + "learning_rate": 2.9932363463997325e-06, + "loss": 0.761, + "step": 3530 + }, + { + "epoch": 1.75214099540772, + "grad_norm": 0.13191391058050855, + "learning_rate": 2.992278169633357e-06, + "loss": 0.6719, + "step": 3531 + }, + { + "epoch": 1.7526374581109594, + "grad_norm": 0.13456024392134855, + "learning_rate": 2.9913199176301567e-06, + "loss": 0.7191, + "step": 3532 + }, + { + "epoch": 1.7531339208141987, + "grad_norm": 0.12996595723949278, + "learning_rate": 2.990361590536584e-06, + "loss": 0.6824, + "step": 3533 + }, + { + "epoch": 1.7536303835174383, + "grad_norm": 0.1328036765145108, + "learning_rate": 2.9894031884991047e-06, + "loss": 0.7057, + "step": 3534 + }, + { + "epoch": 1.7541268462206778, + "grad_norm": 0.13831131399589613, + "learning_rate": 2.9884447116641955e-06, + "loss": 0.697, + "step": 3535 + }, + { + "epoch": 1.7546233089239172, + "grad_norm": 0.1421695480209381, + "learning_rate": 2.987486160178344e-06, + "loss": 0.7423, + "step": 3536 + }, + { + "epoch": 1.7551197716271565, + "grad_norm": 0.12821462370317024, + "learning_rate": 2.9865275341880484e-06, + "loss": 0.7165, + "step": 3537 + }, + { + "epoch": 1.7556162343303958, + "grad_norm": 0.1420145829606162, + "learning_rate": 2.9855688338398202e-06, + "loss": 0.6948, + "step": 3538 + }, + { + "epoch": 1.7561126970336354, + "grad_norm": 0.13455255604118072, + "learning_rate": 2.9846100592801815e-06, + "loss": 0.7232, + "step": 3539 + }, + { + "epoch": 1.756609159736875, + "grad_norm": 0.13103602177339743, + "learning_rate": 2.9836512106556655e-06, + "loss": 0.7248, + "step": 3540 + }, + { + "epoch": 1.7571056224401143, + "grad_norm": 0.1384857291962389, + "learning_rate": 2.982692288112816e-06, + "loss": 0.7303, + "step": 3541 + }, + { + "epoch": 1.7576020851433536, + "grad_norm": 0.16056289613315078, + "learning_rate": 2.98173329179819e-06, + "loss": 0.7049, + "step": 3542 + }, + { + "epoch": 1.758098547846593, + "grad_norm": 0.1344941324110072, + "learning_rate": 2.9807742218583547e-06, + "loss": 0.691, + "step": 3543 + }, + { + "epoch": 1.7585950105498325, + "grad_norm": 0.1354748157954558, + "learning_rate": 2.9798150784398885e-06, + "loss": 0.7215, + "step": 3544 + }, + { + "epoch": 1.759091473253072, + "grad_norm": 0.14122504918334078, + "learning_rate": 2.9788558616893796e-06, + "loss": 0.7167, + "step": 3545 + }, + { + "epoch": 1.7595879359563114, + "grad_norm": 0.14019815087178897, + "learning_rate": 2.9778965717534314e-06, + "loss": 0.7125, + "step": 3546 + }, + { + "epoch": 1.7600843986595507, + "grad_norm": 0.13688281809385325, + "learning_rate": 2.976937208778654e-06, + "loss": 0.715, + "step": 3547 + }, + { + "epoch": 1.76058086136279, + "grad_norm": 0.14612296075479175, + "learning_rate": 2.975977772911671e-06, + "loss": 0.7693, + "step": 3548 + }, + { + "epoch": 1.7610773240660296, + "grad_norm": 0.13373675241491112, + "learning_rate": 2.9750182642991175e-06, + "loss": 0.7149, + "step": 3549 + }, + { + "epoch": 1.7615737867692691, + "grad_norm": 0.13249339511074687, + "learning_rate": 2.974058683087639e-06, + "loss": 0.7176, + "step": 3550 + }, + { + "epoch": 1.7620702494725085, + "grad_norm": 0.1445155372590999, + "learning_rate": 2.9730990294238914e-06, + "loss": 0.7565, + "step": 3551 + }, + { + "epoch": 1.7625667121757478, + "grad_norm": 0.1301812819229513, + "learning_rate": 2.972139303454543e-06, + "loss": 0.7337, + "step": 3552 + }, + { + "epoch": 1.7630631748789871, + "grad_norm": 0.13813116954578447, + "learning_rate": 2.971179505326272e-06, + "loss": 0.7344, + "step": 3553 + }, + { + "epoch": 1.7635596375822267, + "grad_norm": 0.13748336922899862, + "learning_rate": 2.970219635185769e-06, + "loss": 0.781, + "step": 3554 + }, + { + "epoch": 1.764056100285466, + "grad_norm": 0.13434117881943183, + "learning_rate": 2.969259693179733e-06, + "loss": 0.7186, + "step": 3555 + }, + { + "epoch": 1.7645525629887056, + "grad_norm": 0.13228896105397034, + "learning_rate": 2.968299679454878e-06, + "loss": 0.7308, + "step": 3556 + }, + { + "epoch": 1.7650490256919449, + "grad_norm": 0.1279445994151255, + "learning_rate": 2.967339594157925e-06, + "loss": 0.7249, + "step": 3557 + }, + { + "epoch": 1.7655454883951842, + "grad_norm": 0.13875252416949538, + "learning_rate": 2.9663794374356082e-06, + "loss": 0.7315, + "step": 3558 + }, + { + "epoch": 1.7660419510984238, + "grad_norm": 0.13828288612620065, + "learning_rate": 2.9654192094346716e-06, + "loss": 0.6718, + "step": 3559 + }, + { + "epoch": 1.766538413801663, + "grad_norm": 0.13503162566112942, + "learning_rate": 2.9644589103018728e-06, + "loss": 0.7089, + "step": 3560 + }, + { + "epoch": 1.7670348765049027, + "grad_norm": 0.12961214402076365, + "learning_rate": 2.9634985401839754e-06, + "loss": 0.7265, + "step": 3561 + }, + { + "epoch": 1.767531339208142, + "grad_norm": 0.1376599301401424, + "learning_rate": 2.9625380992277585e-06, + "loss": 0.7361, + "step": 3562 + }, + { + "epoch": 1.7680278019113813, + "grad_norm": 0.1315003737365444, + "learning_rate": 2.961577587580009e-06, + "loss": 0.7222, + "step": 3563 + }, + { + "epoch": 1.7685242646146209, + "grad_norm": 0.13997334248150592, + "learning_rate": 2.960617005387526e-06, + "loss": 0.7678, + "step": 3564 + }, + { + "epoch": 1.7690207273178602, + "grad_norm": 0.1377189418395456, + "learning_rate": 2.9596563527971197e-06, + "loss": 0.7488, + "step": 3565 + }, + { + "epoch": 1.7695171900210998, + "grad_norm": 0.13949649187035196, + "learning_rate": 2.9586956299556093e-06, + "loss": 0.7356, + "step": 3566 + }, + { + "epoch": 1.770013652724339, + "grad_norm": 0.13072149028094537, + "learning_rate": 2.957734837009827e-06, + "loss": 0.7439, + "step": 3567 + }, + { + "epoch": 1.7705101154275784, + "grad_norm": 0.13314492915960163, + "learning_rate": 2.9567739741066135e-06, + "loss": 0.7131, + "step": 3568 + }, + { + "epoch": 1.771006578130818, + "grad_norm": 0.138259104473977, + "learning_rate": 2.955813041392822e-06, + "loss": 0.7083, + "step": 3569 + }, + { + "epoch": 1.7715030408340573, + "grad_norm": 0.13460619412681396, + "learning_rate": 2.9548520390153157e-06, + "loss": 0.7247, + "step": 3570 + }, + { + "epoch": 1.7719995035372968, + "grad_norm": 0.13188160151808176, + "learning_rate": 2.9538909671209683e-06, + "loss": 0.7327, + "step": 3571 + }, + { + "epoch": 1.7724959662405362, + "grad_norm": 0.13915943835734876, + "learning_rate": 2.952929825856664e-06, + "loss": 0.7245, + "step": 3572 + }, + { + "epoch": 1.7729924289437755, + "grad_norm": 0.13113442999553518, + "learning_rate": 2.9519686153692984e-06, + "loss": 0.6766, + "step": 3573 + }, + { + "epoch": 1.773488891647015, + "grad_norm": 0.15563993017589076, + "learning_rate": 2.9510073358057763e-06, + "loss": 0.778, + "step": 3574 + }, + { + "epoch": 1.7739853543502544, + "grad_norm": 0.13605674921389552, + "learning_rate": 2.950045987313014e-06, + "loss": 0.7889, + "step": 3575 + }, + { + "epoch": 1.774481817053494, + "grad_norm": 0.13255856624671072, + "learning_rate": 2.949084570037939e-06, + "loss": 0.7268, + "step": 3576 + }, + { + "epoch": 1.7749782797567333, + "grad_norm": 0.1425167455509219, + "learning_rate": 2.948123084127488e-06, + "loss": 0.779, + "step": 3577 + }, + { + "epoch": 1.7754747424599726, + "grad_norm": 0.13366479508025797, + "learning_rate": 2.947161529728609e-06, + "loss": 0.6817, + "step": 3578 + }, + { + "epoch": 1.7759712051632122, + "grad_norm": 0.13634858914868714, + "learning_rate": 2.946199906988259e-06, + "loss": 0.7171, + "step": 3579 + }, + { + "epoch": 1.7764676678664515, + "grad_norm": 0.1339820519944257, + "learning_rate": 2.9452382160534075e-06, + "loss": 0.6727, + "step": 3580 + }, + { + "epoch": 1.776964130569691, + "grad_norm": 0.13103750913100626, + "learning_rate": 2.9442764570710343e-06, + "loss": 0.694, + "step": 3581 + }, + { + "epoch": 1.7774605932729304, + "grad_norm": 0.1313014217590884, + "learning_rate": 2.943314630188127e-06, + "loss": 0.7572, + "step": 3582 + }, + { + "epoch": 1.7779570559761697, + "grad_norm": 0.13018315808842246, + "learning_rate": 2.942352735551688e-06, + "loss": 0.7372, + "step": 3583 + }, + { + "epoch": 1.7784535186794093, + "grad_norm": 0.1317368490396673, + "learning_rate": 2.9413907733087255e-06, + "loss": 0.7442, + "step": 3584 + }, + { + "epoch": 1.7789499813826486, + "grad_norm": 0.14221916855971456, + "learning_rate": 2.9404287436062596e-06, + "loss": 0.7101, + "step": 3585 + }, + { + "epoch": 1.7794464440858881, + "grad_norm": 0.13279652857342705, + "learning_rate": 2.9394666465913225e-06, + "loss": 0.7166, + "step": 3586 + }, + { + "epoch": 1.7799429067891275, + "grad_norm": 0.13104555294939943, + "learning_rate": 2.9385044824109544e-06, + "loss": 0.6961, + "step": 3587 + }, + { + "epoch": 1.7804393694923668, + "grad_norm": 0.13924421190691488, + "learning_rate": 2.937542251212207e-06, + "loss": 0.7798, + "step": 3588 + }, + { + "epoch": 1.7809358321956061, + "grad_norm": 0.13257146284425395, + "learning_rate": 2.936579953142143e-06, + "loss": 0.7222, + "step": 3589 + }, + { + "epoch": 1.7814322948988457, + "grad_norm": 0.13510842196780357, + "learning_rate": 2.935617588347832e-06, + "loss": 0.6802, + "step": 3590 + }, + { + "epoch": 1.7819287576020852, + "grad_norm": 0.1370255691430651, + "learning_rate": 2.9346551569763584e-06, + "loss": 0.7058, + "step": 3591 + }, + { + "epoch": 1.7824252203053246, + "grad_norm": 0.13103573445504976, + "learning_rate": 2.9336926591748116e-06, + "loss": 0.7076, + "step": 3592 + }, + { + "epoch": 1.782921683008564, + "grad_norm": 0.1332868798807041, + "learning_rate": 2.932730095090297e-06, + "loss": 0.7525, + "step": 3593 + }, + { + "epoch": 1.7834181457118032, + "grad_norm": 0.13239816333094517, + "learning_rate": 2.931767464869926e-06, + "loss": 0.7118, + "step": 3594 + }, + { + "epoch": 1.7839146084150428, + "grad_norm": 0.13806112422697028, + "learning_rate": 2.93080476866082e-06, + "loss": 0.6911, + "step": 3595 + }, + { + "epoch": 1.7844110711182823, + "grad_norm": 0.13143745347123123, + "learning_rate": 2.9298420066101123e-06, + "loss": 0.7114, + "step": 3596 + }, + { + "epoch": 1.7849075338215217, + "grad_norm": 0.1303545454261341, + "learning_rate": 2.928879178864946e-06, + "loss": 0.7026, + "step": 3597 + }, + { + "epoch": 1.785403996524761, + "grad_norm": 0.13284060964480524, + "learning_rate": 2.9279162855724736e-06, + "loss": 0.7719, + "step": 3598 + }, + { + "epoch": 1.7859004592280003, + "grad_norm": 0.13126700923262716, + "learning_rate": 2.926953326879859e-06, + "loss": 0.745, + "step": 3599 + }, + { + "epoch": 1.78639692193124, + "grad_norm": 0.13808740540622605, + "learning_rate": 2.9259903029342733e-06, + "loss": 0.7492, + "step": 3600 + }, + { + "epoch": 1.7868933846344794, + "grad_norm": 0.13223911487658072, + "learning_rate": 2.9250272138829004e-06, + "loss": 0.7115, + "step": 3601 + }, + { + "epoch": 1.7873898473377188, + "grad_norm": 0.13756497219754027, + "learning_rate": 2.9240640598729325e-06, + "loss": 0.6888, + "step": 3602 + }, + { + "epoch": 1.787886310040958, + "grad_norm": 0.12838540835989878, + "learning_rate": 2.923100841051572e-06, + "loss": 0.695, + "step": 3603 + }, + { + "epoch": 1.7883827727441974, + "grad_norm": 0.1345217155143832, + "learning_rate": 2.922137557566032e-06, + "loss": 0.7352, + "step": 3604 + }, + { + "epoch": 1.788879235447437, + "grad_norm": 0.13011116824419314, + "learning_rate": 2.9211742095635353e-06, + "loss": 0.715, + "step": 3605 + }, + { + "epoch": 1.7893756981506765, + "grad_norm": 0.13345770601068715, + "learning_rate": 2.9202107971913135e-06, + "loss": 0.6526, + "step": 3606 + }, + { + "epoch": 1.7898721608539159, + "grad_norm": 0.13058633473314155, + "learning_rate": 2.9192473205966086e-06, + "loss": 0.7298, + "step": 3607 + }, + { + "epoch": 1.7903686235571552, + "grad_norm": 0.13576890050935692, + "learning_rate": 2.918283779926673e-06, + "loss": 0.7379, + "step": 3608 + }, + { + "epoch": 1.7908650862603945, + "grad_norm": 0.1357470033215107, + "learning_rate": 2.917320175328769e-06, + "loss": 0.7684, + "step": 3609 + }, + { + "epoch": 1.791361548963634, + "grad_norm": 0.12842283154879058, + "learning_rate": 2.9163565069501665e-06, + "loss": 0.7074, + "step": 3610 + }, + { + "epoch": 1.7918580116668736, + "grad_norm": 0.1315525180161788, + "learning_rate": 2.915392774938148e-06, + "loss": 0.7179, + "step": 3611 + }, + { + "epoch": 1.792354474370113, + "grad_norm": 0.13320229301947734, + "learning_rate": 2.914428979440004e-06, + "loss": 0.7488, + "step": 3612 + }, + { + "epoch": 1.7928509370733523, + "grad_norm": 0.17742426766768907, + "learning_rate": 2.9134651206030356e-06, + "loss": 0.6971, + "step": 3613 + }, + { + "epoch": 1.7933473997765916, + "grad_norm": 0.13687894210660118, + "learning_rate": 2.9125011985745526e-06, + "loss": 0.7579, + "step": 3614 + }, + { + "epoch": 1.7938438624798312, + "grad_norm": 0.14356923426142504, + "learning_rate": 2.911537213501876e-06, + "loss": 0.8023, + "step": 3615 + }, + { + "epoch": 1.7943403251830707, + "grad_norm": 0.13255693124637588, + "learning_rate": 2.9105731655323345e-06, + "loss": 0.7161, + "step": 3616 + }, + { + "epoch": 1.79483678788631, + "grad_norm": 0.1340919080282845, + "learning_rate": 2.9096090548132678e-06, + "loss": 0.698, + "step": 3617 + }, + { + "epoch": 1.7953332505895494, + "grad_norm": 0.13702138442017256, + "learning_rate": 2.908644881492024e-06, + "loss": 0.7523, + "step": 3618 + }, + { + "epoch": 1.7958297132927887, + "grad_norm": 0.1353149372278838, + "learning_rate": 2.9076806457159628e-06, + "loss": 0.7048, + "step": 3619 + }, + { + "epoch": 1.7963261759960283, + "grad_norm": 0.13799204504298163, + "learning_rate": 2.9067163476324513e-06, + "loss": 0.7481, + "step": 3620 + }, + { + "epoch": 1.7968226386992678, + "grad_norm": 0.12790377926586102, + "learning_rate": 2.905751987388868e-06, + "loss": 0.6927, + "step": 3621 + }, + { + "epoch": 1.7973191014025072, + "grad_norm": 0.13965087986871189, + "learning_rate": 2.904787565132598e-06, + "loss": 0.7361, + "step": 3622 + }, + { + "epoch": 1.7978155641057465, + "grad_norm": 0.1336271147899048, + "learning_rate": 2.903823081011039e-06, + "loss": 0.7112, + "step": 3623 + }, + { + "epoch": 1.7983120268089858, + "grad_norm": 0.13734805652624563, + "learning_rate": 2.9028585351715977e-06, + "loss": 0.7405, + "step": 3624 + }, + { + "epoch": 1.7988084895122254, + "grad_norm": 0.13407436084230553, + "learning_rate": 2.9018939277616887e-06, + "loss": 0.7503, + "step": 3625 + }, + { + "epoch": 1.799304952215465, + "grad_norm": 0.13291972705380428, + "learning_rate": 2.9009292589287357e-06, + "loss": 0.7089, + "step": 3626 + }, + { + "epoch": 1.7998014149187043, + "grad_norm": 0.13331187297256578, + "learning_rate": 2.899964528820175e-06, + "loss": 0.7445, + "step": 3627 + }, + { + "epoch": 1.8002978776219436, + "grad_norm": 0.13026121253634557, + "learning_rate": 2.8989997375834485e-06, + "loss": 0.732, + "step": 3628 + }, + { + "epoch": 1.800794340325183, + "grad_norm": 0.12960005121948162, + "learning_rate": 2.8980348853660096e-06, + "loss": 0.7023, + "step": 3629 + }, + { + "epoch": 1.8012908030284225, + "grad_norm": 0.13361582398506516, + "learning_rate": 2.89706997231532e-06, + "loss": 0.7045, + "step": 3630 + }, + { + "epoch": 1.801787265731662, + "grad_norm": 0.13469473169085272, + "learning_rate": 2.8961049985788524e-06, + "loss": 0.7225, + "step": 3631 + }, + { + "epoch": 1.8022837284349014, + "grad_norm": 0.13319255321122753, + "learning_rate": 2.8951399643040867e-06, + "loss": 0.7368, + "step": 3632 + }, + { + "epoch": 1.8027801911381407, + "grad_norm": 0.12936117726098809, + "learning_rate": 2.894174869638513e-06, + "loss": 0.7553, + "step": 3633 + }, + { + "epoch": 1.80327665384138, + "grad_norm": 0.12979563347690065, + "learning_rate": 2.8932097147296308e-06, + "loss": 0.7155, + "step": 3634 + }, + { + "epoch": 1.8037731165446196, + "grad_norm": 0.1309079661022396, + "learning_rate": 2.8922444997249477e-06, + "loss": 0.7244, + "step": 3635 + }, + { + "epoch": 1.8042695792478591, + "grad_norm": 0.1427263829925681, + "learning_rate": 2.891279224771982e-06, + "loss": 0.7333, + "step": 3636 + }, + { + "epoch": 1.8047660419510985, + "grad_norm": 0.1407202293176257, + "learning_rate": 2.8903138900182615e-06, + "loss": 0.7456, + "step": 3637 + }, + { + "epoch": 1.8052625046543378, + "grad_norm": 0.16815294681624532, + "learning_rate": 2.88934849561132e-06, + "loss": 0.6916, + "step": 3638 + }, + { + "epoch": 1.8057589673575771, + "grad_norm": 0.1424105389357212, + "learning_rate": 2.8883830416987043e-06, + "loss": 0.7531, + "step": 3639 + }, + { + "epoch": 1.8062554300608167, + "grad_norm": 0.13301046552150622, + "learning_rate": 2.887417528427967e-06, + "loss": 0.7107, + "step": 3640 + }, + { + "epoch": 1.8067518927640562, + "grad_norm": 0.13697780218782096, + "learning_rate": 2.8864519559466738e-06, + "loss": 0.7519, + "step": 3641 + }, + { + "epoch": 1.8072483554672956, + "grad_norm": 0.13703156295639382, + "learning_rate": 2.8854863244023945e-06, + "loss": 0.7769, + "step": 3642 + }, + { + "epoch": 1.807744818170535, + "grad_norm": 0.1322398802236882, + "learning_rate": 2.884520633942712e-06, + "loss": 0.6999, + "step": 3643 + }, + { + "epoch": 1.8082412808737742, + "grad_norm": 0.13492661813080623, + "learning_rate": 2.8835548847152143e-06, + "loss": 0.7147, + "step": 3644 + }, + { + "epoch": 1.8087377435770138, + "grad_norm": 0.13231443100813342, + "learning_rate": 2.8825890768675035e-06, + "loss": 0.6931, + "step": 3645 + }, + { + "epoch": 1.8092342062802533, + "grad_norm": 0.13342027348327531, + "learning_rate": 2.8816232105471864e-06, + "loss": 0.7234, + "step": 3646 + }, + { + "epoch": 1.8097306689834927, + "grad_norm": 0.13502995129426054, + "learning_rate": 2.8806572859018806e-06, + "loss": 0.7344, + "step": 3647 + }, + { + "epoch": 1.810227131686732, + "grad_norm": 0.13216879878124957, + "learning_rate": 2.8796913030792116e-06, + "loss": 0.7127, + "step": 3648 + }, + { + "epoch": 1.8107235943899713, + "grad_norm": 0.1389960465198068, + "learning_rate": 2.878725262226816e-06, + "loss": 0.7331, + "step": 3649 + }, + { + "epoch": 1.8112200570932109, + "grad_norm": 0.133130506783979, + "learning_rate": 2.8777591634923353e-06, + "loss": 0.7085, + "step": 3650 + }, + { + "epoch": 1.8117165197964504, + "grad_norm": 0.12953910081868406, + "learning_rate": 2.8767930070234233e-06, + "loss": 0.7059, + "step": 3651 + }, + { + "epoch": 1.8122129824996898, + "grad_norm": 0.13304246175165207, + "learning_rate": 2.8758267929677418e-06, + "loss": 0.7506, + "step": 3652 + }, + { + "epoch": 1.812709445202929, + "grad_norm": 0.13171896309203177, + "learning_rate": 2.874860521472962e-06, + "loss": 0.704, + "step": 3653 + }, + { + "epoch": 1.8132059079061684, + "grad_norm": 0.12867716679784078, + "learning_rate": 2.873894192686761e-06, + "loss": 0.7, + "step": 3654 + }, + { + "epoch": 1.813702370609408, + "grad_norm": 0.13335540704857535, + "learning_rate": 2.8729278067568272e-06, + "loss": 0.7251, + "step": 3655 + }, + { + "epoch": 1.8141988333126475, + "grad_norm": 0.1335222309537191, + "learning_rate": 2.871961363830858e-06, + "loss": 0.753, + "step": 3656 + }, + { + "epoch": 1.8146952960158869, + "grad_norm": 0.13606044352762497, + "learning_rate": 2.8709948640565582e-06, + "loss": 0.7602, + "step": 3657 + }, + { + "epoch": 1.8151917587191262, + "grad_norm": 0.13248054985778066, + "learning_rate": 2.870028307581642e-06, + "loss": 0.6717, + "step": 3658 + }, + { + "epoch": 1.8156882214223655, + "grad_norm": 0.12524368516267864, + "learning_rate": 2.8690616945538324e-06, + "loss": 0.6575, + "step": 3659 + }, + { + "epoch": 1.816184684125605, + "grad_norm": 0.12893746506812964, + "learning_rate": 2.8680950251208595e-06, + "loss": 0.6987, + "step": 3660 + }, + { + "epoch": 1.8166811468288446, + "grad_norm": 0.13668691367646468, + "learning_rate": 2.8671282994304637e-06, + "loss": 0.6763, + "step": 3661 + }, + { + "epoch": 1.817177609532084, + "grad_norm": 0.13337887117478803, + "learning_rate": 2.8661615176303944e-06, + "loss": 0.721, + "step": 3662 + }, + { + "epoch": 1.8176740722353233, + "grad_norm": 0.137232017944419, + "learning_rate": 2.865194679868408e-06, + "loss": 0.7203, + "step": 3663 + }, + { + "epoch": 1.8181705349385626, + "grad_norm": 0.1283303529499583, + "learning_rate": 2.8642277862922702e-06, + "loss": 0.7107, + "step": 3664 + }, + { + "epoch": 1.8186669976418022, + "grad_norm": 0.14113427375297083, + "learning_rate": 2.8632608370497555e-06, + "loss": 0.7671, + "step": 3665 + }, + { + "epoch": 1.8191634603450417, + "grad_norm": 0.1337368727892662, + "learning_rate": 2.862293832288646e-06, + "loss": 0.6931, + "step": 3666 + }, + { + "epoch": 1.819659923048281, + "grad_norm": 0.1305593466225775, + "learning_rate": 2.8613267721567334e-06, + "loss": 0.7555, + "step": 3667 + }, + { + "epoch": 1.8201563857515204, + "grad_norm": 0.13071527307271635, + "learning_rate": 2.8603596568018166e-06, + "loss": 0.7269, + "step": 3668 + }, + { + "epoch": 1.8206528484547597, + "grad_norm": 0.13020862347403772, + "learning_rate": 2.859392486371705e-06, + "loss": 0.7363, + "step": 3669 + }, + { + "epoch": 1.8211493111579993, + "grad_norm": 0.13695112485546784, + "learning_rate": 2.8584252610142133e-06, + "loss": 0.763, + "step": 3670 + }, + { + "epoch": 1.8216457738612388, + "grad_norm": 0.13490142562797158, + "learning_rate": 2.8574579808771676e-06, + "loss": 0.6992, + "step": 3671 + }, + { + "epoch": 1.8221422365644782, + "grad_norm": 0.12665466211192675, + "learning_rate": 2.8564906461084014e-06, + "loss": 0.6498, + "step": 3672 + }, + { + "epoch": 1.8226386992677175, + "grad_norm": 0.13516985312128005, + "learning_rate": 2.855523256855756e-06, + "loss": 0.7067, + "step": 3673 + }, + { + "epoch": 1.8231351619709568, + "grad_norm": 0.1288573931245718, + "learning_rate": 2.8545558132670804e-06, + "loss": 0.6909, + "step": 3674 + }, + { + "epoch": 1.8236316246741964, + "grad_norm": 0.12713861994971612, + "learning_rate": 2.8535883154902345e-06, + "loss": 0.6958, + "step": 3675 + }, + { + "epoch": 1.824128087377436, + "grad_norm": 0.13381135557162596, + "learning_rate": 2.852620763673083e-06, + "loss": 0.7299, + "step": 3676 + }, + { + "epoch": 1.8246245500806753, + "grad_norm": 0.1319129422115045, + "learning_rate": 2.851653157963502e-06, + "loss": 0.6831, + "step": 3677 + }, + { + "epoch": 1.8251210127839146, + "grad_norm": 0.1394166318448852, + "learning_rate": 2.850685498509374e-06, + "loss": 0.7536, + "step": 3678 + }, + { + "epoch": 1.825617475487154, + "grad_norm": 0.13103025516203937, + "learning_rate": 2.84971778545859e-06, + "loss": 0.6629, + "step": 3679 + }, + { + "epoch": 1.8261139381903935, + "grad_norm": 0.1309333120946672, + "learning_rate": 2.8487500189590513e-06, + "loss": 0.7068, + "step": 3680 + }, + { + "epoch": 1.826610400893633, + "grad_norm": 0.12983925391000833, + "learning_rate": 2.847782199158663e-06, + "loss": 0.7237, + "step": 3681 + }, + { + "epoch": 1.8271068635968724, + "grad_norm": 0.13732026044139703, + "learning_rate": 2.8468143262053416e-06, + "loss": 0.7156, + "step": 3682 + }, + { + "epoch": 1.8276033263001117, + "grad_norm": 0.1364540970178744, + "learning_rate": 2.8458464002470114e-06, + "loss": 0.7403, + "step": 3683 + }, + { + "epoch": 1.828099789003351, + "grad_norm": 0.13692850761642453, + "learning_rate": 2.844878421431604e-06, + "loss": 0.6859, + "step": 3684 + }, + { + "epoch": 1.8285962517065906, + "grad_norm": 0.13088408167168533, + "learning_rate": 2.8439103899070595e-06, + "loss": 0.6676, + "step": 3685 + }, + { + "epoch": 1.8290927144098301, + "grad_norm": 0.12989265875764777, + "learning_rate": 2.8429423058213267e-06, + "loss": 0.706, + "step": 3686 + }, + { + "epoch": 1.8295891771130695, + "grad_norm": 0.1291389182253454, + "learning_rate": 2.8419741693223607e-06, + "loss": 0.7156, + "step": 3687 + }, + { + "epoch": 1.8300856398163088, + "grad_norm": 0.13918286355290396, + "learning_rate": 2.8410059805581258e-06, + "loss": 0.7665, + "step": 3688 + }, + { + "epoch": 1.8305821025195481, + "grad_norm": 0.13286704143055944, + "learning_rate": 2.840037739676595e-06, + "loss": 0.6953, + "step": 3689 + }, + { + "epoch": 1.8310785652227877, + "grad_norm": 0.13464853923111927, + "learning_rate": 2.8390694468257474e-06, + "loss": 0.7558, + "step": 3690 + }, + { + "epoch": 1.8315750279260272, + "grad_norm": 0.1338921743633815, + "learning_rate": 2.838101102153572e-06, + "loss": 0.7216, + "step": 3691 + }, + { + "epoch": 1.8320714906292666, + "grad_norm": 0.1332544592794539, + "learning_rate": 2.8371327058080634e-06, + "loss": 0.7237, + "step": 3692 + }, + { + "epoch": 1.8325679533325059, + "grad_norm": 0.13859871403436683, + "learning_rate": 2.836164257937226e-06, + "loss": 0.6647, + "step": 3693 + }, + { + "epoch": 1.8330644160357452, + "grad_norm": 0.13773496966704152, + "learning_rate": 2.8351957586890724e-06, + "loss": 0.7558, + "step": 3694 + }, + { + "epoch": 1.8335608787389848, + "grad_norm": 0.1363040721397413, + "learning_rate": 2.8342272082116214e-06, + "loss": 0.7176, + "step": 3695 + }, + { + "epoch": 1.834057341442224, + "grad_norm": 0.13238261911259094, + "learning_rate": 2.833258606652901e-06, + "loss": 0.7428, + "step": 3696 + }, + { + "epoch": 1.8345538041454637, + "grad_norm": 0.1289742090498221, + "learning_rate": 2.8322899541609457e-06, + "loss": 0.7474, + "step": 3697 + }, + { + "epoch": 1.835050266848703, + "grad_norm": 0.13229223790941982, + "learning_rate": 2.8313212508837985e-06, + "loss": 0.6888, + "step": 3698 + }, + { + "epoch": 1.8355467295519423, + "grad_norm": 0.1405977314818644, + "learning_rate": 2.83035249696951e-06, + "loss": 0.7118, + "step": 3699 + }, + { + "epoch": 1.8360431922551819, + "grad_norm": 0.1322368134123888, + "learning_rate": 2.82938369256614e-06, + "loss": 0.7254, + "step": 3700 + }, + { + "epoch": 1.8365396549584212, + "grad_norm": 0.13239550569763764, + "learning_rate": 2.828414837821753e-06, + "loss": 0.7031, + "step": 3701 + }, + { + "epoch": 1.8370361176616608, + "grad_norm": 0.12443941139663224, + "learning_rate": 2.827445932884425e-06, + "loss": 0.6938, + "step": 3702 + }, + { + "epoch": 1.8375325803649, + "grad_norm": 0.13533098431151683, + "learning_rate": 2.8264769779022355e-06, + "loss": 0.715, + "step": 3703 + }, + { + "epoch": 1.8380290430681394, + "grad_norm": 0.13336648970273562, + "learning_rate": 2.8255079730232742e-06, + "loss": 0.7551, + "step": 3704 + }, + { + "epoch": 1.838525505771379, + "grad_norm": 0.12584694627355517, + "learning_rate": 2.8245389183956395e-06, + "loss": 0.686, + "step": 3705 + }, + { + "epoch": 1.8390219684746183, + "grad_norm": 0.1345026841457402, + "learning_rate": 2.8235698141674338e-06, + "loss": 0.7361, + "step": 3706 + }, + { + "epoch": 1.8395184311778578, + "grad_norm": 0.1303560374635088, + "learning_rate": 2.8226006604867705e-06, + "loss": 0.7158, + "step": 3707 + }, + { + "epoch": 1.8400148938810972, + "grad_norm": 0.13250398945920666, + "learning_rate": 2.821631457501769e-06, + "loss": 0.7667, + "step": 3708 + }, + { + "epoch": 1.8405113565843365, + "grad_norm": 0.13580387586544665, + "learning_rate": 2.820662205360555e-06, + "loss": 0.7681, + "step": 3709 + }, + { + "epoch": 1.841007819287576, + "grad_norm": 0.13301684404493566, + "learning_rate": 2.8196929042112652e-06, + "loss": 0.7285, + "step": 3710 + }, + { + "epoch": 1.8415042819908154, + "grad_norm": 0.13473692403172938, + "learning_rate": 2.818723554202041e-06, + "loss": 0.721, + "step": 3711 + }, + { + "epoch": 1.842000744694055, + "grad_norm": 0.12587991568709622, + "learning_rate": 2.817754155481032e-06, + "loss": 0.7165, + "step": 3712 + }, + { + "epoch": 1.8424972073972943, + "grad_norm": 0.13035348271632663, + "learning_rate": 2.816784708196395e-06, + "loss": 0.7185, + "step": 3713 + }, + { + "epoch": 1.8429936701005336, + "grad_norm": 0.15215075104364917, + "learning_rate": 2.815815212496294e-06, + "loss": 0.763, + "step": 3714 + }, + { + "epoch": 1.8434901328037732, + "grad_norm": 0.1373937835941121, + "learning_rate": 2.8148456685289016e-06, + "loss": 0.7423, + "step": 3715 + }, + { + "epoch": 1.8439865955070125, + "grad_norm": 0.12962189019847445, + "learning_rate": 2.813876076442397e-06, + "loss": 0.7229, + "step": 3716 + }, + { + "epoch": 1.844483058210252, + "grad_norm": 0.12465566308505892, + "learning_rate": 2.8129064363849674e-06, + "loss": 0.664, + "step": 3717 + }, + { + "epoch": 1.8449795209134914, + "grad_norm": 0.1320992278623956, + "learning_rate": 2.811936748504806e-06, + "loss": 0.7538, + "step": 3718 + }, + { + "epoch": 1.8454759836167307, + "grad_norm": 0.1332828335116305, + "learning_rate": 2.810967012950113e-06, + "loss": 0.7239, + "step": 3719 + }, + { + "epoch": 1.8459724463199703, + "grad_norm": 0.12797528135715117, + "learning_rate": 2.809997229869099e-06, + "loss": 0.7122, + "step": 3720 + }, + { + "epoch": 1.8464689090232096, + "grad_norm": 0.1301117350778891, + "learning_rate": 2.8090273994099793e-06, + "loss": 0.72, + "step": 3721 + }, + { + "epoch": 1.8469653717264491, + "grad_norm": 0.1300300122398293, + "learning_rate": 2.8080575217209756e-06, + "loss": 0.6907, + "step": 3722 + }, + { + "epoch": 1.8474618344296885, + "grad_norm": 0.1331300565652563, + "learning_rate": 2.8070875969503194e-06, + "loss": 0.7219, + "step": 3723 + }, + { + "epoch": 1.8479582971329278, + "grad_norm": 0.13587506563623994, + "learning_rate": 2.8061176252462473e-06, + "loss": 0.7145, + "step": 3724 + }, + { + "epoch": 1.8484547598361674, + "grad_norm": 0.12978747829512013, + "learning_rate": 2.805147606757005e-06, + "loss": 0.7188, + "step": 3725 + }, + { + "epoch": 1.8489512225394067, + "grad_norm": 0.12938008759831762, + "learning_rate": 2.804177541630843e-06, + "loss": 0.69, + "step": 3726 + }, + { + "epoch": 1.8494476852426462, + "grad_norm": 0.13496840351576017, + "learning_rate": 2.803207430016021e-06, + "loss": 0.7572, + "step": 3727 + }, + { + "epoch": 1.8499441479458856, + "grad_norm": 0.13064015668880996, + "learning_rate": 2.802237272060806e-06, + "loss": 0.7055, + "step": 3728 + }, + { + "epoch": 1.850440610649125, + "grad_norm": 0.1310236785495075, + "learning_rate": 2.8012670679134694e-06, + "loss": 0.6944, + "step": 3729 + }, + { + "epoch": 1.8509370733523642, + "grad_norm": 0.13058022532167907, + "learning_rate": 2.8002968177222916e-06, + "loss": 0.7276, + "step": 3730 + }, + { + "epoch": 1.8514335360556038, + "grad_norm": 0.1399741355065743, + "learning_rate": 2.7993265216355597e-06, + "loss": 0.7019, + "step": 3731 + }, + { + "epoch": 1.8519299987588433, + "grad_norm": 0.1309873590126447, + "learning_rate": 2.798356179801569e-06, + "loss": 0.7323, + "step": 3732 + }, + { + "epoch": 1.8524264614620827, + "grad_norm": 0.12823473606854824, + "learning_rate": 2.7973857923686192e-06, + "loss": 0.6766, + "step": 3733 + }, + { + "epoch": 1.852922924165322, + "grad_norm": 0.12599315475512168, + "learning_rate": 2.7964153594850207e-06, + "loss": 0.6611, + "step": 3734 + }, + { + "epoch": 1.8534193868685613, + "grad_norm": 0.13218616368397323, + "learning_rate": 2.7954448812990857e-06, + "loss": 0.7666, + "step": 3735 + }, + { + "epoch": 1.853915849571801, + "grad_norm": 0.13258167830517983, + "learning_rate": 2.7944743579591383e-06, + "loss": 0.7688, + "step": 3736 + }, + { + "epoch": 1.8544123122750404, + "grad_norm": 0.13636167071644384, + "learning_rate": 2.793503789613507e-06, + "loss": 0.7893, + "step": 3737 + }, + { + "epoch": 1.8549087749782798, + "grad_norm": 0.1353506593944718, + "learning_rate": 2.7925331764105272e-06, + "loss": 0.718, + "step": 3738 + }, + { + "epoch": 1.855405237681519, + "grad_norm": 0.13351328738260573, + "learning_rate": 2.791562518498542e-06, + "loss": 0.72, + "step": 3739 + }, + { + "epoch": 1.8559017003847584, + "grad_norm": 0.12916911002443757, + "learning_rate": 2.7905918160259005e-06, + "loss": 0.6834, + "step": 3740 + }, + { + "epoch": 1.856398163087998, + "grad_norm": 0.13034694869758517, + "learning_rate": 2.789621069140959e-06, + "loss": 0.7152, + "step": 3741 + }, + { + "epoch": 1.8568946257912375, + "grad_norm": 0.1358558324112095, + "learning_rate": 2.788650277992081e-06, + "loss": 0.7477, + "step": 3742 + }, + { + "epoch": 1.8573910884944769, + "grad_norm": 0.13072120609339014, + "learning_rate": 2.7876794427276362e-06, + "loss": 0.7296, + "step": 3743 + }, + { + "epoch": 1.8578875511977162, + "grad_norm": 0.12977265231141502, + "learning_rate": 2.786708563496002e-06, + "loss": 0.689, + "step": 3744 + }, + { + "epoch": 1.8583840139009555, + "grad_norm": 0.14176411747850756, + "learning_rate": 2.78573764044556e-06, + "loss": 0.7371, + "step": 3745 + }, + { + "epoch": 1.858880476604195, + "grad_norm": 0.13585066242740843, + "learning_rate": 2.7847666737247008e-06, + "loss": 0.7051, + "step": 3746 + }, + { + "epoch": 1.8593769393074346, + "grad_norm": 0.12837037510490845, + "learning_rate": 2.783795663481822e-06, + "loss": 0.7081, + "step": 3747 + }, + { + "epoch": 1.859873402010674, + "grad_norm": 0.13389138484666943, + "learning_rate": 2.7828246098653255e-06, + "loss": 0.7607, + "step": 3748 + }, + { + "epoch": 1.8603698647139133, + "grad_norm": 0.13565169731031765, + "learning_rate": 2.781853513023623e-06, + "loss": 0.7119, + "step": 3749 + }, + { + "epoch": 1.8608663274171526, + "grad_norm": 0.13181731856838821, + "learning_rate": 2.7808823731051306e-06, + "loss": 0.7253, + "step": 3750 + }, + { + "epoch": 1.8613627901203922, + "grad_norm": 0.1341896017701526, + "learning_rate": 2.7799111902582697e-06, + "loss": 0.6954, + "step": 3751 + }, + { + "epoch": 1.8618592528236317, + "grad_norm": 0.13618587231085355, + "learning_rate": 2.7789399646314723e-06, + "loss": 0.7345, + "step": 3752 + }, + { + "epoch": 1.862355715526871, + "grad_norm": 0.1369640920886463, + "learning_rate": 2.7779686963731738e-06, + "loss": 0.7393, + "step": 3753 + }, + { + "epoch": 1.8628521782301104, + "grad_norm": 0.13390573696400448, + "learning_rate": 2.7769973856318167e-06, + "loss": 0.725, + "step": 3754 + }, + { + "epoch": 1.8633486409333497, + "grad_norm": 0.1336592460566132, + "learning_rate": 2.7760260325558507e-06, + "loss": 0.6977, + "step": 3755 + }, + { + "epoch": 1.8638451036365893, + "grad_norm": 0.13604007586743042, + "learning_rate": 2.7750546372937315e-06, + "loss": 0.7183, + "step": 3756 + }, + { + "epoch": 1.8643415663398288, + "grad_norm": 0.1357385191072151, + "learning_rate": 2.774083199993921e-06, + "loss": 0.7517, + "step": 3757 + }, + { + "epoch": 1.8648380290430682, + "grad_norm": 0.1301840034604384, + "learning_rate": 2.7731117208048875e-06, + "loss": 0.6671, + "step": 3758 + }, + { + "epoch": 1.8653344917463075, + "grad_norm": 0.13571137517897333, + "learning_rate": 2.772140199875107e-06, + "loss": 0.7193, + "step": 3759 + }, + { + "epoch": 1.8658309544495468, + "grad_norm": 0.13744700754172479, + "learning_rate": 2.77116863735306e-06, + "loss": 0.7301, + "step": 3760 + }, + { + "epoch": 1.8663274171527864, + "grad_norm": 0.1338974951951887, + "learning_rate": 2.7701970333872354e-06, + "loss": 0.6994, + "step": 3761 + }, + { + "epoch": 1.866823879856026, + "grad_norm": 0.12864049194839253, + "learning_rate": 2.769225388126126e-06, + "loss": 0.693, + "step": 3762 + }, + { + "epoch": 1.8673203425592653, + "grad_norm": 0.13155044309844505, + "learning_rate": 2.7682537017182326e-06, + "loss": 0.721, + "step": 3763 + }, + { + "epoch": 1.8678168052625046, + "grad_norm": 0.1347463327038866, + "learning_rate": 2.767281974312062e-06, + "loss": 0.712, + "step": 3764 + }, + { + "epoch": 1.868313267965744, + "grad_norm": 0.12971495960250434, + "learning_rate": 2.7663102060561274e-06, + "loss": 0.7143, + "step": 3765 + }, + { + "epoch": 1.8688097306689835, + "grad_norm": 0.13510904366559584, + "learning_rate": 2.7653383970989477e-06, + "loss": 0.7057, + "step": 3766 + }, + { + "epoch": 1.869306193372223, + "grad_norm": 0.13401673104563622, + "learning_rate": 2.7643665475890484e-06, + "loss": 0.6864, + "step": 3767 + }, + { + "epoch": 1.8698026560754624, + "grad_norm": 0.13210648198821992, + "learning_rate": 2.763394657674961e-06, + "loss": 0.7038, + "step": 3768 + }, + { + "epoch": 1.8702991187787017, + "grad_norm": 0.1318377078194905, + "learning_rate": 2.762422727505224e-06, + "loss": 0.7596, + "step": 3769 + }, + { + "epoch": 1.870795581481941, + "grad_norm": 0.13025033485200868, + "learning_rate": 2.76145075722838e-06, + "loss": 0.7144, + "step": 3770 + }, + { + "epoch": 1.8712920441851806, + "grad_norm": 0.14491613825665298, + "learning_rate": 2.76047874699298e-06, + "loss": 0.7712, + "step": 3771 + }, + { + "epoch": 1.8717885068884201, + "grad_norm": 0.13563190298000607, + "learning_rate": 2.75950669694758e-06, + "loss": 0.7374, + "step": 3772 + }, + { + "epoch": 1.8722849695916595, + "grad_norm": 0.1320789962394144, + "learning_rate": 2.7585346072407422e-06, + "loss": 0.7426, + "step": 3773 + }, + { + "epoch": 1.8727814322948988, + "grad_norm": 0.13057853212026468, + "learning_rate": 2.757562478021035e-06, + "loss": 0.7262, + "step": 3774 + }, + { + "epoch": 1.8732778949981381, + "grad_norm": 0.13102963708589455, + "learning_rate": 2.756590309437033e-06, + "loss": 0.7011, + "step": 3775 + }, + { + "epoch": 1.8737743577013777, + "grad_norm": 0.13375423324978467, + "learning_rate": 2.755618101637315e-06, + "loss": 0.718, + "step": 3776 + }, + { + "epoch": 1.8742708204046172, + "grad_norm": 0.12830373485814348, + "learning_rate": 2.754645854770471e-06, + "loss": 0.7265, + "step": 3777 + }, + { + "epoch": 1.8747672831078566, + "grad_norm": 0.13336641643595246, + "learning_rate": 2.75367356898509e-06, + "loss": 0.6742, + "step": 3778 + }, + { + "epoch": 1.875263745811096, + "grad_norm": 0.13157972272739404, + "learning_rate": 2.7527012444297707e-06, + "loss": 0.7321, + "step": 3779 + }, + { + "epoch": 1.8757602085143352, + "grad_norm": 0.14208051021228643, + "learning_rate": 2.751728881253118e-06, + "loss": 0.7672, + "step": 3780 + }, + { + "epoch": 1.8762566712175748, + "grad_norm": 0.1371097076190842, + "learning_rate": 2.7507564796037424e-06, + "loss": 0.7512, + "step": 3781 + }, + { + "epoch": 1.8767531339208143, + "grad_norm": 0.13188702665160387, + "learning_rate": 2.7497840396302596e-06, + "loss": 0.7401, + "step": 3782 + }, + { + "epoch": 1.8772495966240537, + "grad_norm": 0.13500026817249772, + "learning_rate": 2.748811561481291e-06, + "loss": 0.7295, + "step": 3783 + }, + { + "epoch": 1.877746059327293, + "grad_norm": 0.13273560547309662, + "learning_rate": 2.7478390453054645e-06, + "loss": 0.7283, + "step": 3784 + }, + { + "epoch": 1.8782425220305323, + "grad_norm": 0.13872675617089372, + "learning_rate": 2.746866491251414e-06, + "loss": 0.7602, + "step": 3785 + }, + { + "epoch": 1.8787389847337719, + "grad_norm": 0.12924106818518485, + "learning_rate": 2.7458938994677784e-06, + "loss": 0.7546, + "step": 3786 + }, + { + "epoch": 1.8792354474370114, + "grad_norm": 0.136941377040484, + "learning_rate": 2.744921270103203e-06, + "loss": 0.7579, + "step": 3787 + }, + { + "epoch": 1.8797319101402508, + "grad_norm": 0.1298041641175948, + "learning_rate": 2.743948603306339e-06, + "loss": 0.7051, + "step": 3788 + }, + { + "epoch": 1.88022837284349, + "grad_norm": 0.13832860111903247, + "learning_rate": 2.7429758992258416e-06, + "loss": 0.7243, + "step": 3789 + }, + { + "epoch": 1.8807248355467294, + "grad_norm": 0.13402102344704708, + "learning_rate": 2.7420031580103736e-06, + "loss": 0.7225, + "step": 3790 + }, + { + "epoch": 1.881221298249969, + "grad_norm": 0.13987575442514208, + "learning_rate": 2.7410303798086034e-06, + "loss": 0.7264, + "step": 3791 + }, + { + "epoch": 1.8817177609532085, + "grad_norm": 0.13290550293386685, + "learning_rate": 2.7400575647692046e-06, + "loss": 0.7677, + "step": 3792 + }, + { + "epoch": 1.8822142236564479, + "grad_norm": 0.13083520400866375, + "learning_rate": 2.739084713040856e-06, + "loss": 0.7193, + "step": 3793 + }, + { + "epoch": 1.8827106863596872, + "grad_norm": 0.1282525167355114, + "learning_rate": 2.7381118247722427e-06, + "loss": 0.6802, + "step": 3794 + }, + { + "epoch": 1.8832071490629265, + "grad_norm": 0.13627486143137021, + "learning_rate": 2.7371389001120545e-06, + "loss": 0.722, + "step": 3795 + }, + { + "epoch": 1.883703611766166, + "grad_norm": 0.1277896117884158, + "learning_rate": 2.736165939208987e-06, + "loss": 0.6816, + "step": 3796 + }, + { + "epoch": 1.8842000744694056, + "grad_norm": 0.12853176516695636, + "learning_rate": 2.735192942211743e-06, + "loss": 0.7553, + "step": 3797 + }, + { + "epoch": 1.884696537172645, + "grad_norm": 0.13015691759857087, + "learning_rate": 2.7342199092690284e-06, + "loss": 0.7142, + "step": 3798 + }, + { + "epoch": 1.8851929998758843, + "grad_norm": 0.13097419516180478, + "learning_rate": 2.733246840529557e-06, + "loss": 0.7132, + "step": 3799 + }, + { + "epoch": 1.8856894625791236, + "grad_norm": 0.12724020311608158, + "learning_rate": 2.7322737361420454e-06, + "loss": 0.6748, + "step": 3800 + }, + { + "epoch": 1.8861859252823632, + "grad_norm": 0.1270428675400565, + "learning_rate": 2.7313005962552174e-06, + "loss": 0.7363, + "step": 3801 + }, + { + "epoch": 1.8866823879856027, + "grad_norm": 0.13592361098777284, + "learning_rate": 2.7303274210178023e-06, + "loss": 0.7309, + "step": 3802 + }, + { + "epoch": 1.887178850688842, + "grad_norm": 0.13079941165628042, + "learning_rate": 2.729354210578533e-06, + "loss": 0.7352, + "step": 3803 + }, + { + "epoch": 1.8876753133920814, + "grad_norm": 0.1340097748828976, + "learning_rate": 2.7283809650861508e-06, + "loss": 0.7285, + "step": 3804 + }, + { + "epoch": 1.8881717760953207, + "grad_norm": 0.13773420298973918, + "learning_rate": 2.727407684689399e-06, + "loss": 0.7649, + "step": 3805 + }, + { + "epoch": 1.8886682387985603, + "grad_norm": 0.133353759836882, + "learning_rate": 2.7264343695370294e-06, + "loss": 0.7464, + "step": 3806 + }, + { + "epoch": 1.8891647015017998, + "grad_norm": 0.131033603827758, + "learning_rate": 2.725461019777797e-06, + "loss": 0.6845, + "step": 3807 + }, + { + "epoch": 1.8896611642050392, + "grad_norm": 0.15770720974803792, + "learning_rate": 2.7244876355604627e-06, + "loss": 0.7314, + "step": 3808 + }, + { + "epoch": 1.8901576269082785, + "grad_norm": 0.1304863297812739, + "learning_rate": 2.723514217033793e-06, + "loss": 0.7471, + "step": 3809 + }, + { + "epoch": 1.8906540896115178, + "grad_norm": 0.13132658095809105, + "learning_rate": 2.722540764346559e-06, + "loss": 0.7382, + "step": 3810 + }, + { + "epoch": 1.8911505523147574, + "grad_norm": 0.13948668806141548, + "learning_rate": 2.7215672776475373e-06, + "loss": 0.7345, + "step": 3811 + }, + { + "epoch": 1.891647015017997, + "grad_norm": 0.14071216008468773, + "learning_rate": 2.720593757085509e-06, + "loss": 0.7849, + "step": 3812 + }, + { + "epoch": 1.8921434777212363, + "grad_norm": 0.1328645102350295, + "learning_rate": 2.719620202809262e-06, + "loss": 0.7345, + "step": 3813 + }, + { + "epoch": 1.8926399404244756, + "grad_norm": 0.1323492318218001, + "learning_rate": 2.718646614967589e-06, + "loss": 0.7121, + "step": 3814 + }, + { + "epoch": 1.893136403127715, + "grad_norm": 0.13311000745662738, + "learning_rate": 2.7176729937092868e-06, + "loss": 0.6856, + "step": 3815 + }, + { + "epoch": 1.8936328658309545, + "grad_norm": 0.14096801189996547, + "learning_rate": 2.716699339183157e-06, + "loss": 0.6663, + "step": 3816 + }, + { + "epoch": 1.894129328534194, + "grad_norm": 0.13592293501407796, + "learning_rate": 2.7157256515380075e-06, + "loss": 0.7389, + "step": 3817 + }, + { + "epoch": 1.8946257912374334, + "grad_norm": 0.13440467247038096, + "learning_rate": 2.7147519309226524e-06, + "loss": 0.7136, + "step": 3818 + }, + { + "epoch": 1.8951222539406727, + "grad_norm": 0.13477075791277746, + "learning_rate": 2.713778177485906e-06, + "loss": 0.7187, + "step": 3819 + }, + { + "epoch": 1.895618716643912, + "grad_norm": 0.1273414392999846, + "learning_rate": 2.712804391376594e-06, + "loss": 0.742, + "step": 3820 + }, + { + "epoch": 1.8961151793471516, + "grad_norm": 0.13460725669842077, + "learning_rate": 2.7118305727435433e-06, + "loss": 0.7034, + "step": 3821 + }, + { + "epoch": 1.8966116420503911, + "grad_norm": 0.13175344309138737, + "learning_rate": 2.710856721735585e-06, + "loss": 0.6716, + "step": 3822 + }, + { + "epoch": 1.8971081047536305, + "grad_norm": 0.12573529615472806, + "learning_rate": 2.709882838501558e-06, + "loss": 0.6647, + "step": 3823 + }, + { + "epoch": 1.8976045674568698, + "grad_norm": 0.1464775291640466, + "learning_rate": 2.7089089231903045e-06, + "loss": 0.7118, + "step": 3824 + }, + { + "epoch": 1.8981010301601091, + "grad_norm": 0.14724481437959286, + "learning_rate": 2.707934975950672e-06, + "loss": 0.7267, + "step": 3825 + }, + { + "epoch": 1.8985974928633487, + "grad_norm": 0.12701179330406961, + "learning_rate": 2.706960996931512e-06, + "loss": 0.7076, + "step": 3826 + }, + { + "epoch": 1.8990939555665882, + "grad_norm": 0.13539179556079792, + "learning_rate": 2.7059869862816817e-06, + "loss": 0.7247, + "step": 3827 + }, + { + "epoch": 1.8995904182698276, + "grad_norm": 0.13682853094889003, + "learning_rate": 2.7050129441500437e-06, + "loss": 0.7146, + "step": 3828 + }, + { + "epoch": 1.9000868809730669, + "grad_norm": 0.13008454352233842, + "learning_rate": 2.7040388706854636e-06, + "loss": 0.7115, + "step": 3829 + }, + { + "epoch": 1.9005833436763062, + "grad_norm": 0.13324370142235686, + "learning_rate": 2.703064766036814e-06, + "loss": 0.6941, + "step": 3830 + }, + { + "epoch": 1.9010798063795458, + "grad_norm": 0.1379864973843644, + "learning_rate": 2.7020906303529722e-06, + "loss": 0.7258, + "step": 3831 + }, + { + "epoch": 1.9015762690827853, + "grad_norm": 0.13430922945541224, + "learning_rate": 2.701116463782816e-06, + "loss": 0.717, + "step": 3832 + }, + { + "epoch": 1.9020727317860247, + "grad_norm": 0.13535178628287625, + "learning_rate": 2.7001422664752338e-06, + "loss": 0.7187, + "step": 3833 + }, + { + "epoch": 1.902569194489264, + "grad_norm": 0.13401050040469997, + "learning_rate": 2.6991680385791154e-06, + "loss": 0.6976, + "step": 3834 + }, + { + "epoch": 1.9030656571925033, + "grad_norm": 0.13110828507944752, + "learning_rate": 2.698193780243355e-06, + "loss": 0.7308, + "step": 3835 + }, + { + "epoch": 1.9035621198957429, + "grad_norm": 0.1330174198944991, + "learning_rate": 2.6972194916168533e-06, + "loss": 0.68, + "step": 3836 + }, + { + "epoch": 1.9040585825989822, + "grad_norm": 0.129123889581125, + "learning_rate": 2.696245172848515e-06, + "loss": 0.6816, + "step": 3837 + }, + { + "epoch": 1.9045550453022217, + "grad_norm": 0.1323170000848133, + "learning_rate": 2.6952708240872477e-06, + "loss": 0.74, + "step": 3838 + }, + { + "epoch": 1.905051508005461, + "grad_norm": 0.1281877570650443, + "learning_rate": 2.6942964454819663e-06, + "loss": 0.6716, + "step": 3839 + }, + { + "epoch": 1.9055479707087004, + "grad_norm": 0.1273111925333337, + "learning_rate": 2.693322037181588e-06, + "loss": 0.7161, + "step": 3840 + }, + { + "epoch": 1.90604443341194, + "grad_norm": 0.13411942656602813, + "learning_rate": 2.692347599335037e-06, + "loss": 0.7475, + "step": 3841 + }, + { + "epoch": 1.9065408961151793, + "grad_norm": 0.13465714368570397, + "learning_rate": 2.69137313209124e-06, + "loss": 0.6818, + "step": 3842 + }, + { + "epoch": 1.9070373588184188, + "grad_norm": 0.13693810595779668, + "learning_rate": 2.6903986355991267e-06, + "loss": 0.7217, + "step": 3843 + }, + { + "epoch": 1.9075338215216582, + "grad_norm": 0.13227383220776676, + "learning_rate": 2.6894241100076356e-06, + "loss": 0.7046, + "step": 3844 + }, + { + "epoch": 1.9080302842248975, + "grad_norm": 0.13024783516033986, + "learning_rate": 2.6884495554657057e-06, + "loss": 0.7341, + "step": 3845 + }, + { + "epoch": 1.908526746928137, + "grad_norm": 0.13571948071256487, + "learning_rate": 2.687474972122283e-06, + "loss": 0.7179, + "step": 3846 + }, + { + "epoch": 1.9090232096313764, + "grad_norm": 0.14165402752990086, + "learning_rate": 2.6865003601263177e-06, + "loss": 0.7488, + "step": 3847 + }, + { + "epoch": 1.909519672334616, + "grad_norm": 0.13217107325979766, + "learning_rate": 2.685525719626762e-06, + "loss": 0.7316, + "step": 3848 + }, + { + "epoch": 1.9100161350378553, + "grad_norm": 0.12981859327701908, + "learning_rate": 2.6845510507725747e-06, + "loss": 0.6838, + "step": 3849 + }, + { + "epoch": 1.9105125977410946, + "grad_norm": 0.13278332867245013, + "learning_rate": 2.6835763537127186e-06, + "loss": 0.698, + "step": 3850 + }, + { + "epoch": 1.9110090604443342, + "grad_norm": 0.12972846902968782, + "learning_rate": 2.68260162859616e-06, + "loss": 0.6937, + "step": 3851 + }, + { + "epoch": 1.9115055231475735, + "grad_norm": 0.13499670331712021, + "learning_rate": 2.68162687557187e-06, + "loss": 0.7544, + "step": 3852 + }, + { + "epoch": 1.912001985850813, + "grad_norm": 0.13281988396086394, + "learning_rate": 2.680652094788825e-06, + "loss": 0.7012, + "step": 3853 + }, + { + "epoch": 1.9124984485540524, + "grad_norm": 0.13640084140042943, + "learning_rate": 2.679677286396003e-06, + "loss": 0.7506, + "step": 3854 + }, + { + "epoch": 1.9129949112572917, + "grad_norm": 0.1385667059889805, + "learning_rate": 2.678702450542389e-06, + "loss": 0.7315, + "step": 3855 + }, + { + "epoch": 1.9134913739605313, + "grad_norm": 0.1340058683822914, + "learning_rate": 2.6777275873769703e-06, + "loss": 0.7702, + "step": 3856 + }, + { + "epoch": 1.9139878366637706, + "grad_norm": 0.13229770495593288, + "learning_rate": 2.67675269704874e-06, + "loss": 0.688, + "step": 3857 + }, + { + "epoch": 1.9144842993670101, + "grad_norm": 0.13565711456891463, + "learning_rate": 2.6757777797066947e-06, + "loss": 0.7322, + "step": 3858 + }, + { + "epoch": 1.9149807620702495, + "grad_norm": 0.12998843981576316, + "learning_rate": 2.6748028354998333e-06, + "loss": 0.7108, + "step": 3859 + }, + { + "epoch": 1.9154772247734888, + "grad_norm": 0.13608274463655146, + "learning_rate": 2.6738278645771615e-06, + "loss": 0.7161, + "step": 3860 + }, + { + "epoch": 1.9159736874767284, + "grad_norm": 0.13224537124708974, + "learning_rate": 2.6728528670876875e-06, + "loss": 0.6993, + "step": 3861 + }, + { + "epoch": 1.9164701501799677, + "grad_norm": 0.13440646467554213, + "learning_rate": 2.6718778431804243e-06, + "loss": 0.7041, + "step": 3862 + }, + { + "epoch": 1.9169666128832072, + "grad_norm": 0.12745811657851805, + "learning_rate": 2.670902793004389e-06, + "loss": 0.6779, + "step": 3863 + }, + { + "epoch": 1.9174630755864466, + "grad_norm": 0.13050712516771976, + "learning_rate": 2.6699277167086013e-06, + "loss": 0.7166, + "step": 3864 + }, + { + "epoch": 1.917959538289686, + "grad_norm": 0.12713091659389295, + "learning_rate": 2.668952614442087e-06, + "loss": 0.7017, + "step": 3865 + }, + { + "epoch": 1.9184560009929255, + "grad_norm": 0.135043325139347, + "learning_rate": 2.6679774863538747e-06, + "loss": 0.7295, + "step": 3866 + }, + { + "epoch": 1.9189524636961648, + "grad_norm": 0.13202671430433874, + "learning_rate": 2.667002332592997e-06, + "loss": 0.7315, + "step": 3867 + }, + { + "epoch": 1.9194489263994043, + "grad_norm": 0.13505500052720468, + "learning_rate": 2.6660271533084895e-06, + "loss": 0.7183, + "step": 3868 + }, + { + "epoch": 1.9199453891026437, + "grad_norm": 0.13089106796090635, + "learning_rate": 2.6650519486493955e-06, + "loss": 0.7629, + "step": 3869 + }, + { + "epoch": 1.920441851805883, + "grad_norm": 0.13514599448403475, + "learning_rate": 2.664076718764756e-06, + "loss": 0.7292, + "step": 3870 + }, + { + "epoch": 1.9209383145091226, + "grad_norm": 0.13758072791357281, + "learning_rate": 2.663101463803621e-06, + "loss": 0.7058, + "step": 3871 + }, + { + "epoch": 1.9214347772123619, + "grad_norm": 0.13451491052620126, + "learning_rate": 2.6621261839150426e-06, + "loss": 0.764, + "step": 3872 + }, + { + "epoch": 1.9219312399156014, + "grad_norm": 0.13120349619972196, + "learning_rate": 2.6611508792480763e-06, + "loss": 0.7192, + "step": 3873 + }, + { + "epoch": 1.9224277026188408, + "grad_norm": 0.1343506319273604, + "learning_rate": 2.6601755499517826e-06, + "loss": 0.7515, + "step": 3874 + }, + { + "epoch": 1.92292416532208, + "grad_norm": 0.13155915623807082, + "learning_rate": 2.6592001961752246e-06, + "loss": 0.695, + "step": 3875 + }, + { + "epoch": 1.9234206280253194, + "grad_norm": 0.12537341810788755, + "learning_rate": 2.658224818067468e-06, + "loss": 0.6914, + "step": 3876 + }, + { + "epoch": 1.923917090728559, + "grad_norm": 0.1298224456392821, + "learning_rate": 2.657249415777585e-06, + "loss": 0.7031, + "step": 3877 + }, + { + "epoch": 1.9244135534317985, + "grad_norm": 0.12831397033755135, + "learning_rate": 2.6562739894546507e-06, + "loss": 0.6985, + "step": 3878 + }, + { + "epoch": 1.9249100161350379, + "grad_norm": 0.13463509794079495, + "learning_rate": 2.6552985392477424e-06, + "loss": 0.7578, + "step": 3879 + }, + { + "epoch": 1.9254064788382772, + "grad_norm": 0.13466504296387913, + "learning_rate": 2.6543230653059427e-06, + "loss": 0.7357, + "step": 3880 + }, + { + "epoch": 1.9259029415415165, + "grad_norm": 0.13590866765903148, + "learning_rate": 2.6533475677783364e-06, + "loss": 0.7367, + "step": 3881 + }, + { + "epoch": 1.926399404244756, + "grad_norm": 0.13302016457236643, + "learning_rate": 2.652372046814014e-06, + "loss": 0.7737, + "step": 3882 + }, + { + "epoch": 1.9268958669479956, + "grad_norm": 0.13348758747050615, + "learning_rate": 2.651396502562067e-06, + "loss": 0.8016, + "step": 3883 + }, + { + "epoch": 1.927392329651235, + "grad_norm": 0.1327244870657118, + "learning_rate": 2.6504209351715914e-06, + "loss": 0.7173, + "step": 3884 + }, + { + "epoch": 1.9278887923544743, + "grad_norm": 0.12927521222084867, + "learning_rate": 2.6494453447916884e-06, + "loss": 0.6832, + "step": 3885 + }, + { + "epoch": 1.9283852550577136, + "grad_norm": 0.13386451158693466, + "learning_rate": 2.6484697315714602e-06, + "loss": 0.7336, + "step": 3886 + }, + { + "epoch": 1.9288817177609532, + "grad_norm": 0.1264106819143579, + "learning_rate": 2.6474940956600143e-06, + "loss": 0.6716, + "step": 3887 + }, + { + "epoch": 1.9293781804641927, + "grad_norm": 0.1280344281232393, + "learning_rate": 2.64651843720646e-06, + "loss": 0.7613, + "step": 3888 + }, + { + "epoch": 1.929874643167432, + "grad_norm": 0.13685165914605946, + "learning_rate": 2.6455427563599128e-06, + "loss": 0.7595, + "step": 3889 + }, + { + "epoch": 1.9303711058706714, + "grad_norm": 0.12802262711212847, + "learning_rate": 2.644567053269489e-06, + "loss": 0.7105, + "step": 3890 + }, + { + "epoch": 1.9308675685739107, + "grad_norm": 0.1401396120332125, + "learning_rate": 2.643591328084309e-06, + "loss": 0.7363, + "step": 3891 + }, + { + "epoch": 1.9313640312771503, + "grad_norm": 0.13423489071016823, + "learning_rate": 2.6426155809534958e-06, + "loss": 0.6799, + "step": 3892 + }, + { + "epoch": 1.9318604939803898, + "grad_norm": 0.125010992787907, + "learning_rate": 2.6416398120261782e-06, + "loss": 0.6502, + "step": 3893 + }, + { + "epoch": 1.9323569566836292, + "grad_norm": 0.1378318453962945, + "learning_rate": 2.6406640214514866e-06, + "loss": 0.7189, + "step": 3894 + }, + { + "epoch": 1.9328534193868685, + "grad_norm": 0.12766766267445206, + "learning_rate": 2.639688209378554e-06, + "loss": 0.6806, + "step": 3895 + }, + { + "epoch": 1.9333498820901078, + "grad_norm": 0.1328404296148707, + "learning_rate": 2.63871237595652e-06, + "loss": 0.716, + "step": 3896 + }, + { + "epoch": 1.9338463447933474, + "grad_norm": 0.13020525639573502, + "learning_rate": 2.6377365213345217e-06, + "loss": 0.6673, + "step": 3897 + }, + { + "epoch": 1.934342807496587, + "grad_norm": 0.12842653922306596, + "learning_rate": 2.6367606456617057e-06, + "loss": 0.6998, + "step": 3898 + }, + { + "epoch": 1.9348392701998263, + "grad_norm": 0.1363023679071453, + "learning_rate": 2.6357847490872176e-06, + "loss": 0.7224, + "step": 3899 + }, + { + "epoch": 1.9353357329030656, + "grad_norm": 0.1371628916129143, + "learning_rate": 2.634808831760207e-06, + "loss": 0.7227, + "step": 3900 + }, + { + "epoch": 1.935832195606305, + "grad_norm": 0.13510099814120166, + "learning_rate": 2.6338328938298287e-06, + "loss": 0.7311, + "step": 3901 + }, + { + "epoch": 1.9363286583095445, + "grad_norm": 0.1298081070212144, + "learning_rate": 2.632856935445238e-06, + "loss": 0.6655, + "step": 3902 + }, + { + "epoch": 1.936825121012784, + "grad_norm": 0.1289538644890483, + "learning_rate": 2.6318809567555946e-06, + "loss": 0.7062, + "step": 3903 + }, + { + "epoch": 1.9373215837160234, + "grad_norm": 0.13439353563363904, + "learning_rate": 2.630904957910062e-06, + "loss": 0.6991, + "step": 3904 + }, + { + "epoch": 1.9378180464192627, + "grad_norm": 0.13576155759044287, + "learning_rate": 2.6299289390578054e-06, + "loss": 0.7344, + "step": 3905 + }, + { + "epoch": 1.938314509122502, + "grad_norm": 0.1284387703639709, + "learning_rate": 2.628952900347994e-06, + "loss": 0.7029, + "step": 3906 + }, + { + "epoch": 1.9388109718257416, + "grad_norm": 0.12937400679523783, + "learning_rate": 2.6279768419297997e-06, + "loss": 0.6859, + "step": 3907 + }, + { + "epoch": 1.9393074345289811, + "grad_norm": 0.1317149945336735, + "learning_rate": 2.6270007639523966e-06, + "loss": 0.679, + "step": 3908 + }, + { + "epoch": 1.9398038972322205, + "grad_norm": 0.1306880273284504, + "learning_rate": 2.6260246665649623e-06, + "loss": 0.6934, + "step": 3909 + }, + { + "epoch": 1.9403003599354598, + "grad_norm": 0.13275523795140065, + "learning_rate": 2.625048549916679e-06, + "loss": 0.71, + "step": 3910 + }, + { + "epoch": 1.9407968226386991, + "grad_norm": 0.13269576003308478, + "learning_rate": 2.6240724141567296e-06, + "loss": 0.7458, + "step": 3911 + }, + { + "epoch": 1.9412932853419387, + "grad_norm": 0.12913224685659597, + "learning_rate": 2.6230962594343018e-06, + "loss": 0.639, + "step": 3912 + }, + { + "epoch": 1.9417897480451782, + "grad_norm": 0.13621237829923233, + "learning_rate": 2.622120085898584e-06, + "loss": 0.7167, + "step": 3913 + }, + { + "epoch": 1.9422862107484176, + "grad_norm": 0.13555011416798202, + "learning_rate": 2.6211438936987692e-06, + "loss": 0.7019, + "step": 3914 + }, + { + "epoch": 1.942782673451657, + "grad_norm": 0.13015371567538989, + "learning_rate": 2.620167682984052e-06, + "loss": 0.7581, + "step": 3915 + }, + { + "epoch": 1.9432791361548962, + "grad_norm": 0.14513175483219928, + "learning_rate": 2.6191914539036318e-06, + "loss": 0.7588, + "step": 3916 + }, + { + "epoch": 1.9437755988581358, + "grad_norm": 0.13652288010038582, + "learning_rate": 2.6182152066067095e-06, + "loss": 0.7355, + "step": 3917 + }, + { + "epoch": 1.9442720615613753, + "grad_norm": 0.14112063768896954, + "learning_rate": 2.6172389412424876e-06, + "loss": 0.8433, + "step": 3918 + }, + { + "epoch": 1.9447685242646147, + "grad_norm": 0.13310261496502976, + "learning_rate": 2.616262657960173e-06, + "loss": 0.749, + "step": 3919 + }, + { + "epoch": 1.945264986967854, + "grad_norm": 0.1298763438679353, + "learning_rate": 2.6152863569089754e-06, + "loss": 0.7002, + "step": 3920 + }, + { + "epoch": 1.9457614496710933, + "grad_norm": 0.13406943600819382, + "learning_rate": 2.614310038238107e-06, + "loss": 0.7198, + "step": 3921 + }, + { + "epoch": 1.9462579123743329, + "grad_norm": 0.1388915294171547, + "learning_rate": 2.613333702096782e-06, + "loss": 0.732, + "step": 3922 + }, + { + "epoch": 1.9467543750775724, + "grad_norm": 0.13098745963258238, + "learning_rate": 2.6123573486342185e-06, + "loss": 0.694, + "step": 3923 + }, + { + "epoch": 1.9472508377808118, + "grad_norm": 0.13673305400029112, + "learning_rate": 2.6113809779996344e-06, + "loss": 0.709, + "step": 3924 + }, + { + "epoch": 1.947747300484051, + "grad_norm": 0.13672731547591427, + "learning_rate": 2.610404590342254e-06, + "loss": 0.7119, + "step": 3925 + }, + { + "epoch": 1.9482437631872904, + "grad_norm": 0.12667903565039704, + "learning_rate": 2.6094281858113026e-06, + "loss": 0.7128, + "step": 3926 + }, + { + "epoch": 1.94874022589053, + "grad_norm": 0.1295518629610594, + "learning_rate": 2.6084517645560077e-06, + "loss": 0.7045, + "step": 3927 + }, + { + "epoch": 1.9492366885937695, + "grad_norm": 0.13912368621627139, + "learning_rate": 2.6074753267255994e-06, + "loss": 0.7286, + "step": 3928 + }, + { + "epoch": 1.9497331512970089, + "grad_norm": 0.1408437562224136, + "learning_rate": 2.60649887246931e-06, + "loss": 0.7566, + "step": 3929 + }, + { + "epoch": 1.9502296140002482, + "grad_norm": 0.1343158417044777, + "learning_rate": 2.605522401936376e-06, + "loss": 0.7252, + "step": 3930 + }, + { + "epoch": 1.9507260767034875, + "grad_norm": 0.1312175766834046, + "learning_rate": 2.604545915276035e-06, + "loss": 0.718, + "step": 3931 + }, + { + "epoch": 1.951222539406727, + "grad_norm": 0.1375316196872564, + "learning_rate": 2.6035694126375265e-06, + "loss": 0.7681, + "step": 3932 + }, + { + "epoch": 1.9517190021099666, + "grad_norm": 0.1347425466729892, + "learning_rate": 2.6025928941700945e-06, + "loss": 0.6754, + "step": 3933 + }, + { + "epoch": 1.952215464813206, + "grad_norm": 0.12903423135149714, + "learning_rate": 2.6016163600229832e-06, + "loss": 0.7674, + "step": 3934 + }, + { + "epoch": 1.9527119275164453, + "grad_norm": 0.13070824473355627, + "learning_rate": 2.6006398103454407e-06, + "loss": 0.6968, + "step": 3935 + }, + { + "epoch": 1.9532083902196846, + "grad_norm": 0.12638361456974517, + "learning_rate": 2.5996632452867167e-06, + "loss": 0.6966, + "step": 3936 + }, + { + "epoch": 1.9537048529229242, + "grad_norm": 0.13442945386349076, + "learning_rate": 2.5986866649960634e-06, + "loss": 0.7475, + "step": 3937 + }, + { + "epoch": 1.9542013156261637, + "grad_norm": 0.13647786271821394, + "learning_rate": 2.597710069622736e-06, + "loss": 0.6832, + "step": 3938 + }, + { + "epoch": 1.954697778329403, + "grad_norm": 0.1303452381637909, + "learning_rate": 2.596733459315992e-06, + "loss": 0.7067, + "step": 3939 + }, + { + "epoch": 1.9551942410326424, + "grad_norm": 0.1272985129582747, + "learning_rate": 2.595756834225089e-06, + "loss": 0.6699, + "step": 3940 + }, + { + "epoch": 1.9556907037358817, + "grad_norm": 0.12900166946504296, + "learning_rate": 2.594780194499289e-06, + "loss": 0.6893, + "step": 3941 + }, + { + "epoch": 1.9561871664391213, + "grad_norm": 0.13345773544468753, + "learning_rate": 2.593803540287856e-06, + "loss": 0.6645, + "step": 3942 + }, + { + "epoch": 1.9566836291423608, + "grad_norm": 0.12499172467109242, + "learning_rate": 2.592826871740056e-06, + "loss": 0.6568, + "step": 3943 + }, + { + "epoch": 1.9571800918456002, + "grad_norm": 0.1369899943691365, + "learning_rate": 2.5918501890051573e-06, + "loss": 0.7235, + "step": 3944 + }, + { + "epoch": 1.9576765545488395, + "grad_norm": 0.13445906303874938, + "learning_rate": 2.5908734922324293e-06, + "loss": 0.725, + "step": 3945 + }, + { + "epoch": 1.9581730172520788, + "grad_norm": 0.1620759841591545, + "learning_rate": 2.5898967815711455e-06, + "loss": 0.6952, + "step": 3946 + }, + { + "epoch": 1.9586694799553184, + "grad_norm": 0.1398600398265006, + "learning_rate": 2.58892005717058e-06, + "loss": 0.766, + "step": 3947 + }, + { + "epoch": 1.959165942658558, + "grad_norm": 0.13096623784090528, + "learning_rate": 2.5879433191800093e-06, + "loss": 0.7198, + "step": 3948 + }, + { + "epoch": 1.9596624053617973, + "grad_norm": 0.12657285136649368, + "learning_rate": 2.5869665677487122e-06, + "loss": 0.6872, + "step": 3949 + }, + { + "epoch": 1.9601588680650366, + "grad_norm": 0.13897203572470015, + "learning_rate": 2.58598980302597e-06, + "loss": 0.7355, + "step": 3950 + }, + { + "epoch": 1.960655330768276, + "grad_norm": 0.12963711745676562, + "learning_rate": 2.585013025161065e-06, + "loss": 0.6979, + "step": 3951 + }, + { + "epoch": 1.9611517934715155, + "grad_norm": 0.13783927250748948, + "learning_rate": 2.584036234303282e-06, + "loss": 0.7381, + "step": 3952 + }, + { + "epoch": 1.961648256174755, + "grad_norm": 0.13448371125196593, + "learning_rate": 2.583059430601908e-06, + "loss": 0.699, + "step": 3953 + }, + { + "epoch": 1.9621447188779944, + "grad_norm": 0.13318126568011363, + "learning_rate": 2.5820826142062323e-06, + "loss": 0.6785, + "step": 3954 + }, + { + "epoch": 1.9626411815812337, + "grad_norm": 0.13076288936075522, + "learning_rate": 2.581105785265545e-06, + "loss": 0.7204, + "step": 3955 + }, + { + "epoch": 1.963137644284473, + "grad_norm": 0.13117966572328443, + "learning_rate": 2.580128943929139e-06, + "loss": 0.7089, + "step": 3956 + }, + { + "epoch": 1.9636341069877126, + "grad_norm": 0.14643814504267247, + "learning_rate": 2.5791520903463076e-06, + "loss": 0.7698, + "step": 3957 + }, + { + "epoch": 1.9641305696909521, + "grad_norm": 0.13110955970838375, + "learning_rate": 2.578175224666349e-06, + "loss": 0.702, + "step": 3958 + }, + { + "epoch": 1.9646270323941915, + "grad_norm": 0.13211603472359895, + "learning_rate": 2.5771983470385604e-06, + "loss": 0.691, + "step": 3959 + }, + { + "epoch": 1.9651234950974308, + "grad_norm": 0.13433138088355598, + "learning_rate": 2.576221457612243e-06, + "loss": 0.7116, + "step": 3960 + }, + { + "epoch": 1.9656199578006701, + "grad_norm": 0.1329409392764817, + "learning_rate": 2.575244556536697e-06, + "loss": 0.7445, + "step": 3961 + }, + { + "epoch": 1.9661164205039097, + "grad_norm": 0.1314186532703457, + "learning_rate": 2.5742676439612283e-06, + "loss": 0.7041, + "step": 3962 + }, + { + "epoch": 1.9666128832071492, + "grad_norm": 0.13197553608649243, + "learning_rate": 2.5732907200351402e-06, + "loss": 0.7109, + "step": 3963 + }, + { + "epoch": 1.9671093459103886, + "grad_norm": 0.139383832666028, + "learning_rate": 2.5723137849077406e-06, + "loss": 0.7219, + "step": 3964 + }, + { + "epoch": 1.9676058086136279, + "grad_norm": 0.12890251354191545, + "learning_rate": 2.571336838728338e-06, + "loss": 0.7209, + "step": 3965 + }, + { + "epoch": 1.9681022713168672, + "grad_norm": 0.13068185566410134, + "learning_rate": 2.5703598816462443e-06, + "loss": 0.7415, + "step": 3966 + }, + { + "epoch": 1.9685987340201068, + "grad_norm": 0.13578478265192953, + "learning_rate": 2.5693829138107707e-06, + "loss": 0.7458, + "step": 3967 + }, + { + "epoch": 1.9690951967233463, + "grad_norm": 0.13252065296864216, + "learning_rate": 2.568405935371231e-06, + "loss": 0.7234, + "step": 3968 + }, + { + "epoch": 1.9695916594265857, + "grad_norm": 0.13323778860939306, + "learning_rate": 2.5674289464769405e-06, + "loss": 0.7013, + "step": 3969 + }, + { + "epoch": 1.970088122129825, + "grad_norm": 0.1287653472052357, + "learning_rate": 2.566451947277217e-06, + "loss": 0.6984, + "step": 3970 + }, + { + "epoch": 1.9705845848330643, + "grad_norm": 0.12933587377045058, + "learning_rate": 2.565474937921379e-06, + "loss": 0.7015, + "step": 3971 + }, + { + "epoch": 1.9710810475363039, + "grad_norm": 0.13330915382752193, + "learning_rate": 2.5644979185587466e-06, + "loss": 0.7413, + "step": 3972 + }, + { + "epoch": 1.9715775102395434, + "grad_norm": 0.1266644141116552, + "learning_rate": 2.5635208893386416e-06, + "loss": 0.6668, + "step": 3973 + }, + { + "epoch": 1.9720739729427827, + "grad_norm": 0.1394923566595831, + "learning_rate": 2.5625438504103863e-06, + "loss": 0.725, + "step": 3974 + }, + { + "epoch": 1.972570435646022, + "grad_norm": 0.1333230212803404, + "learning_rate": 2.5615668019233065e-06, + "loss": 0.7066, + "step": 3975 + }, + { + "epoch": 1.9730668983492614, + "grad_norm": 0.13786776101291032, + "learning_rate": 2.560589744026729e-06, + "loss": 0.7136, + "step": 3976 + }, + { + "epoch": 1.973563361052501, + "grad_norm": 0.13850183160275445, + "learning_rate": 2.5596126768699798e-06, + "loss": 0.6687, + "step": 3977 + }, + { + "epoch": 1.9740598237557403, + "grad_norm": 0.13732643590313048, + "learning_rate": 2.5586356006023894e-06, + "loss": 0.7616, + "step": 3978 + }, + { + "epoch": 1.9745562864589798, + "grad_norm": 0.13057844494817822, + "learning_rate": 2.5576585153732875e-06, + "loss": 0.6654, + "step": 3979 + }, + { + "epoch": 1.9750527491622192, + "grad_norm": 0.13328648064830395, + "learning_rate": 2.556681421332005e-06, + "loss": 0.7171, + "step": 3980 + }, + { + "epoch": 1.9755492118654585, + "grad_norm": 0.13209382988878132, + "learning_rate": 2.555704318627877e-06, + "loss": 0.7245, + "step": 3981 + }, + { + "epoch": 1.976045674568698, + "grad_norm": 0.1255898846757212, + "learning_rate": 2.5547272074102375e-06, + "loss": 0.6884, + "step": 3982 + }, + { + "epoch": 1.9765421372719374, + "grad_norm": 0.13542305325011644, + "learning_rate": 2.553750087828421e-06, + "loss": 0.7229, + "step": 3983 + }, + { + "epoch": 1.977038599975177, + "grad_norm": 0.13274636935908424, + "learning_rate": 2.552772960031765e-06, + "loss": 0.727, + "step": 3984 + }, + { + "epoch": 1.9775350626784163, + "grad_norm": 0.13087815752389365, + "learning_rate": 2.551795824169609e-06, + "loss": 0.6981, + "step": 3985 + }, + { + "epoch": 1.9780315253816556, + "grad_norm": 0.13311849040345575, + "learning_rate": 2.550818680391292e-06, + "loss": 0.7227, + "step": 3986 + }, + { + "epoch": 1.9785279880848952, + "grad_norm": 0.13708649225810834, + "learning_rate": 2.5498415288461537e-06, + "loss": 0.7279, + "step": 3987 + }, + { + "epoch": 1.9790244507881345, + "grad_norm": 0.13033779988678035, + "learning_rate": 2.548864369683538e-06, + "loss": 0.6814, + "step": 3988 + }, + { + "epoch": 1.979520913491374, + "grad_norm": 0.13116550526012039, + "learning_rate": 2.547887203052786e-06, + "loss": 0.7825, + "step": 3989 + }, + { + "epoch": 1.9800173761946134, + "grad_norm": 0.13081602543099036, + "learning_rate": 2.5469100291032423e-06, + "loss": 0.7239, + "step": 3990 + }, + { + "epoch": 1.9805138388978527, + "grad_norm": 0.13072879071258814, + "learning_rate": 2.545932847984254e-06, + "loss": 0.7104, + "step": 3991 + }, + { + "epoch": 1.9810103016010923, + "grad_norm": 0.13258224731686477, + "learning_rate": 2.5449556598451656e-06, + "loss": 0.6974, + "step": 3992 + }, + { + "epoch": 1.9815067643043316, + "grad_norm": 0.1351940871833718, + "learning_rate": 2.5439784648353256e-06, + "loss": 0.7572, + "step": 3993 + }, + { + "epoch": 1.9820032270075711, + "grad_norm": 0.14461540568622883, + "learning_rate": 2.543001263104083e-06, + "loss": 0.7281, + "step": 3994 + }, + { + "epoch": 1.9824996897108105, + "grad_norm": 0.13342177790858398, + "learning_rate": 2.5420240548007856e-06, + "loss": 0.7461, + "step": 3995 + }, + { + "epoch": 1.9829961524140498, + "grad_norm": 0.13543936141725377, + "learning_rate": 2.5410468400747858e-06, + "loss": 0.7591, + "step": 3996 + }, + { + "epoch": 1.9834926151172894, + "grad_norm": 0.1292242889450314, + "learning_rate": 2.5400696190754347e-06, + "loss": 0.7125, + "step": 3997 + }, + { + "epoch": 1.9839890778205287, + "grad_norm": 0.13366333141114367, + "learning_rate": 2.539092391952085e-06, + "loss": 0.7215, + "step": 3998 + }, + { + "epoch": 1.9844855405237682, + "grad_norm": 0.13947805335489766, + "learning_rate": 2.5381151588540896e-06, + "loss": 0.7721, + "step": 3999 + }, + { + "epoch": 1.9849820032270076, + "grad_norm": 0.13011957215342748, + "learning_rate": 2.537137919930803e-06, + "loss": 0.7054, + "step": 4000 + }, + { + "epoch": 1.985478465930247, + "grad_norm": 0.1350409911144854, + "learning_rate": 2.5361606753315814e-06, + "loss": 0.687, + "step": 4001 + }, + { + "epoch": 1.9859749286334865, + "grad_norm": 0.129620679858614, + "learning_rate": 2.5351834252057805e-06, + "loss": 0.7008, + "step": 4002 + }, + { + "epoch": 1.9864713913367258, + "grad_norm": 0.12678966363438363, + "learning_rate": 2.534206169702757e-06, + "loss": 0.6995, + "step": 4003 + }, + { + "epoch": 1.9869678540399653, + "grad_norm": 0.1318913991769552, + "learning_rate": 2.533228908971869e-06, + "loss": 0.734, + "step": 4004 + }, + { + "epoch": 1.9874643167432047, + "grad_norm": 0.13831865321707615, + "learning_rate": 2.532251643162475e-06, + "loss": 0.7618, + "step": 4005 + }, + { + "epoch": 1.987960779446444, + "grad_norm": 0.1349051666242995, + "learning_rate": 2.5312743724239336e-06, + "loss": 0.7522, + "step": 4006 + }, + { + "epoch": 1.9884572421496836, + "grad_norm": 0.13052404435468878, + "learning_rate": 2.5302970969056068e-06, + "loss": 0.6867, + "step": 4007 + }, + { + "epoch": 1.9889537048529229, + "grad_norm": 0.13818539051178105, + "learning_rate": 2.529319816756854e-06, + "loss": 0.733, + "step": 4008 + }, + { + "epoch": 1.9894501675561624, + "grad_norm": 0.12800876536122638, + "learning_rate": 2.5283425321270377e-06, + "loss": 0.6684, + "step": 4009 + }, + { + "epoch": 1.9899466302594018, + "grad_norm": 0.12765205661654766, + "learning_rate": 2.5273652431655204e-06, + "loss": 0.7032, + "step": 4010 + }, + { + "epoch": 1.990443092962641, + "grad_norm": 0.1338564783816033, + "learning_rate": 2.526387950021663e-06, + "loss": 0.7496, + "step": 4011 + }, + { + "epoch": 1.9909395556658807, + "grad_norm": 0.13415496043164832, + "learning_rate": 2.525410652844831e-06, + "loss": 0.7157, + "step": 4012 + }, + { + "epoch": 1.99143601836912, + "grad_norm": 0.1282775301010404, + "learning_rate": 2.524433351784389e-06, + "loss": 0.6783, + "step": 4013 + }, + { + "epoch": 1.9919324810723595, + "grad_norm": 0.13209341786586526, + "learning_rate": 2.523456046989701e-06, + "loss": 0.7353, + "step": 4014 + }, + { + "epoch": 1.9924289437755989, + "grad_norm": 0.1287222818009373, + "learning_rate": 2.5224787386101307e-06, + "loss": 0.6862, + "step": 4015 + }, + { + "epoch": 1.9929254064788382, + "grad_norm": 0.13035922850407525, + "learning_rate": 2.5215014267950465e-06, + "loss": 0.7004, + "step": 4016 + }, + { + "epoch": 1.9934218691820775, + "grad_norm": 0.131542341490382, + "learning_rate": 2.5205241116938137e-06, + "loss": 0.7147, + "step": 4017 + }, + { + "epoch": 1.993918331885317, + "grad_norm": 0.136610862731398, + "learning_rate": 2.5195467934558003e-06, + "loss": 0.7341, + "step": 4018 + }, + { + "epoch": 1.9944147945885566, + "grad_norm": 0.13012603782449922, + "learning_rate": 2.5185694722303728e-06, + "loss": 0.6945, + "step": 4019 + }, + { + "epoch": 1.994911257291796, + "grad_norm": 0.1325429269300131, + "learning_rate": 2.517592148166899e-06, + "loss": 0.7291, + "step": 4020 + }, + { + "epoch": 1.9954077199950353, + "grad_norm": 0.1297801158635372, + "learning_rate": 2.516614821414747e-06, + "loss": 0.6867, + "step": 4021 + }, + { + "epoch": 1.9959041826982746, + "grad_norm": 0.13181943517502306, + "learning_rate": 2.5156374921232862e-06, + "loss": 0.6634, + "step": 4022 + }, + { + "epoch": 1.9964006454015142, + "grad_norm": 0.13355813171588157, + "learning_rate": 2.5146601604418854e-06, + "loss": 0.7556, + "step": 4023 + }, + { + "epoch": 1.9968971081047537, + "grad_norm": 0.1310512552198657, + "learning_rate": 2.5136828265199143e-06, + "loss": 0.7198, + "step": 4024 + }, + { + "epoch": 1.997393570807993, + "grad_norm": 0.1272191857000951, + "learning_rate": 2.512705490506743e-06, + "loss": 0.7172, + "step": 4025 + }, + { + "epoch": 1.9978900335112324, + "grad_norm": 0.12766117521597195, + "learning_rate": 2.511728152551741e-06, + "loss": 0.724, + "step": 4026 + }, + { + "epoch": 1.9983864962144717, + "grad_norm": 0.13122724706133568, + "learning_rate": 2.5107508128042786e-06, + "loss": 0.7186, + "step": 4027 + }, + { + "epoch": 1.9988829589177113, + "grad_norm": 0.13170560779745644, + "learning_rate": 2.509773471413726e-06, + "loss": 0.7394, + "step": 4028 + }, + { + "epoch": 1.9993794216209508, + "grad_norm": 0.1279325800617962, + "learning_rate": 2.508796128529456e-06, + "loss": 0.6678, + "step": 4029 + }, + { + "epoch": 1.9998758843241902, + "grad_norm": 0.12867108507101366, + "learning_rate": 2.507818784300839e-06, + "loss": 0.6981, + "step": 4030 + }, + { + "epoch": 2.0, + "grad_norm": 0.12867108507101366, + "learning_rate": 2.5068414388772454e-06, + "loss": 0.191, + "step": 4031 + }, + { + "epoch": 2.0003723470274295, + "grad_norm": 0.13238445630417517, + "learning_rate": 2.505864092408048e-06, + "loss": 0.5359, + "step": 4032 + }, + { + "epoch": 2.0003723470274295, + "eval_loss": 0.7288655042648315, + "eval_runtime": 135.8088, + "eval_samples_per_second": 223.498, + "eval_steps_per_second": 27.944, + "step": 4032 + }, + { + "epoch": 2.0004964627032393, + "grad_norm": 0.13212599455112428, + "learning_rate": 2.504886745042618e-06, + "loss": 0.724, + "step": 4033 + }, + { + "epoch": 2.0009929254064787, + "grad_norm": 0.1289165900817434, + "learning_rate": 2.503909396930328e-06, + "loss": 0.6931, + "step": 4034 + }, + { + "epoch": 2.0014893881097184, + "grad_norm": 0.12817979511727562, + "learning_rate": 2.5029320482205487e-06, + "loss": 0.7045, + "step": 4035 + }, + { + "epoch": 2.0019858508129578, + "grad_norm": 0.13305215189225175, + "learning_rate": 2.501954699062653e-06, + "loss": 0.6943, + "step": 4036 + }, + { + "epoch": 2.002482313516197, + "grad_norm": 0.1293844522970027, + "learning_rate": 2.500977349606013e-06, + "loss": 0.7171, + "step": 4037 + }, + { + "epoch": 2.0029787762194364, + "grad_norm": 0.14606041921842416, + "learning_rate": 2.5e-06, + "loss": 0.759, + "step": 4038 + }, + { + "epoch": 2.0034752389226758, + "grad_norm": 0.12805929586454654, + "learning_rate": 2.499022650393988e-06, + "loss": 0.6875, + "step": 4039 + }, + { + "epoch": 2.0039717016259155, + "grad_norm": 0.13380230251599434, + "learning_rate": 2.4980453009373475e-06, + "loss": 0.7062, + "step": 4040 + }, + { + "epoch": 2.004468164329155, + "grad_norm": 0.12965303996516353, + "learning_rate": 2.497067951779452e-06, + "loss": 0.6996, + "step": 4041 + }, + { + "epoch": 2.004964627032394, + "grad_norm": 0.1295766633898341, + "learning_rate": 2.4960906030696727e-06, + "loss": 0.6684, + "step": 4042 + }, + { + "epoch": 2.0054610897356335, + "grad_norm": 0.1312131877586875, + "learning_rate": 2.495113254957382e-06, + "loss": 0.7207, + "step": 4043 + }, + { + "epoch": 2.005957552438873, + "grad_norm": 0.13817927580280212, + "learning_rate": 2.4941359075919523e-06, + "loss": 0.7428, + "step": 4044 + }, + { + "epoch": 2.0064540151421126, + "grad_norm": 0.136455142677692, + "learning_rate": 2.493158561122754e-06, + "loss": 0.6922, + "step": 4045 + }, + { + "epoch": 2.006950477845352, + "grad_norm": 0.13906822012805292, + "learning_rate": 2.492181215699162e-06, + "loss": 0.7097, + "step": 4046 + }, + { + "epoch": 2.0074469405485913, + "grad_norm": 0.12826980150415948, + "learning_rate": 2.4912038714705447e-06, + "loss": 0.6902, + "step": 4047 + }, + { + "epoch": 2.0079434032518306, + "grad_norm": 0.1388317530263215, + "learning_rate": 2.490226528586275e-06, + "loss": 0.7149, + "step": 4048 + }, + { + "epoch": 2.00843986595507, + "grad_norm": 0.13268448735817925, + "learning_rate": 2.489249187195723e-06, + "loss": 0.7098, + "step": 4049 + }, + { + "epoch": 2.0089363286583097, + "grad_norm": 0.1302092877003093, + "learning_rate": 2.4882718474482604e-06, + "loss": 0.7251, + "step": 4050 + }, + { + "epoch": 2.009432791361549, + "grad_norm": 0.13047139016142711, + "learning_rate": 2.487294509493258e-06, + "loss": 0.7105, + "step": 4051 + }, + { + "epoch": 2.0099292540647884, + "grad_norm": 0.13056451140753167, + "learning_rate": 2.4863171734800866e-06, + "loss": 0.6717, + "step": 4052 + }, + { + "epoch": 2.0104257167680277, + "grad_norm": 0.1272813781903442, + "learning_rate": 2.485339839558115e-06, + "loss": 0.6912, + "step": 4053 + }, + { + "epoch": 2.010922179471267, + "grad_norm": 0.1293306773850289, + "learning_rate": 2.4843625078767146e-06, + "loss": 0.6625, + "step": 4054 + }, + { + "epoch": 2.011418642174507, + "grad_norm": 0.13026414160725963, + "learning_rate": 2.4833851785852536e-06, + "loss": 0.7192, + "step": 4055 + }, + { + "epoch": 2.011915104877746, + "grad_norm": 0.13154534592494477, + "learning_rate": 2.4824078518331017e-06, + "loss": 0.7098, + "step": 4056 + }, + { + "epoch": 2.0124115675809855, + "grad_norm": 0.13413230046665645, + "learning_rate": 2.481430527769628e-06, + "loss": 0.7306, + "step": 4057 + }, + { + "epoch": 2.012908030284225, + "grad_norm": 0.13231043126436018, + "learning_rate": 2.4804532065442e-06, + "loss": 0.7263, + "step": 4058 + }, + { + "epoch": 2.013404492987464, + "grad_norm": 0.13114069790750638, + "learning_rate": 2.4794758883061862e-06, + "loss": 0.6813, + "step": 4059 + }, + { + "epoch": 2.013900955690704, + "grad_norm": 0.1330470419586042, + "learning_rate": 2.4784985732049535e-06, + "loss": 0.7061, + "step": 4060 + }, + { + "epoch": 2.0143974183939433, + "grad_norm": 0.1345085568471715, + "learning_rate": 2.4775212613898693e-06, + "loss": 0.7379, + "step": 4061 + }, + { + "epoch": 2.0148938810971826, + "grad_norm": 0.12782673695827584, + "learning_rate": 2.4765439530103004e-06, + "loss": 0.7046, + "step": 4062 + }, + { + "epoch": 2.015390343800422, + "grad_norm": 0.13254520200468667, + "learning_rate": 2.475566648215612e-06, + "loss": 0.7034, + "step": 4063 + }, + { + "epoch": 2.0158868065036613, + "grad_norm": 0.13807047247892, + "learning_rate": 2.4745893471551697e-06, + "loss": 0.6927, + "step": 4064 + }, + { + "epoch": 2.016383269206901, + "grad_norm": 0.12831463879417368, + "learning_rate": 2.4736120499783378e-06, + "loss": 0.6694, + "step": 4065 + }, + { + "epoch": 2.0168797319101404, + "grad_norm": 0.13082807641735877, + "learning_rate": 2.472634756834481e-06, + "loss": 0.6968, + "step": 4066 + }, + { + "epoch": 2.0173761946133797, + "grad_norm": 0.1274033272524088, + "learning_rate": 2.4716574678729627e-06, + "loss": 0.6998, + "step": 4067 + }, + { + "epoch": 2.017872657316619, + "grad_norm": 0.12742003523402126, + "learning_rate": 2.470680183243147e-06, + "loss": 0.6828, + "step": 4068 + }, + { + "epoch": 2.0183691200198584, + "grad_norm": 0.13490291668223267, + "learning_rate": 2.469702903094394e-06, + "loss": 0.7621, + "step": 4069 + }, + { + "epoch": 2.018865582723098, + "grad_norm": 0.13385281497853518, + "learning_rate": 2.4687256275760668e-06, + "loss": 0.7237, + "step": 4070 + }, + { + "epoch": 2.0193620454263375, + "grad_norm": 0.1360671850549657, + "learning_rate": 2.467748356837526e-06, + "loss": 0.7226, + "step": 4071 + }, + { + "epoch": 2.019858508129577, + "grad_norm": 0.13234055538841305, + "learning_rate": 2.4667710910281318e-06, + "loss": 0.7682, + "step": 4072 + }, + { + "epoch": 2.020354970832816, + "grad_norm": 0.12771711139534347, + "learning_rate": 2.465793830297244e-06, + "loss": 0.6369, + "step": 4073 + }, + { + "epoch": 2.0208514335360555, + "grad_norm": 0.13312672540262874, + "learning_rate": 2.4648165747942203e-06, + "loss": 0.7368, + "step": 4074 + }, + { + "epoch": 2.0213478962392952, + "grad_norm": 0.13105724105229263, + "learning_rate": 2.463839324668419e-06, + "loss": 0.7299, + "step": 4075 + }, + { + "epoch": 2.0218443589425346, + "grad_norm": 0.13275204495209755, + "learning_rate": 2.462862080069197e-06, + "loss": 0.7286, + "step": 4076 + }, + { + "epoch": 2.022340821645774, + "grad_norm": 0.13761097722270485, + "learning_rate": 2.461884841145911e-06, + "loss": 0.725, + "step": 4077 + }, + { + "epoch": 2.022837284349013, + "grad_norm": 0.13174698392561143, + "learning_rate": 2.460907608047916e-06, + "loss": 0.6977, + "step": 4078 + }, + { + "epoch": 2.0233337470522526, + "grad_norm": 0.14004069015911763, + "learning_rate": 2.459930380924566e-06, + "loss": 0.7458, + "step": 4079 + }, + { + "epoch": 2.0238302097554923, + "grad_norm": 0.129016892174802, + "learning_rate": 2.4589531599252155e-06, + "loss": 0.6907, + "step": 4080 + }, + { + "epoch": 2.0243266724587317, + "grad_norm": 0.13159681156179226, + "learning_rate": 2.4579759451992157e-06, + "loss": 0.7164, + "step": 4081 + }, + { + "epoch": 2.024823135161971, + "grad_norm": 0.13167520844662808, + "learning_rate": 2.4569987368959186e-06, + "loss": 0.7421, + "step": 4082 + }, + { + "epoch": 2.0253195978652103, + "grad_norm": 0.1363803013638871, + "learning_rate": 2.4560215351646752e-06, + "loss": 0.7367, + "step": 4083 + }, + { + "epoch": 2.0258160605684497, + "grad_norm": 0.12838138545946867, + "learning_rate": 2.4550443401548348e-06, + "loss": 0.7207, + "step": 4084 + }, + { + "epoch": 2.0263125232716894, + "grad_norm": 0.1316322999014349, + "learning_rate": 2.4540671520157474e-06, + "loss": 0.7001, + "step": 4085 + }, + { + "epoch": 2.0268089859749288, + "grad_norm": 0.13525722121983028, + "learning_rate": 2.453089970896758e-06, + "loss": 0.7593, + "step": 4086 + }, + { + "epoch": 2.027305448678168, + "grad_norm": 0.1305004559151505, + "learning_rate": 2.452112796947215e-06, + "loss": 0.6767, + "step": 4087 + }, + { + "epoch": 2.0278019113814074, + "grad_norm": 0.13465746531659445, + "learning_rate": 2.451135630316463e-06, + "loss": 0.7024, + "step": 4088 + }, + { + "epoch": 2.0282983740846467, + "grad_norm": 0.12955552855480887, + "learning_rate": 2.4501584711538467e-06, + "loss": 0.6726, + "step": 4089 + }, + { + "epoch": 2.0287948367878865, + "grad_norm": 0.1320046748116342, + "learning_rate": 2.4491813196087087e-06, + "loss": 0.7145, + "step": 4090 + }, + { + "epoch": 2.029291299491126, + "grad_norm": 0.1426680668898575, + "learning_rate": 2.4482041758303914e-06, + "loss": 0.7211, + "step": 4091 + }, + { + "epoch": 2.029787762194365, + "grad_norm": 0.13091969364968287, + "learning_rate": 2.447227039968235e-06, + "loss": 0.6937, + "step": 4092 + }, + { + "epoch": 2.0302842248976045, + "grad_norm": 0.13053571724955956, + "learning_rate": 2.4462499121715794e-06, + "loss": 0.7057, + "step": 4093 + }, + { + "epoch": 2.030780687600844, + "grad_norm": 0.13244995057683245, + "learning_rate": 2.4452727925897633e-06, + "loss": 0.715, + "step": 4094 + }, + { + "epoch": 2.0312771503040836, + "grad_norm": 0.12980740024168078, + "learning_rate": 2.4442956813721235e-06, + "loss": 0.6907, + "step": 4095 + }, + { + "epoch": 2.031773613007323, + "grad_norm": 0.1320072156478787, + "learning_rate": 2.443318578667996e-06, + "loss": 0.7048, + "step": 4096 + }, + { + "epoch": 2.0322700757105623, + "grad_norm": 0.13288927327614064, + "learning_rate": 2.442341484626714e-06, + "loss": 0.6985, + "step": 4097 + }, + { + "epoch": 2.0327665384138016, + "grad_norm": 0.13044804570837723, + "learning_rate": 2.4413643993976114e-06, + "loss": 0.7046, + "step": 4098 + }, + { + "epoch": 2.033263001117041, + "grad_norm": 0.12860176814635504, + "learning_rate": 2.4403873231300206e-06, + "loss": 0.6514, + "step": 4099 + }, + { + "epoch": 2.0337594638202807, + "grad_norm": 0.12841049842685956, + "learning_rate": 2.4394102559732717e-06, + "loss": 0.7143, + "step": 4100 + }, + { + "epoch": 2.03425592652352, + "grad_norm": 0.1383324791180656, + "learning_rate": 2.438433198076694e-06, + "loss": 0.7236, + "step": 4101 + }, + { + "epoch": 2.0347523892267594, + "grad_norm": 0.1334274425685486, + "learning_rate": 2.437456149589614e-06, + "loss": 0.717, + "step": 4102 + }, + { + "epoch": 2.0352488519299987, + "grad_norm": 0.1328619959886049, + "learning_rate": 2.4364791106613596e-06, + "loss": 0.7392, + "step": 4103 + }, + { + "epoch": 2.035745314633238, + "grad_norm": 0.12837963819339782, + "learning_rate": 2.435502081441254e-06, + "loss": 0.6838, + "step": 4104 + }, + { + "epoch": 2.0362417773364774, + "grad_norm": 0.1314972595674822, + "learning_rate": 2.434525062078622e-06, + "loss": 0.7104, + "step": 4105 + }, + { + "epoch": 2.036738240039717, + "grad_norm": 0.1376872036291365, + "learning_rate": 2.4335480527227833e-06, + "loss": 0.7023, + "step": 4106 + }, + { + "epoch": 2.0372347027429565, + "grad_norm": 0.12746910204547873, + "learning_rate": 2.43257105352306e-06, + "loss": 0.7065, + "step": 4107 + }, + { + "epoch": 2.037731165446196, + "grad_norm": 0.12941269428669738, + "learning_rate": 2.4315940646287693e-06, + "loss": 0.6786, + "step": 4108 + }, + { + "epoch": 2.038227628149435, + "grad_norm": 0.1334515747366998, + "learning_rate": 2.4306170861892293e-06, + "loss": 0.7258, + "step": 4109 + }, + { + "epoch": 2.0387240908526745, + "grad_norm": 0.13098506524365233, + "learning_rate": 2.429640118353756e-06, + "loss": 0.6948, + "step": 4110 + }, + { + "epoch": 2.0392205535559143, + "grad_norm": 0.1314973323181186, + "learning_rate": 2.4286631612716623e-06, + "loss": 0.6943, + "step": 4111 + }, + { + "epoch": 2.0397170162591536, + "grad_norm": 0.12965431662554677, + "learning_rate": 2.427686215092261e-06, + "loss": 0.6807, + "step": 4112 + }, + { + "epoch": 2.040213478962393, + "grad_norm": 0.13024307058284337, + "learning_rate": 2.426709279964861e-06, + "loss": 0.7308, + "step": 4113 + }, + { + "epoch": 2.0407099416656322, + "grad_norm": 0.13033943720971095, + "learning_rate": 2.425732356038773e-06, + "loss": 0.6952, + "step": 4114 + }, + { + "epoch": 2.0412064043688716, + "grad_norm": 0.12922563196874387, + "learning_rate": 2.424755443463303e-06, + "loss": 0.7066, + "step": 4115 + }, + { + "epoch": 2.0417028670721113, + "grad_norm": 0.13643925819128516, + "learning_rate": 2.4237785423877576e-06, + "loss": 0.7388, + "step": 4116 + }, + { + "epoch": 2.0421993297753507, + "grad_norm": 0.1285865306223446, + "learning_rate": 2.42280165296144e-06, + "loss": 0.753, + "step": 4117 + }, + { + "epoch": 2.04269579247859, + "grad_norm": 0.12885979273302448, + "learning_rate": 2.421824775333652e-06, + "loss": 0.6723, + "step": 4118 + }, + { + "epoch": 2.0431922551818293, + "grad_norm": 0.13298912364217827, + "learning_rate": 2.420847909653693e-06, + "loss": 0.7522, + "step": 4119 + }, + { + "epoch": 2.0436887178850687, + "grad_norm": 0.12918140250053928, + "learning_rate": 2.4198710560708623e-06, + "loss": 0.6887, + "step": 4120 + }, + { + "epoch": 2.0441851805883084, + "grad_norm": 0.1298629305623038, + "learning_rate": 2.4188942147344557e-06, + "loss": 0.6918, + "step": 4121 + }, + { + "epoch": 2.044681643291548, + "grad_norm": 0.1329165561864215, + "learning_rate": 2.4179173857937686e-06, + "loss": 0.7004, + "step": 4122 + }, + { + "epoch": 2.045178105994787, + "grad_norm": 0.13353203604650155, + "learning_rate": 2.4169405693980926e-06, + "loss": 0.6686, + "step": 4123 + }, + { + "epoch": 2.0456745686980264, + "grad_norm": 0.12807440054179015, + "learning_rate": 2.4159637656967185e-06, + "loss": 0.6834, + "step": 4124 + }, + { + "epoch": 2.0461710314012658, + "grad_norm": 0.13435476970975527, + "learning_rate": 2.4149869748389355e-06, + "loss": 0.713, + "step": 4125 + }, + { + "epoch": 2.0466674941045055, + "grad_norm": 0.13449883152953462, + "learning_rate": 2.4140101969740305e-06, + "loss": 0.7422, + "step": 4126 + }, + { + "epoch": 2.047163956807745, + "grad_norm": 0.12884574289334033, + "learning_rate": 2.413033432251289e-06, + "loss": 0.7106, + "step": 4127 + }, + { + "epoch": 2.047660419510984, + "grad_norm": 0.13241382070645977, + "learning_rate": 2.412056680819992e-06, + "loss": 0.7156, + "step": 4128 + }, + { + "epoch": 2.0481568822142235, + "grad_norm": 0.13077843297811131, + "learning_rate": 2.4110799428294214e-06, + "loss": 0.669, + "step": 4129 + }, + { + "epoch": 2.048653344917463, + "grad_norm": 0.12749052274478226, + "learning_rate": 2.4101032184288558e-06, + "loss": 0.6908, + "step": 4130 + }, + { + "epoch": 2.0491498076207026, + "grad_norm": 0.12836437459659322, + "learning_rate": 2.4091265077675716e-06, + "loss": 0.7328, + "step": 4131 + }, + { + "epoch": 2.049646270323942, + "grad_norm": 0.13191584290268923, + "learning_rate": 2.4081498109948435e-06, + "loss": 0.7269, + "step": 4132 + }, + { + "epoch": 2.0501427330271813, + "grad_norm": 0.1336022422875839, + "learning_rate": 2.407173128259945e-06, + "loss": 0.746, + "step": 4133 + }, + { + "epoch": 2.0506391957304206, + "grad_norm": 0.12850600170285384, + "learning_rate": 2.406196459712145e-06, + "loss": 0.6999, + "step": 4134 + }, + { + "epoch": 2.05113565843366, + "grad_norm": 0.1280500632235552, + "learning_rate": 2.4052198055007117e-06, + "loss": 0.7095, + "step": 4135 + }, + { + "epoch": 2.0516321211368997, + "grad_norm": 0.13308799322438067, + "learning_rate": 2.404243165774912e-06, + "loss": 0.7411, + "step": 4136 + }, + { + "epoch": 2.052128583840139, + "grad_norm": 0.12920055964205324, + "learning_rate": 2.4032665406840084e-06, + "loss": 0.6944, + "step": 4137 + }, + { + "epoch": 2.0526250465433784, + "grad_norm": 0.1349853449657708, + "learning_rate": 2.402289930377264e-06, + "loss": 0.7414, + "step": 4138 + }, + { + "epoch": 2.0531215092466177, + "grad_norm": 0.12839001848947515, + "learning_rate": 2.4013133350039366e-06, + "loss": 0.6611, + "step": 4139 + }, + { + "epoch": 2.053617971949857, + "grad_norm": 0.12960386307340557, + "learning_rate": 2.4003367547132833e-06, + "loss": 0.6837, + "step": 4140 + }, + { + "epoch": 2.054114434653097, + "grad_norm": 0.12992168154334086, + "learning_rate": 2.3993601896545593e-06, + "loss": 0.7583, + "step": 4141 + }, + { + "epoch": 2.054610897356336, + "grad_norm": 0.13162250535061173, + "learning_rate": 2.398383639977017e-06, + "loss": 0.6762, + "step": 4142 + }, + { + "epoch": 2.0551073600595755, + "grad_norm": 0.1322234940484418, + "learning_rate": 2.3974071058299063e-06, + "loss": 0.6959, + "step": 4143 + }, + { + "epoch": 2.055603822762815, + "grad_norm": 0.13363673998654405, + "learning_rate": 2.3964305873624748e-06, + "loss": 0.6872, + "step": 4144 + }, + { + "epoch": 2.056100285466054, + "grad_norm": 0.1315605491232442, + "learning_rate": 2.3954540847239663e-06, + "loss": 0.7403, + "step": 4145 + }, + { + "epoch": 2.056596748169294, + "grad_norm": 0.13173975211915448, + "learning_rate": 2.394477598063625e-06, + "loss": 0.7021, + "step": 4146 + }, + { + "epoch": 2.0570932108725333, + "grad_norm": 0.12934541651577286, + "learning_rate": 2.3935011275306907e-06, + "loss": 0.6995, + "step": 4147 + }, + { + "epoch": 2.0575896735757726, + "grad_norm": 0.1337787954167388, + "learning_rate": 2.3925246732744014e-06, + "loss": 0.6788, + "step": 4148 + }, + { + "epoch": 2.058086136279012, + "grad_norm": 0.1409884703911019, + "learning_rate": 2.3915482354439935e-06, + "loss": 0.6618, + "step": 4149 + }, + { + "epoch": 2.0585825989822513, + "grad_norm": 0.13863027742696146, + "learning_rate": 2.390571814188698e-06, + "loss": 0.6907, + "step": 4150 + }, + { + "epoch": 2.059079061685491, + "grad_norm": 0.1264859190593734, + "learning_rate": 2.3895954096577466e-06, + "loss": 0.715, + "step": 4151 + }, + { + "epoch": 2.0595755243887304, + "grad_norm": 0.12838817948703266, + "learning_rate": 2.388619022000366e-06, + "loss": 0.6928, + "step": 4152 + }, + { + "epoch": 2.0600719870919697, + "grad_norm": 0.1382264776368354, + "learning_rate": 2.3876426513657823e-06, + "loss": 0.7077, + "step": 4153 + }, + { + "epoch": 2.060568449795209, + "grad_norm": 0.13252726490660227, + "learning_rate": 2.3866662979032183e-06, + "loss": 0.6868, + "step": 4154 + }, + { + "epoch": 2.0610649124984484, + "grad_norm": 0.13082652966537198, + "learning_rate": 2.385689961761893e-06, + "loss": 0.684, + "step": 4155 + }, + { + "epoch": 2.061561375201688, + "grad_norm": 0.1330519426707675, + "learning_rate": 2.384713643091025e-06, + "loss": 0.6788, + "step": 4156 + }, + { + "epoch": 2.0620578379049275, + "grad_norm": 0.12951338154381573, + "learning_rate": 2.3837373420398274e-06, + "loss": 0.7167, + "step": 4157 + }, + { + "epoch": 2.062554300608167, + "grad_norm": 0.12959226185363074, + "learning_rate": 2.382761058757513e-06, + "loss": 0.6929, + "step": 4158 + }, + { + "epoch": 2.063050763311406, + "grad_norm": 0.13289167957939557, + "learning_rate": 2.381784793393292e-06, + "loss": 0.7145, + "step": 4159 + }, + { + "epoch": 2.0635472260146455, + "grad_norm": 0.13354521709268397, + "learning_rate": 2.3808085460963686e-06, + "loss": 0.6988, + "step": 4160 + }, + { + "epoch": 2.0640436887178852, + "grad_norm": 0.13397190629908656, + "learning_rate": 2.3798323170159487e-06, + "loss": 0.7283, + "step": 4161 + }, + { + "epoch": 2.0645401514211246, + "grad_norm": 0.13299819686134604, + "learning_rate": 2.378856106301232e-06, + "loss": 0.7049, + "step": 4162 + }, + { + "epoch": 2.065036614124364, + "grad_norm": 0.1288123646274204, + "learning_rate": 2.377879914101417e-06, + "loss": 0.7245, + "step": 4163 + }, + { + "epoch": 2.0655330768276032, + "grad_norm": 0.1393615916081668, + "learning_rate": 2.376903740565699e-06, + "loss": 0.6912, + "step": 4164 + }, + { + "epoch": 2.0660295395308426, + "grad_norm": 0.12457000357374992, + "learning_rate": 2.375927585843271e-06, + "loss": 0.6801, + "step": 4165 + }, + { + "epoch": 2.0665260022340823, + "grad_norm": 0.12940223030790443, + "learning_rate": 2.3749514500833218e-06, + "loss": 0.7121, + "step": 4166 + }, + { + "epoch": 2.0670224649373217, + "grad_norm": 0.1382124046186019, + "learning_rate": 2.373975333435038e-06, + "loss": 0.7042, + "step": 4167 + }, + { + "epoch": 2.067518927640561, + "grad_norm": 0.13543223125625606, + "learning_rate": 2.3729992360476047e-06, + "loss": 0.6908, + "step": 4168 + }, + { + "epoch": 2.0680153903438003, + "grad_norm": 0.1367570277824597, + "learning_rate": 2.372023158070201e-06, + "loss": 0.7385, + "step": 4169 + }, + { + "epoch": 2.0685118530470397, + "grad_norm": 0.12658280910237, + "learning_rate": 2.3710470996520067e-06, + "loss": 0.6485, + "step": 4170 + }, + { + "epoch": 2.0690083157502794, + "grad_norm": 0.1318360238221834, + "learning_rate": 2.370071060942195e-06, + "loss": 0.7073, + "step": 4171 + }, + { + "epoch": 2.0695047784535188, + "grad_norm": 0.1355974621244026, + "learning_rate": 2.369095042089938e-06, + "loss": 0.7383, + "step": 4172 + }, + { + "epoch": 2.070001241156758, + "grad_norm": 0.1354835108761855, + "learning_rate": 2.368119043244405e-06, + "loss": 0.7249, + "step": 4173 + }, + { + "epoch": 2.0704977038599974, + "grad_norm": 0.1316834160537684, + "learning_rate": 2.3671430645547622e-06, + "loss": 0.718, + "step": 4174 + }, + { + "epoch": 2.0709941665632368, + "grad_norm": 0.1337398224105994, + "learning_rate": 2.3661671061701725e-06, + "loss": 0.7617, + "step": 4175 + }, + { + "epoch": 2.0714906292664765, + "grad_norm": 0.13071098977935075, + "learning_rate": 2.3651911682397937e-06, + "loss": 0.7205, + "step": 4176 + }, + { + "epoch": 2.071987091969716, + "grad_norm": 0.13108332634784578, + "learning_rate": 2.3642152509127837e-06, + "loss": 0.7222, + "step": 4177 + }, + { + "epoch": 2.072483554672955, + "grad_norm": 0.12917550227439295, + "learning_rate": 2.363239354338295e-06, + "loss": 0.6863, + "step": 4178 + }, + { + "epoch": 2.0729800173761945, + "grad_norm": 0.13058903950875422, + "learning_rate": 2.3622634786654787e-06, + "loss": 0.741, + "step": 4179 + }, + { + "epoch": 2.073476480079434, + "grad_norm": 0.1317101832126491, + "learning_rate": 2.361287624043481e-06, + "loss": 0.69, + "step": 4180 + }, + { + "epoch": 2.0739729427826736, + "grad_norm": 0.13179285865877882, + "learning_rate": 2.3603117906214463e-06, + "loss": 0.6874, + "step": 4181 + }, + { + "epoch": 2.074469405485913, + "grad_norm": 0.13074448118330892, + "learning_rate": 2.3593359785485143e-06, + "loss": 0.7324, + "step": 4182 + }, + { + "epoch": 2.0749658681891523, + "grad_norm": 0.13408000673376183, + "learning_rate": 2.358360187973822e-06, + "loss": 0.7242, + "step": 4183 + }, + { + "epoch": 2.0754623308923916, + "grad_norm": 0.13011307891286955, + "learning_rate": 2.3573844190465046e-06, + "loss": 0.6941, + "step": 4184 + }, + { + "epoch": 2.075958793595631, + "grad_norm": 0.12572911313821467, + "learning_rate": 2.356408671915692e-06, + "loss": 0.6861, + "step": 4185 + }, + { + "epoch": 2.0764552562988707, + "grad_norm": 0.13279434671974327, + "learning_rate": 2.355432946730512e-06, + "loss": 0.7343, + "step": 4186 + }, + { + "epoch": 2.07695171900211, + "grad_norm": 0.1375180906665279, + "learning_rate": 2.3544572436400876e-06, + "loss": 0.7285, + "step": 4187 + }, + { + "epoch": 2.0774481817053494, + "grad_norm": 0.13344354731066305, + "learning_rate": 2.3534815627935397e-06, + "loss": 0.7002, + "step": 4188 + }, + { + "epoch": 2.0779446444085887, + "grad_norm": 0.12763100455109364, + "learning_rate": 2.352505904339986e-06, + "loss": 0.6759, + "step": 4189 + }, + { + "epoch": 2.078441107111828, + "grad_norm": 0.12628191599075542, + "learning_rate": 2.3515302684285398e-06, + "loss": 0.6725, + "step": 4190 + }, + { + "epoch": 2.078937569815068, + "grad_norm": 0.13183736319992664, + "learning_rate": 2.350554655208313e-06, + "loss": 0.6876, + "step": 4191 + }, + { + "epoch": 2.079434032518307, + "grad_norm": 0.1326348427538915, + "learning_rate": 2.349579064828409e-06, + "loss": 0.7015, + "step": 4192 + }, + { + "epoch": 2.0799304952215465, + "grad_norm": 0.12900392788556156, + "learning_rate": 2.3486034974379344e-06, + "loss": 0.6592, + "step": 4193 + }, + { + "epoch": 2.080426957924786, + "grad_norm": 0.12916628410416808, + "learning_rate": 2.347627953185987e-06, + "loss": 0.6965, + "step": 4194 + }, + { + "epoch": 2.080923420628025, + "grad_norm": 0.13911006307022702, + "learning_rate": 2.346652432221664e-06, + "loss": 0.7477, + "step": 4195 + }, + { + "epoch": 2.081419883331265, + "grad_norm": 0.12991109186278407, + "learning_rate": 2.3456769346940578e-06, + "loss": 0.7108, + "step": 4196 + }, + { + "epoch": 2.0819163460345043, + "grad_norm": 0.1310175516830794, + "learning_rate": 2.344701460752258e-06, + "loss": 0.72, + "step": 4197 + }, + { + "epoch": 2.0824128087377436, + "grad_norm": 0.1302438861387863, + "learning_rate": 2.34372601054535e-06, + "loss": 0.7045, + "step": 4198 + }, + { + "epoch": 2.082909271440983, + "grad_norm": 0.131287589350185, + "learning_rate": 2.3427505842224153e-06, + "loss": 0.6859, + "step": 4199 + }, + { + "epoch": 2.0834057341442223, + "grad_norm": 0.13636020184165526, + "learning_rate": 2.3417751819325327e-06, + "loss": 0.7343, + "step": 4200 + }, + { + "epoch": 2.083902196847462, + "grad_norm": 0.13036105078338428, + "learning_rate": 2.3407998038247767e-06, + "loss": 0.6583, + "step": 4201 + }, + { + "epoch": 2.0843986595507014, + "grad_norm": 0.13703412523752168, + "learning_rate": 2.339824450048218e-06, + "loss": 0.6875, + "step": 4202 + }, + { + "epoch": 2.0848951222539407, + "grad_norm": 0.1273783968662836, + "learning_rate": 2.3388491207519237e-06, + "loss": 0.6739, + "step": 4203 + }, + { + "epoch": 2.08539158495718, + "grad_norm": 0.1293141936432381, + "learning_rate": 2.337873816084958e-06, + "loss": 0.6807, + "step": 4204 + }, + { + "epoch": 2.0858880476604194, + "grad_norm": 0.13247637384139838, + "learning_rate": 2.3368985361963793e-06, + "loss": 0.7367, + "step": 4205 + }, + { + "epoch": 2.086384510363659, + "grad_norm": 0.12888079453701196, + "learning_rate": 2.3359232812352444e-06, + "loss": 0.6811, + "step": 4206 + }, + { + "epoch": 2.0868809730668985, + "grad_norm": 0.14305169239978224, + "learning_rate": 2.334948051350606e-06, + "loss": 0.6632, + "step": 4207 + }, + { + "epoch": 2.087377435770138, + "grad_norm": 0.13761847185157933, + "learning_rate": 2.333972846691511e-06, + "loss": 0.7505, + "step": 4208 + }, + { + "epoch": 2.087873898473377, + "grad_norm": 0.13472353385622, + "learning_rate": 2.3329976674070045e-06, + "loss": 0.7507, + "step": 4209 + }, + { + "epoch": 2.0883703611766165, + "grad_norm": 0.1332250195890825, + "learning_rate": 2.332022513646126e-06, + "loss": 0.686, + "step": 4210 + }, + { + "epoch": 2.0888668238798562, + "grad_norm": 0.13094514564456525, + "learning_rate": 2.331047385557914e-06, + "loss": 0.6997, + "step": 4211 + }, + { + "epoch": 2.0893632865830956, + "grad_norm": 0.13537482547078766, + "learning_rate": 2.3300722832913995e-06, + "loss": 0.6934, + "step": 4212 + }, + { + "epoch": 2.089859749286335, + "grad_norm": 0.13110966344634223, + "learning_rate": 2.329097206995612e-06, + "loss": 0.701, + "step": 4213 + }, + { + "epoch": 2.090356211989574, + "grad_norm": 0.13289972655864118, + "learning_rate": 2.3281221568195765e-06, + "loss": 0.6511, + "step": 4214 + }, + { + "epoch": 2.0908526746928136, + "grad_norm": 0.13256649712938065, + "learning_rate": 2.3271471329123134e-06, + "loss": 0.7757, + "step": 4215 + }, + { + "epoch": 2.0913491373960533, + "grad_norm": 0.12718273334445213, + "learning_rate": 2.3261721354228394e-06, + "loss": 0.6746, + "step": 4216 + }, + { + "epoch": 2.0918456000992927, + "grad_norm": 0.1279366898208473, + "learning_rate": 2.325197164500167e-06, + "loss": 0.7287, + "step": 4217 + }, + { + "epoch": 2.092342062802532, + "grad_norm": 0.1267772446319626, + "learning_rate": 2.324222220293306e-06, + "loss": 0.7235, + "step": 4218 + }, + { + "epoch": 2.0928385255057713, + "grad_norm": 0.12657392759783875, + "learning_rate": 2.3232473029512603e-06, + "loss": 0.67, + "step": 4219 + }, + { + "epoch": 2.0933349882090106, + "grad_norm": 0.13143006626346396, + "learning_rate": 2.3222724126230296e-06, + "loss": 0.6967, + "step": 4220 + }, + { + "epoch": 2.0938314509122504, + "grad_norm": 0.13250048306785056, + "learning_rate": 2.3212975494576113e-06, + "loss": 0.6659, + "step": 4221 + }, + { + "epoch": 2.0943279136154898, + "grad_norm": 0.1304492685665225, + "learning_rate": 2.3203227136039973e-06, + "loss": 0.754, + "step": 4222 + }, + { + "epoch": 2.094824376318729, + "grad_norm": 0.14113302529355157, + "learning_rate": 2.3193479052111763e-06, + "loss": 0.7788, + "step": 4223 + }, + { + "epoch": 2.0953208390219684, + "grad_norm": 0.12947346258330397, + "learning_rate": 2.3183731244281307e-06, + "loss": 0.6974, + "step": 4224 + }, + { + "epoch": 2.0958173017252077, + "grad_norm": 0.12969243044770545, + "learning_rate": 2.3173983714038413e-06, + "loss": 0.6881, + "step": 4225 + }, + { + "epoch": 2.0963137644284475, + "grad_norm": 0.12805474797723002, + "learning_rate": 2.3164236462872826e-06, + "loss": 0.6631, + "step": 4226 + }, + { + "epoch": 2.096810227131687, + "grad_norm": 0.12991846472801588, + "learning_rate": 2.315448949227426e-06, + "loss": 0.7117, + "step": 4227 + }, + { + "epoch": 2.097306689834926, + "grad_norm": 0.12737529203692569, + "learning_rate": 2.3144742803732386e-06, + "loss": 0.6619, + "step": 4228 + }, + { + "epoch": 2.0978031525381655, + "grad_norm": 0.1348388296052507, + "learning_rate": 2.3134996398736827e-06, + "loss": 0.6702, + "step": 4229 + }, + { + "epoch": 2.098299615241405, + "grad_norm": 0.12949314939302115, + "learning_rate": 2.3125250278777173e-06, + "loss": 0.6692, + "step": 4230 + }, + { + "epoch": 2.0987960779446446, + "grad_norm": 0.1319653430303982, + "learning_rate": 2.3115504445342947e-06, + "loss": 0.7186, + "step": 4231 + }, + { + "epoch": 2.099292540647884, + "grad_norm": 0.12754394160183852, + "learning_rate": 2.3105758899923652e-06, + "loss": 0.6983, + "step": 4232 + }, + { + "epoch": 2.0997890033511233, + "grad_norm": 0.13105593757086795, + "learning_rate": 2.3096013644008738e-06, + "loss": 0.7449, + "step": 4233 + }, + { + "epoch": 2.1002854660543626, + "grad_norm": 0.1352001782140455, + "learning_rate": 2.308626867908761e-06, + "loss": 0.7227, + "step": 4234 + }, + { + "epoch": 2.100781928757602, + "grad_norm": 0.1291811312002174, + "learning_rate": 2.3076524006649633e-06, + "loss": 0.6668, + "step": 4235 + }, + { + "epoch": 2.1012783914608413, + "grad_norm": 0.13013537752074916, + "learning_rate": 2.3066779628184115e-06, + "loss": 0.6691, + "step": 4236 + }, + { + "epoch": 2.101774854164081, + "grad_norm": 0.13054657602863368, + "learning_rate": 2.3057035545180337e-06, + "loss": 0.6978, + "step": 4237 + }, + { + "epoch": 2.1022713168673204, + "grad_norm": 0.12839328024574606, + "learning_rate": 2.3047291759127523e-06, + "loss": 0.6554, + "step": 4238 + }, + { + "epoch": 2.1027677795705597, + "grad_norm": 0.12996764440677216, + "learning_rate": 2.3037548271514863e-06, + "loss": 0.7042, + "step": 4239 + }, + { + "epoch": 2.103264242273799, + "grad_norm": 0.13523019374688494, + "learning_rate": 2.3027805083831475e-06, + "loss": 0.7477, + "step": 4240 + }, + { + "epoch": 2.103760704977039, + "grad_norm": 0.13283035054495967, + "learning_rate": 2.3018062197566464e-06, + "loss": 0.7262, + "step": 4241 + }, + { + "epoch": 2.104257167680278, + "grad_norm": 0.12953670758307892, + "learning_rate": 2.3008319614208863e-06, + "loss": 0.723, + "step": 4242 + }, + { + "epoch": 2.1047536303835175, + "grad_norm": 0.13144160332075827, + "learning_rate": 2.299857733524767e-06, + "loss": 0.7333, + "step": 4243 + }, + { + "epoch": 2.105250093086757, + "grad_norm": 0.12885131945661563, + "learning_rate": 2.2988835362171845e-06, + "loss": 0.736, + "step": 4244 + }, + { + "epoch": 2.105746555789996, + "grad_norm": 0.13082507084952819, + "learning_rate": 2.297909369647029e-06, + "loss": 0.7058, + "step": 4245 + }, + { + "epoch": 2.1062430184932355, + "grad_norm": 0.1308644150778887, + "learning_rate": 2.2969352339631864e-06, + "loss": 0.7137, + "step": 4246 + }, + { + "epoch": 2.1067394811964752, + "grad_norm": 0.13130767461578424, + "learning_rate": 2.295961129314537e-06, + "loss": 0.6985, + "step": 4247 + }, + { + "epoch": 2.1072359438997146, + "grad_norm": 0.13689384112058248, + "learning_rate": 2.294987055849957e-06, + "loss": 0.7249, + "step": 4248 + }, + { + "epoch": 2.107732406602954, + "grad_norm": 0.13528534527442918, + "learning_rate": 2.2940130137183187e-06, + "loss": 0.7141, + "step": 4249 + }, + { + "epoch": 2.1082288693061932, + "grad_norm": 0.1367594631782608, + "learning_rate": 2.2930390030684886e-06, + "loss": 0.7424, + "step": 4250 + }, + { + "epoch": 2.108725332009433, + "grad_norm": 0.13326510053921575, + "learning_rate": 2.292065024049329e-06, + "loss": 0.732, + "step": 4251 + }, + { + "epoch": 2.1092217947126723, + "grad_norm": 0.13486667743727, + "learning_rate": 2.291091076809696e-06, + "loss": 0.7336, + "step": 4252 + }, + { + "epoch": 2.1097182574159117, + "grad_norm": 0.1342722797696259, + "learning_rate": 2.2901171614984422e-06, + "loss": 0.7149, + "step": 4253 + }, + { + "epoch": 2.110214720119151, + "grad_norm": 0.12916009534662892, + "learning_rate": 2.2891432782644148e-06, + "loss": 0.6769, + "step": 4254 + }, + { + "epoch": 2.1107111828223903, + "grad_norm": 0.12895780049088768, + "learning_rate": 2.288169427256458e-06, + "loss": 0.6941, + "step": 4255 + }, + { + "epoch": 2.1112076455256297, + "grad_norm": 0.13336065989780396, + "learning_rate": 2.2871956086234066e-06, + "loss": 0.7312, + "step": 4256 + }, + { + "epoch": 2.1117041082288694, + "grad_norm": 0.1311287040733774, + "learning_rate": 2.2862218225140948e-06, + "loss": 0.6849, + "step": 4257 + }, + { + "epoch": 2.112200570932109, + "grad_norm": 0.1326013584545713, + "learning_rate": 2.2852480690773493e-06, + "loss": 0.7127, + "step": 4258 + }, + { + "epoch": 2.112697033635348, + "grad_norm": 0.1293700176479527, + "learning_rate": 2.284274348461993e-06, + "loss": 0.6787, + "step": 4259 + }, + { + "epoch": 2.1131934963385874, + "grad_norm": 0.1270172759070305, + "learning_rate": 2.283300660816844e-06, + "loss": 0.7109, + "step": 4260 + }, + { + "epoch": 2.1136899590418268, + "grad_norm": 0.12987078013128955, + "learning_rate": 2.282327006290714e-06, + "loss": 0.6754, + "step": 4261 + }, + { + "epoch": 2.1141864217450665, + "grad_norm": 0.1311168878775037, + "learning_rate": 2.281353385032412e-06, + "loss": 0.6938, + "step": 4262 + }, + { + "epoch": 2.114682884448306, + "grad_norm": 0.13308422505081596, + "learning_rate": 2.2803797971907384e-06, + "loss": 0.7151, + "step": 4263 + }, + { + "epoch": 2.115179347151545, + "grad_norm": 0.1305774573301847, + "learning_rate": 2.2794062429144916e-06, + "loss": 0.7384, + "step": 4264 + }, + { + "epoch": 2.1156758098547845, + "grad_norm": 0.13262558018414852, + "learning_rate": 2.278432722352464e-06, + "loss": 0.7158, + "step": 4265 + }, + { + "epoch": 2.116172272558024, + "grad_norm": 0.1321143377532212, + "learning_rate": 2.2774592356534417e-06, + "loss": 0.644, + "step": 4266 + }, + { + "epoch": 2.1166687352612636, + "grad_norm": 0.13040322905026952, + "learning_rate": 2.276485782966208e-06, + "loss": 0.6894, + "step": 4267 + }, + { + "epoch": 2.117165197964503, + "grad_norm": 0.14208837528045287, + "learning_rate": 2.2755123644395377e-06, + "loss": 0.727, + "step": 4268 + }, + { + "epoch": 2.1176616606677423, + "grad_norm": 0.14092145083010946, + "learning_rate": 2.2745389802222034e-06, + "loss": 0.7016, + "step": 4269 + }, + { + "epoch": 2.1181581233709816, + "grad_norm": 0.12913632924933782, + "learning_rate": 2.2735656304629706e-06, + "loss": 0.7004, + "step": 4270 + }, + { + "epoch": 2.118654586074221, + "grad_norm": 0.1334111014407336, + "learning_rate": 2.272592315310601e-06, + "loss": 0.7373, + "step": 4271 + }, + { + "epoch": 2.1191510487774607, + "grad_norm": 0.13363016697183605, + "learning_rate": 2.2716190349138505e-06, + "loss": 0.7042, + "step": 4272 + }, + { + "epoch": 2.1196475114807, + "grad_norm": 0.13210258534803404, + "learning_rate": 2.270645789421468e-06, + "loss": 0.7621, + "step": 4273 + }, + { + "epoch": 2.1201439741839394, + "grad_norm": 0.13777719715311174, + "learning_rate": 2.2696725789821994e-06, + "loss": 0.7225, + "step": 4274 + }, + { + "epoch": 2.1206404368871787, + "grad_norm": 0.1308204005740167, + "learning_rate": 2.268699403744784e-06, + "loss": 0.7062, + "step": 4275 + }, + { + "epoch": 2.121136899590418, + "grad_norm": 0.12957459770630428, + "learning_rate": 2.2677262638579554e-06, + "loss": 0.6877, + "step": 4276 + }, + { + "epoch": 2.121633362293658, + "grad_norm": 0.13016124393616152, + "learning_rate": 2.266753159470444e-06, + "loss": 0.6826, + "step": 4277 + }, + { + "epoch": 2.122129824996897, + "grad_norm": 0.12806205702427592, + "learning_rate": 2.265780090730972e-06, + "loss": 0.645, + "step": 4278 + }, + { + "epoch": 2.1226262877001365, + "grad_norm": 0.1337185042729098, + "learning_rate": 2.2648070577882573e-06, + "loss": 0.7274, + "step": 4279 + }, + { + "epoch": 2.123122750403376, + "grad_norm": 0.13067486282767551, + "learning_rate": 2.2638340607910133e-06, + "loss": 0.7087, + "step": 4280 + }, + { + "epoch": 2.123619213106615, + "grad_norm": 0.13386479780042845, + "learning_rate": 2.2628610998879463e-06, + "loss": 0.7024, + "step": 4281 + }, + { + "epoch": 2.124115675809855, + "grad_norm": 0.13272729071166328, + "learning_rate": 2.261888175227758e-06, + "loss": 0.7484, + "step": 4282 + }, + { + "epoch": 2.1246121385130943, + "grad_norm": 0.1310493554217813, + "learning_rate": 2.2609152869591445e-06, + "loss": 0.651, + "step": 4283 + }, + { + "epoch": 2.1251086012163336, + "grad_norm": 0.1316640668680429, + "learning_rate": 2.2599424352307958e-06, + "loss": 0.755, + "step": 4284 + }, + { + "epoch": 2.125605063919573, + "grad_norm": 0.12948769043340413, + "learning_rate": 2.2589696201913966e-06, + "loss": 0.7538, + "step": 4285 + }, + { + "epoch": 2.1261015266228123, + "grad_norm": 0.1259209524836695, + "learning_rate": 2.2579968419896264e-06, + "loss": 0.7165, + "step": 4286 + }, + { + "epoch": 2.126597989326052, + "grad_norm": 0.1274147611602142, + "learning_rate": 2.257024100774159e-06, + "loss": 0.7053, + "step": 4287 + }, + { + "epoch": 2.1270944520292914, + "grad_norm": 0.1273913753461934, + "learning_rate": 2.2560513966936626e-06, + "loss": 0.6987, + "step": 4288 + }, + { + "epoch": 2.1275909147325307, + "grad_norm": 0.17368980947168566, + "learning_rate": 2.255078729896798e-06, + "loss": 0.6387, + "step": 4289 + }, + { + "epoch": 2.12808737743577, + "grad_norm": 0.12393658684142564, + "learning_rate": 2.254106100532223e-06, + "loss": 0.673, + "step": 4290 + }, + { + "epoch": 2.1285838401390094, + "grad_norm": 0.12933453592109587, + "learning_rate": 2.253133508748587e-06, + "loss": 0.7271, + "step": 4291 + }, + { + "epoch": 2.129080302842249, + "grad_norm": 0.1343441282973702, + "learning_rate": 2.252160954694536e-06, + "loss": 0.6757, + "step": 4292 + }, + { + "epoch": 2.1295767655454885, + "grad_norm": 0.13289063203957274, + "learning_rate": 2.2511884385187098e-06, + "loss": 0.735, + "step": 4293 + }, + { + "epoch": 2.130073228248728, + "grad_norm": 0.12926554854547287, + "learning_rate": 2.250215960369741e-06, + "loss": 0.6849, + "step": 4294 + }, + { + "epoch": 2.130569690951967, + "grad_norm": 0.13121955352587697, + "learning_rate": 2.2492435203962584e-06, + "loss": 0.6844, + "step": 4295 + }, + { + "epoch": 2.1310661536552065, + "grad_norm": 0.13785478772610635, + "learning_rate": 2.2482711187468825e-06, + "loss": 0.6656, + "step": 4296 + }, + { + "epoch": 2.1315626163584462, + "grad_norm": 0.13675030797419835, + "learning_rate": 2.24729875557023e-06, + "loss": 0.7755, + "step": 4297 + }, + { + "epoch": 2.1320590790616856, + "grad_norm": 0.13218540295978803, + "learning_rate": 2.246326431014911e-06, + "loss": 0.7865, + "step": 4298 + }, + { + "epoch": 2.132555541764925, + "grad_norm": 0.13058917038281245, + "learning_rate": 2.2453541452295304e-06, + "loss": 0.6822, + "step": 4299 + }, + { + "epoch": 2.1330520044681642, + "grad_norm": 0.1356025414078166, + "learning_rate": 2.2443818983626845e-06, + "loss": 0.7416, + "step": 4300 + }, + { + "epoch": 2.1335484671714036, + "grad_norm": 0.13259643816943026, + "learning_rate": 2.2434096905629675e-06, + "loss": 0.7331, + "step": 4301 + }, + { + "epoch": 2.1340449298746433, + "grad_norm": 0.13178426485776873, + "learning_rate": 2.242437521978965e-06, + "loss": 0.7416, + "step": 4302 + }, + { + "epoch": 2.1345413925778827, + "grad_norm": 0.12712500731582294, + "learning_rate": 2.2414653927592578e-06, + "loss": 0.6676, + "step": 4303 + }, + { + "epoch": 2.135037855281122, + "grad_norm": 0.132762631536009, + "learning_rate": 2.240493303052421e-06, + "loss": 0.7412, + "step": 4304 + }, + { + "epoch": 2.1355343179843613, + "grad_norm": 0.13927370521334112, + "learning_rate": 2.239521253007021e-06, + "loss": 0.7256, + "step": 4305 + }, + { + "epoch": 2.1360307806876007, + "grad_norm": 0.1298913459257255, + "learning_rate": 2.238549242771621e-06, + "loss": 0.726, + "step": 4306 + }, + { + "epoch": 2.1365272433908404, + "grad_norm": 0.13103490851669664, + "learning_rate": 2.237577272494777e-06, + "loss": 0.7101, + "step": 4307 + }, + { + "epoch": 2.1370237060940798, + "grad_norm": 0.13052143851126424, + "learning_rate": 2.2366053423250396e-06, + "loss": 0.7063, + "step": 4308 + }, + { + "epoch": 2.137520168797319, + "grad_norm": 0.1325952941973161, + "learning_rate": 2.235633452410952e-06, + "loss": 0.7102, + "step": 4309 + }, + { + "epoch": 2.1380166315005584, + "grad_norm": 0.13151130162140853, + "learning_rate": 2.2346616029010527e-06, + "loss": 0.72, + "step": 4310 + }, + { + "epoch": 2.1385130942037978, + "grad_norm": 0.13684162828630525, + "learning_rate": 2.2336897939438734e-06, + "loss": 0.7484, + "step": 4311 + }, + { + "epoch": 2.1390095569070375, + "grad_norm": 0.13016064599020155, + "learning_rate": 2.2327180256879384e-06, + "loss": 0.7219, + "step": 4312 + }, + { + "epoch": 2.139506019610277, + "grad_norm": 0.13271882669694315, + "learning_rate": 2.231746298281768e-06, + "loss": 0.7285, + "step": 4313 + }, + { + "epoch": 2.140002482313516, + "grad_norm": 0.12493929654212456, + "learning_rate": 2.230774611873875e-06, + "loss": 0.6597, + "step": 4314 + }, + { + "epoch": 2.1404989450167555, + "grad_norm": 0.1270152459435393, + "learning_rate": 2.2298029666127654e-06, + "loss": 0.6268, + "step": 4315 + }, + { + "epoch": 2.140995407719995, + "grad_norm": 0.13120756963636904, + "learning_rate": 2.2288313626469403e-06, + "loss": 0.7737, + "step": 4316 + }, + { + "epoch": 2.1414918704232346, + "grad_norm": 0.13093118304542808, + "learning_rate": 2.2278598001248935e-06, + "loss": 0.7182, + "step": 4317 + }, + { + "epoch": 2.141988333126474, + "grad_norm": 0.13406739346906132, + "learning_rate": 2.2268882791951125e-06, + "loss": 0.6923, + "step": 4318 + }, + { + "epoch": 2.1424847958297133, + "grad_norm": 0.1321265210500075, + "learning_rate": 2.2259168000060793e-06, + "loss": 0.7341, + "step": 4319 + }, + { + "epoch": 2.1429812585329526, + "grad_norm": 0.1297032816599765, + "learning_rate": 2.2249453627062697e-06, + "loss": 0.6591, + "step": 4320 + }, + { + "epoch": 2.143477721236192, + "grad_norm": 0.13306620569232747, + "learning_rate": 2.22397396744415e-06, + "loss": 0.7009, + "step": 4321 + }, + { + "epoch": 2.1439741839394317, + "grad_norm": 0.1315841117210727, + "learning_rate": 2.223002614368184e-06, + "loss": 0.693, + "step": 4322 + }, + { + "epoch": 2.144470646642671, + "grad_norm": 0.13522931404154925, + "learning_rate": 2.2220313036268275e-06, + "loss": 0.7885, + "step": 4323 + }, + { + "epoch": 2.1449671093459104, + "grad_norm": 0.13000494547408423, + "learning_rate": 2.2210600353685286e-06, + "loss": 0.7126, + "step": 4324 + }, + { + "epoch": 2.1454635720491497, + "grad_norm": 0.1331174145801834, + "learning_rate": 2.2200888097417308e-06, + "loss": 0.7385, + "step": 4325 + }, + { + "epoch": 2.145960034752389, + "grad_norm": 0.13199478030805042, + "learning_rate": 2.2191176268948707e-06, + "loss": 0.6859, + "step": 4326 + }, + { + "epoch": 2.146456497455629, + "grad_norm": 0.12924843473690692, + "learning_rate": 2.2181464869763777e-06, + "loss": 0.7077, + "step": 4327 + }, + { + "epoch": 2.146952960158868, + "grad_norm": 0.13383047347606078, + "learning_rate": 2.217175390134675e-06, + "loss": 0.7301, + "step": 4328 + }, + { + "epoch": 2.1474494228621075, + "grad_norm": 0.1301216163563912, + "learning_rate": 2.216204336518179e-06, + "loss": 0.7075, + "step": 4329 + }, + { + "epoch": 2.147945885565347, + "grad_norm": 0.13513976833741134, + "learning_rate": 2.2152333262752996e-06, + "loss": 0.7069, + "step": 4330 + }, + { + "epoch": 2.148442348268586, + "grad_norm": 0.12921645151422304, + "learning_rate": 2.2142623595544407e-06, + "loss": 0.7121, + "step": 4331 + }, + { + "epoch": 2.148938810971826, + "grad_norm": 0.134789032200641, + "learning_rate": 2.2132914365039993e-06, + "loss": 0.7279, + "step": 4332 + }, + { + "epoch": 2.1494352736750653, + "grad_norm": 0.12767095954972132, + "learning_rate": 2.2123205572723638e-06, + "loss": 0.6819, + "step": 4333 + }, + { + "epoch": 2.1499317363783046, + "grad_norm": 0.132607654917447, + "learning_rate": 2.211349722007919e-06, + "loss": 0.7035, + "step": 4334 + }, + { + "epoch": 2.150428199081544, + "grad_norm": 0.13081966486435517, + "learning_rate": 2.210378930859041e-06, + "loss": 0.7125, + "step": 4335 + }, + { + "epoch": 2.1509246617847833, + "grad_norm": 0.13392118968950856, + "learning_rate": 2.2094081839741007e-06, + "loss": 0.6874, + "step": 4336 + }, + { + "epoch": 2.151421124488023, + "grad_norm": 0.13563928410684517, + "learning_rate": 2.208437481501459e-06, + "loss": 0.7077, + "step": 4337 + }, + { + "epoch": 2.1519175871912624, + "grad_norm": 0.13102290237210468, + "learning_rate": 2.207466823589474e-06, + "loss": 0.7314, + "step": 4338 + }, + { + "epoch": 2.1524140498945017, + "grad_norm": 0.1308681684431675, + "learning_rate": 2.206496210386494e-06, + "loss": 0.6915, + "step": 4339 + }, + { + "epoch": 2.152910512597741, + "grad_norm": 0.13474477124109288, + "learning_rate": 2.2055256420408625e-06, + "loss": 0.7038, + "step": 4340 + }, + { + "epoch": 2.1534069753009804, + "grad_norm": 0.13030093914916455, + "learning_rate": 2.2045551187009147e-06, + "loss": 0.7066, + "step": 4341 + }, + { + "epoch": 2.15390343800422, + "grad_norm": 0.12838709690946015, + "learning_rate": 2.20358464051498e-06, + "loss": 0.6839, + "step": 4342 + }, + { + "epoch": 2.1543999007074595, + "grad_norm": 0.13054218481985586, + "learning_rate": 2.2026142076313816e-06, + "loss": 0.7189, + "step": 4343 + }, + { + "epoch": 2.154896363410699, + "grad_norm": 0.1329355796495672, + "learning_rate": 2.201643820198432e-06, + "loss": 0.7484, + "step": 4344 + }, + { + "epoch": 2.155392826113938, + "grad_norm": 0.12685915768729691, + "learning_rate": 2.2006734783644407e-06, + "loss": 0.6744, + "step": 4345 + }, + { + "epoch": 2.1558892888171775, + "grad_norm": 0.12969948887272129, + "learning_rate": 2.1997031822777093e-06, + "loss": 0.6559, + "step": 4346 + }, + { + "epoch": 2.1563857515204172, + "grad_norm": 0.12690777845885462, + "learning_rate": 2.1987329320865314e-06, + "loss": 0.6719, + "step": 4347 + }, + { + "epoch": 2.1568822142236566, + "grad_norm": 0.13746314892957628, + "learning_rate": 2.197762727939195e-06, + "loss": 0.7068, + "step": 4348 + }, + { + "epoch": 2.157378676926896, + "grad_norm": 0.13974129132766783, + "learning_rate": 2.196792569983979e-06, + "loss": 0.7702, + "step": 4349 + }, + { + "epoch": 2.157875139630135, + "grad_norm": 0.13205901471380596, + "learning_rate": 2.195822458369157e-06, + "loss": 0.6789, + "step": 4350 + }, + { + "epoch": 2.1583716023333746, + "grad_norm": 0.13151573181044662, + "learning_rate": 2.194852393242995e-06, + "loss": 0.6997, + "step": 4351 + }, + { + "epoch": 2.1588680650366143, + "grad_norm": 0.1311550041985863, + "learning_rate": 2.1938823747537535e-06, + "loss": 0.7186, + "step": 4352 + }, + { + "epoch": 2.1593645277398537, + "grad_norm": 0.12785154545568428, + "learning_rate": 2.192912403049682e-06, + "loss": 0.7337, + "step": 4353 + }, + { + "epoch": 2.159860990443093, + "grad_norm": 0.1384165497626903, + "learning_rate": 2.1919424782790256e-06, + "loss": 0.7423, + "step": 4354 + }, + { + "epoch": 2.1603574531463323, + "grad_norm": 0.1326394817355734, + "learning_rate": 2.1909726005900224e-06, + "loss": 0.7139, + "step": 4355 + }, + { + "epoch": 2.1608539158495716, + "grad_norm": 0.1332787692360546, + "learning_rate": 2.1900027701309016e-06, + "loss": 0.7198, + "step": 4356 + }, + { + "epoch": 2.1613503785528114, + "grad_norm": 0.13058676686648546, + "learning_rate": 2.1890329870498873e-06, + "loss": 0.6919, + "step": 4357 + }, + { + "epoch": 2.1618468412560508, + "grad_norm": 0.12920400653725772, + "learning_rate": 2.188063251495195e-06, + "loss": 0.7204, + "step": 4358 + }, + { + "epoch": 2.16234330395929, + "grad_norm": 0.12727242356432697, + "learning_rate": 2.187093563615034e-06, + "loss": 0.7072, + "step": 4359 + }, + { + "epoch": 2.1628397666625294, + "grad_norm": 0.12536367743791044, + "learning_rate": 2.1861239235576033e-06, + "loss": 0.6552, + "step": 4360 + }, + { + "epoch": 2.1633362293657687, + "grad_norm": 0.12786174220800478, + "learning_rate": 2.185154331471099e-06, + "loss": 0.6844, + "step": 4361 + }, + { + "epoch": 2.1638326920690085, + "grad_norm": 0.13320070492782934, + "learning_rate": 2.1841847875037065e-06, + "loss": 0.7398, + "step": 4362 + }, + { + "epoch": 2.164329154772248, + "grad_norm": 0.1388558108961887, + "learning_rate": 2.1832152918036058e-06, + "loss": 0.7204, + "step": 4363 + }, + { + "epoch": 2.164825617475487, + "grad_norm": 0.12693695099985525, + "learning_rate": 2.182245844518969e-06, + "loss": 0.677, + "step": 4364 + }, + { + "epoch": 2.1653220801787265, + "grad_norm": 0.12924014986291152, + "learning_rate": 2.1812764457979594e-06, + "loss": 0.6683, + "step": 4365 + }, + { + "epoch": 2.165818542881966, + "grad_norm": 0.13159270443065568, + "learning_rate": 2.1803070957887348e-06, + "loss": 0.7281, + "step": 4366 + }, + { + "epoch": 2.166315005585205, + "grad_norm": 0.13127772161024706, + "learning_rate": 2.1793377946394448e-06, + "loss": 0.6727, + "step": 4367 + }, + { + "epoch": 2.166811468288445, + "grad_norm": 0.12854307859962671, + "learning_rate": 2.1783685424982326e-06, + "loss": 0.6921, + "step": 4368 + }, + { + "epoch": 2.1673079309916843, + "grad_norm": 0.13019316986612073, + "learning_rate": 2.1773993395132303e-06, + "loss": 0.6875, + "step": 4369 + }, + { + "epoch": 2.1678043936949236, + "grad_norm": 0.1353274833386413, + "learning_rate": 2.176430185832567e-06, + "loss": 0.7027, + "step": 4370 + }, + { + "epoch": 2.168300856398163, + "grad_norm": 0.13265862920248359, + "learning_rate": 2.175461081604362e-06, + "loss": 0.6899, + "step": 4371 + }, + { + "epoch": 2.1687973191014027, + "grad_norm": 0.13334516546543757, + "learning_rate": 2.1744920269767266e-06, + "loss": 0.6968, + "step": 4372 + }, + { + "epoch": 2.169293781804642, + "grad_norm": 0.13704956210912356, + "learning_rate": 2.1735230220977653e-06, + "loss": 0.7129, + "step": 4373 + }, + { + "epoch": 2.1697902445078814, + "grad_norm": 0.13014552774663893, + "learning_rate": 2.172554067115576e-06, + "loss": 0.7324, + "step": 4374 + }, + { + "epoch": 2.1702867072111207, + "grad_norm": 0.1281661360302656, + "learning_rate": 2.1715851621782473e-06, + "loss": 0.6973, + "step": 4375 + }, + { + "epoch": 2.17078316991436, + "grad_norm": 0.13330711985579916, + "learning_rate": 2.170616307433861e-06, + "loss": 0.7038, + "step": 4376 + }, + { + "epoch": 2.1712796326175994, + "grad_norm": 0.13056351010723308, + "learning_rate": 2.1696475030304902e-06, + "loss": 0.6866, + "step": 4377 + }, + { + "epoch": 2.171776095320839, + "grad_norm": 0.13268943046062295, + "learning_rate": 2.1686787491162023e-06, + "loss": 0.6891, + "step": 4378 + }, + { + "epoch": 2.1722725580240785, + "grad_norm": 0.13924146096251458, + "learning_rate": 2.1677100458390547e-06, + "loss": 0.6916, + "step": 4379 + }, + { + "epoch": 2.172769020727318, + "grad_norm": 0.13972180628069247, + "learning_rate": 2.1667413933470998e-06, + "loss": 0.7463, + "step": 4380 + }, + { + "epoch": 2.173265483430557, + "grad_norm": 0.13661608957760332, + "learning_rate": 2.165772791788379e-06, + "loss": 0.7453, + "step": 4381 + }, + { + "epoch": 2.173761946133797, + "grad_norm": 0.1282782445633921, + "learning_rate": 2.1648042413109276e-06, + "loss": 0.7236, + "step": 4382 + }, + { + "epoch": 2.1742584088370362, + "grad_norm": 0.12716164432242413, + "learning_rate": 2.163835742062774e-06, + "loss": 0.687, + "step": 4383 + }, + { + "epoch": 2.1747548715402756, + "grad_norm": 0.13109851282826201, + "learning_rate": 2.162867294191938e-06, + "loss": 0.7189, + "step": 4384 + }, + { + "epoch": 2.175251334243515, + "grad_norm": 0.14106981155374845, + "learning_rate": 2.1618988978464296e-06, + "loss": 0.7263, + "step": 4385 + }, + { + "epoch": 2.1757477969467542, + "grad_norm": 0.1287290868986218, + "learning_rate": 2.1609305531742534e-06, + "loss": 0.6981, + "step": 4386 + }, + { + "epoch": 2.1762442596499936, + "grad_norm": 0.1294869956830014, + "learning_rate": 2.159962260323406e-06, + "loss": 0.7166, + "step": 4387 + }, + { + "epoch": 2.1767407223532333, + "grad_norm": 0.13558488604187074, + "learning_rate": 2.158994019441875e-06, + "loss": 0.6741, + "step": 4388 + }, + { + "epoch": 2.1772371850564727, + "grad_norm": 0.1337077290344641, + "learning_rate": 2.15802583067764e-06, + "loss": 0.712, + "step": 4389 + }, + { + "epoch": 2.177733647759712, + "grad_norm": 0.13200934226771094, + "learning_rate": 2.157057694178674e-06, + "loss": 0.6919, + "step": 4390 + }, + { + "epoch": 2.1782301104629513, + "grad_norm": 0.12792142983023513, + "learning_rate": 2.1560896100929413e-06, + "loss": 0.672, + "step": 4391 + }, + { + "epoch": 2.178726573166191, + "grad_norm": 0.14201553304959288, + "learning_rate": 2.155121578568397e-06, + "loss": 0.7635, + "step": 4392 + }, + { + "epoch": 2.1792230358694304, + "grad_norm": 0.12863093955613852, + "learning_rate": 2.1541535997529894e-06, + "loss": 0.6829, + "step": 4393 + }, + { + "epoch": 2.17971949857267, + "grad_norm": 0.13051647735684252, + "learning_rate": 2.153185673794659e-06, + "loss": 0.6927, + "step": 4394 + }, + { + "epoch": 2.180215961275909, + "grad_norm": 0.1378899333922816, + "learning_rate": 2.1522178008413376e-06, + "loss": 0.7242, + "step": 4395 + }, + { + "epoch": 2.1807124239791484, + "grad_norm": 0.12915706282038747, + "learning_rate": 2.15124998104095e-06, + "loss": 0.7223, + "step": 4396 + }, + { + "epoch": 2.1812088866823878, + "grad_norm": 0.13756383241184975, + "learning_rate": 2.15028221454141e-06, + "loss": 0.7269, + "step": 4397 + }, + { + "epoch": 2.1817053493856275, + "grad_norm": 0.13566981965904096, + "learning_rate": 2.1493145014906264e-06, + "loss": 0.7178, + "step": 4398 + }, + { + "epoch": 2.182201812088867, + "grad_norm": 0.13120681413105775, + "learning_rate": 2.1483468420364984e-06, + "loss": 0.6638, + "step": 4399 + }, + { + "epoch": 2.182698274792106, + "grad_norm": 0.13010531035199235, + "learning_rate": 2.1473792363269183e-06, + "loss": 0.7361, + "step": 4400 + }, + { + "epoch": 2.1831947374953455, + "grad_norm": 0.13102941992333803, + "learning_rate": 2.1464116845097672e-06, + "loss": 0.6625, + "step": 4401 + }, + { + "epoch": 2.1836912001985853, + "grad_norm": 0.13143047623874493, + "learning_rate": 2.1454441867329205e-06, + "loss": 0.7456, + "step": 4402 + }, + { + "epoch": 2.1841876629018246, + "grad_norm": 0.12849792172954375, + "learning_rate": 2.1444767431442455e-06, + "loss": 0.6448, + "step": 4403 + }, + { + "epoch": 2.184684125605064, + "grad_norm": 0.1339908605256051, + "learning_rate": 2.1435093538916e-06, + "loss": 0.7366, + "step": 4404 + }, + { + "epoch": 2.1851805883083033, + "grad_norm": 0.1355852126115786, + "learning_rate": 2.1425420191228328e-06, + "loss": 0.7442, + "step": 4405 + }, + { + "epoch": 2.1856770510115426, + "grad_norm": 0.13537532518922524, + "learning_rate": 2.1415747389857875e-06, + "loss": 0.7584, + "step": 4406 + }, + { + "epoch": 2.186173513714782, + "grad_norm": 0.1376705768549547, + "learning_rate": 2.140607513628296e-06, + "loss": 0.6996, + "step": 4407 + }, + { + "epoch": 2.1866699764180217, + "grad_norm": 0.13019400686207325, + "learning_rate": 2.1396403431981843e-06, + "loss": 0.6652, + "step": 4408 + }, + { + "epoch": 2.187166439121261, + "grad_norm": 0.1322767474800185, + "learning_rate": 2.1386732278432674e-06, + "loss": 0.6934, + "step": 4409 + }, + { + "epoch": 2.1876629018245004, + "grad_norm": 0.13168416201103167, + "learning_rate": 2.1377061677113547e-06, + "loss": 0.6779, + "step": 4410 + }, + { + "epoch": 2.1881593645277397, + "grad_norm": 0.12841437119394158, + "learning_rate": 2.136739162950245e-06, + "loss": 0.6991, + "step": 4411 + }, + { + "epoch": 2.188655827230979, + "grad_norm": 0.13341291386169005, + "learning_rate": 2.13577221370773e-06, + "loss": 0.6849, + "step": 4412 + }, + { + "epoch": 2.189152289934219, + "grad_norm": 0.13556040065635502, + "learning_rate": 2.1348053201315926e-06, + "loss": 0.7101, + "step": 4413 + }, + { + "epoch": 2.189648752637458, + "grad_norm": 0.1347896034578669, + "learning_rate": 2.1338384823696056e-06, + "loss": 0.7173, + "step": 4414 + }, + { + "epoch": 2.1901452153406975, + "grad_norm": 0.13265640243012952, + "learning_rate": 2.1328717005695363e-06, + "loss": 0.6666, + "step": 4415 + }, + { + "epoch": 2.190641678043937, + "grad_norm": 0.1283246868922253, + "learning_rate": 2.1319049748791418e-06, + "loss": 0.7144, + "step": 4416 + }, + { + "epoch": 2.191138140747176, + "grad_norm": 0.13569707723504326, + "learning_rate": 2.1309383054461692e-06, + "loss": 0.7276, + "step": 4417 + }, + { + "epoch": 2.191634603450416, + "grad_norm": 0.1350932272158915, + "learning_rate": 2.1299716924183586e-06, + "loss": 0.7084, + "step": 4418 + }, + { + "epoch": 2.1921310661536553, + "grad_norm": 0.1293789781280897, + "learning_rate": 2.1290051359434426e-06, + "loss": 0.7082, + "step": 4419 + }, + { + "epoch": 2.1926275288568946, + "grad_norm": 0.1334911228325809, + "learning_rate": 2.128038636169143e-06, + "loss": 0.694, + "step": 4420 + }, + { + "epoch": 2.193123991560134, + "grad_norm": 0.13202766897134072, + "learning_rate": 2.1270721932431736e-06, + "loss": 0.7453, + "step": 4421 + }, + { + "epoch": 2.1936204542633733, + "grad_norm": 0.132585729157581, + "learning_rate": 2.1261058073132403e-06, + "loss": 0.7075, + "step": 4422 + }, + { + "epoch": 2.194116916966613, + "grad_norm": 0.13273752133950062, + "learning_rate": 2.1251394785270388e-06, + "loss": 0.7006, + "step": 4423 + }, + { + "epoch": 2.1946133796698524, + "grad_norm": 0.13508600941080354, + "learning_rate": 2.1241732070322586e-06, + "loss": 0.7252, + "step": 4424 + }, + { + "epoch": 2.1951098423730917, + "grad_norm": 0.12963329955825967, + "learning_rate": 2.123206992976577e-06, + "loss": 0.7375, + "step": 4425 + }, + { + "epoch": 2.195606305076331, + "grad_norm": 0.12468851759067956, + "learning_rate": 2.122240836507665e-06, + "loss": 0.6814, + "step": 4426 + }, + { + "epoch": 2.1961027677795704, + "grad_norm": 0.13146278221760682, + "learning_rate": 2.1212747377731845e-06, + "loss": 0.721, + "step": 4427 + }, + { + "epoch": 2.19659923048281, + "grad_norm": 0.14207414385746342, + "learning_rate": 2.120308696920789e-06, + "loss": 0.749, + "step": 4428 + }, + { + "epoch": 2.1970956931860495, + "grad_norm": 0.1299724476761059, + "learning_rate": 2.11934271409812e-06, + "loss": 0.6992, + "step": 4429 + }, + { + "epoch": 2.197592155889289, + "grad_norm": 0.13750736020293242, + "learning_rate": 2.1183767894528135e-06, + "loss": 0.7027, + "step": 4430 + }, + { + "epoch": 2.198088618592528, + "grad_norm": 0.1327599147789671, + "learning_rate": 2.1174109231324965e-06, + "loss": 0.661, + "step": 4431 + }, + { + "epoch": 2.1985850812957675, + "grad_norm": 0.13320281277199114, + "learning_rate": 2.1164451152847865e-06, + "loss": 0.7928, + "step": 4432 + }, + { + "epoch": 2.1990815439990072, + "grad_norm": 0.12945660859813715, + "learning_rate": 2.1154793660572897e-06, + "loss": 0.7051, + "step": 4433 + }, + { + "epoch": 2.1995780067022466, + "grad_norm": 0.1404344217727835, + "learning_rate": 2.1145136755976063e-06, + "loss": 0.7047, + "step": 4434 + }, + { + "epoch": 2.200074469405486, + "grad_norm": 0.13253286798252542, + "learning_rate": 2.1135480440533275e-06, + "loss": 0.687, + "step": 4435 + }, + { + "epoch": 2.2005709321087252, + "grad_norm": 0.1315732384032246, + "learning_rate": 2.1125824715720335e-06, + "loss": 0.7066, + "step": 4436 + }, + { + "epoch": 2.2010673948119646, + "grad_norm": 0.12983170486233375, + "learning_rate": 2.1116169583012965e-06, + "loss": 0.7092, + "step": 4437 + }, + { + "epoch": 2.2015638575152043, + "grad_norm": 0.12962669211533243, + "learning_rate": 2.1106515043886804e-06, + "loss": 0.7187, + "step": 4438 + }, + { + "epoch": 2.2020603202184437, + "grad_norm": 0.13258204247552738, + "learning_rate": 2.1096861099817394e-06, + "loss": 0.7111, + "step": 4439 + }, + { + "epoch": 2.202556782921683, + "grad_norm": 0.13733375443222062, + "learning_rate": 2.1087207752280186e-06, + "loss": 0.7446, + "step": 4440 + }, + { + "epoch": 2.2030532456249223, + "grad_norm": 0.1335139785372661, + "learning_rate": 2.107755500275053e-06, + "loss": 0.733, + "step": 4441 + }, + { + "epoch": 2.2035497083281617, + "grad_norm": 0.12985134152635452, + "learning_rate": 2.10679028527037e-06, + "loss": 0.7306, + "step": 4442 + }, + { + "epoch": 2.2040461710314014, + "grad_norm": 0.12811473834247125, + "learning_rate": 2.1058251303614875e-06, + "loss": 0.7068, + "step": 4443 + }, + { + "epoch": 2.2045426337346408, + "grad_norm": 0.12645032456670896, + "learning_rate": 2.1048600356959133e-06, + "loss": 0.6633, + "step": 4444 + }, + { + "epoch": 2.20503909643788, + "grad_norm": 0.13171507130184576, + "learning_rate": 2.103895001421148e-06, + "loss": 0.6973, + "step": 4445 + }, + { + "epoch": 2.2055355591411194, + "grad_norm": 0.1302019612259144, + "learning_rate": 2.10293002768468e-06, + "loss": 0.68, + "step": 4446 + }, + { + "epoch": 2.2060320218443588, + "grad_norm": 0.12865346316964513, + "learning_rate": 2.101965114633991e-06, + "loss": 0.7241, + "step": 4447 + }, + { + "epoch": 2.2065284845475985, + "grad_norm": 0.12938944539941274, + "learning_rate": 2.1010002624165528e-06, + "loss": 0.7068, + "step": 4448 + }, + { + "epoch": 2.207024947250838, + "grad_norm": 0.1315311127735872, + "learning_rate": 2.1000354711798258e-06, + "loss": 0.7197, + "step": 4449 + }, + { + "epoch": 2.207521409954077, + "grad_norm": 0.1304635101926576, + "learning_rate": 2.0990707410712647e-06, + "loss": 0.7177, + "step": 4450 + }, + { + "epoch": 2.2080178726573165, + "grad_norm": 0.12566986860796162, + "learning_rate": 2.098106072238313e-06, + "loss": 0.7147, + "step": 4451 + }, + { + "epoch": 2.208514335360556, + "grad_norm": 0.1291854407145579, + "learning_rate": 2.097141464828403e-06, + "loss": 0.6987, + "step": 4452 + }, + { + "epoch": 2.2090107980637956, + "grad_norm": 0.12932256073625428, + "learning_rate": 2.0961769189889612e-06, + "loss": 0.6682, + "step": 4453 + }, + { + "epoch": 2.209507260767035, + "grad_norm": 0.13534326258562565, + "learning_rate": 2.0952124348674027e-06, + "loss": 0.7585, + "step": 4454 + }, + { + "epoch": 2.2100037234702743, + "grad_norm": 0.13139570296537265, + "learning_rate": 2.094248012611133e-06, + "loss": 0.7238, + "step": 4455 + }, + { + "epoch": 2.2105001861735136, + "grad_norm": 0.12736205425045874, + "learning_rate": 2.0932836523675495e-06, + "loss": 0.7331, + "step": 4456 + }, + { + "epoch": 2.210996648876753, + "grad_norm": 0.1285738140080393, + "learning_rate": 2.0923193542840376e-06, + "loss": 0.6831, + "step": 4457 + }, + { + "epoch": 2.2114931115799927, + "grad_norm": 0.1373192184817093, + "learning_rate": 2.0913551185079763e-06, + "loss": 0.7511, + "step": 4458 + }, + { + "epoch": 2.211989574283232, + "grad_norm": 0.1315083921749297, + "learning_rate": 2.0903909451867327e-06, + "loss": 0.7186, + "step": 4459 + }, + { + "epoch": 2.2124860369864714, + "grad_norm": 0.13197214005113778, + "learning_rate": 2.089426834467666e-06, + "loss": 0.7278, + "step": 4460 + }, + { + "epoch": 2.2129824996897107, + "grad_norm": 0.13117095945392646, + "learning_rate": 2.0884627864981247e-06, + "loss": 0.7158, + "step": 4461 + }, + { + "epoch": 2.21347896239295, + "grad_norm": 0.14366920819680415, + "learning_rate": 2.0874988014254474e-06, + "loss": 0.697, + "step": 4462 + }, + { + "epoch": 2.21397542509619, + "grad_norm": 0.13103101512448223, + "learning_rate": 2.0865348793969644e-06, + "loss": 0.7099, + "step": 4463 + }, + { + "epoch": 2.214471887799429, + "grad_norm": 0.13241402045221123, + "learning_rate": 2.085571020559997e-06, + "loss": 0.6921, + "step": 4464 + }, + { + "epoch": 2.2149683505026685, + "grad_norm": 0.12588765448059275, + "learning_rate": 2.084607225061853e-06, + "loss": 0.6612, + "step": 4465 + }, + { + "epoch": 2.215464813205908, + "grad_norm": 0.1310167292666887, + "learning_rate": 2.0836434930498343e-06, + "loss": 0.6417, + "step": 4466 + }, + { + "epoch": 2.215961275909147, + "grad_norm": 0.13194158154041208, + "learning_rate": 2.082679824671232e-06, + "loss": 0.7063, + "step": 4467 + }, + { + "epoch": 2.216457738612387, + "grad_norm": 0.13066331982217858, + "learning_rate": 2.0817162200733275e-06, + "loss": 0.6948, + "step": 4468 + }, + { + "epoch": 2.2169542013156263, + "grad_norm": 0.1318788924378777, + "learning_rate": 2.080752679403392e-06, + "loss": 0.7012, + "step": 4469 + }, + { + "epoch": 2.2174506640188656, + "grad_norm": 0.13209070722048075, + "learning_rate": 2.0797892028086873e-06, + "loss": 0.7201, + "step": 4470 + }, + { + "epoch": 2.217947126722105, + "grad_norm": 0.1274600384445204, + "learning_rate": 2.078825790436465e-06, + "loss": 0.7131, + "step": 4471 + }, + { + "epoch": 2.2184435894253443, + "grad_norm": 0.13371229036895962, + "learning_rate": 2.0778624424339684e-06, + "loss": 0.7382, + "step": 4472 + }, + { + "epoch": 2.218940052128584, + "grad_norm": 0.13862248032289534, + "learning_rate": 2.0768991589484284e-06, + "loss": 0.7566, + "step": 4473 + }, + { + "epoch": 2.2194365148318234, + "grad_norm": 0.13523476510615928, + "learning_rate": 2.0759359401270683e-06, + "loss": 0.7046, + "step": 4474 + }, + { + "epoch": 2.2199329775350627, + "grad_norm": 0.13349052506342798, + "learning_rate": 2.0749727861171e-06, + "loss": 0.7179, + "step": 4475 + }, + { + "epoch": 2.220429440238302, + "grad_norm": 0.12706375105277193, + "learning_rate": 2.074009697065727e-06, + "loss": 0.7001, + "step": 4476 + }, + { + "epoch": 2.2209259029415414, + "grad_norm": 0.13175995019806763, + "learning_rate": 2.073046673120142e-06, + "loss": 0.7124, + "step": 4477 + }, + { + "epoch": 2.221422365644781, + "grad_norm": 0.12839478228159643, + "learning_rate": 2.0720837144275264e-06, + "loss": 0.6965, + "step": 4478 + }, + { + "epoch": 2.2219188283480205, + "grad_norm": 0.13013639445767175, + "learning_rate": 2.0711208211350543e-06, + "loss": 0.6632, + "step": 4479 + }, + { + "epoch": 2.22241529105126, + "grad_norm": 0.13013755734088792, + "learning_rate": 2.070157993389889e-06, + "loss": 0.6993, + "step": 4480 + }, + { + "epoch": 2.222911753754499, + "grad_norm": 0.12451994734244691, + "learning_rate": 2.069195231339182e-06, + "loss": 0.6995, + "step": 4481 + }, + { + "epoch": 2.2234082164577385, + "grad_norm": 0.13477779261267323, + "learning_rate": 2.0682325351300754e-06, + "loss": 0.6804, + "step": 4482 + }, + { + "epoch": 2.2239046791609782, + "grad_norm": 0.12668543781608235, + "learning_rate": 2.0672699049097034e-06, + "loss": 0.7061, + "step": 4483 + }, + { + "epoch": 2.2244011418642176, + "grad_norm": 0.13314313496622696, + "learning_rate": 2.0663073408251888e-06, + "loss": 0.7014, + "step": 4484 + }, + { + "epoch": 2.224897604567457, + "grad_norm": 0.12970701923177527, + "learning_rate": 2.065344843023643e-06, + "loss": 0.68, + "step": 4485 + }, + { + "epoch": 2.225394067270696, + "grad_norm": 0.12387800818585266, + "learning_rate": 2.0643824116521683e-06, + "loss": 0.6338, + "step": 4486 + }, + { + "epoch": 2.2258905299739355, + "grad_norm": 0.13553410886475062, + "learning_rate": 2.0634200468578577e-06, + "loss": 0.6881, + "step": 4487 + }, + { + "epoch": 2.2263869926771753, + "grad_norm": 0.1316309439657593, + "learning_rate": 2.0624577487877934e-06, + "loss": 0.7049, + "step": 4488 + }, + { + "epoch": 2.2268834553804147, + "grad_norm": 0.12922645687790163, + "learning_rate": 2.0614955175890464e-06, + "loss": 0.7002, + "step": 4489 + }, + { + "epoch": 2.227379918083654, + "grad_norm": 0.13080644734923877, + "learning_rate": 2.0605333534086783e-06, + "loss": 0.7158, + "step": 4490 + }, + { + "epoch": 2.2278763807868933, + "grad_norm": 0.13245492757085336, + "learning_rate": 2.0595712563937412e-06, + "loss": 0.749, + "step": 4491 + }, + { + "epoch": 2.2283728434901326, + "grad_norm": 0.13270571932773106, + "learning_rate": 2.0586092266912753e-06, + "loss": 0.7214, + "step": 4492 + }, + { + "epoch": 2.2288693061933724, + "grad_norm": 0.13643011762485555, + "learning_rate": 2.0576472644483133e-06, + "loss": 0.7247, + "step": 4493 + }, + { + "epoch": 2.2293657688966118, + "grad_norm": 0.13005640680659797, + "learning_rate": 2.056685369811873e-06, + "loss": 0.7014, + "step": 4494 + }, + { + "epoch": 2.229862231599851, + "grad_norm": 0.13129470666359272, + "learning_rate": 2.055723542928966e-06, + "loss": 0.7518, + "step": 4495 + }, + { + "epoch": 2.2303586943030904, + "grad_norm": 0.13311828367670825, + "learning_rate": 2.0547617839465925e-06, + "loss": 0.7229, + "step": 4496 + }, + { + "epoch": 2.2308551570063297, + "grad_norm": 0.12371038963722301, + "learning_rate": 2.0538000930117424e-06, + "loss": 0.6783, + "step": 4497 + }, + { + "epoch": 2.2313516197095695, + "grad_norm": 0.1292453879856836, + "learning_rate": 2.0528384702713924e-06, + "loss": 0.6828, + "step": 4498 + }, + { + "epoch": 2.231848082412809, + "grad_norm": 0.13392931788982132, + "learning_rate": 2.0518769158725126e-06, + "loss": 0.7158, + "step": 4499 + }, + { + "epoch": 2.232344545116048, + "grad_norm": 0.13236176803456814, + "learning_rate": 2.0509154299620622e-06, + "loss": 0.7292, + "step": 4500 + }, + { + "epoch": 2.2328410078192875, + "grad_norm": 0.12728431676565705, + "learning_rate": 2.0499540126869864e-06, + "loss": 0.709, + "step": 4501 + }, + { + "epoch": 2.233337470522527, + "grad_norm": 0.13394951954145282, + "learning_rate": 2.0489926641942245e-06, + "loss": 0.7129, + "step": 4502 + }, + { + "epoch": 2.2338339332257666, + "grad_norm": 0.12837119042814552, + "learning_rate": 2.0480313846307025e-06, + "loss": 0.6862, + "step": 4503 + }, + { + "epoch": 2.234330395929006, + "grad_norm": 0.13885619880171285, + "learning_rate": 2.047070174143337e-06, + "loss": 0.7047, + "step": 4504 + }, + { + "epoch": 2.2348268586322453, + "grad_norm": 0.13053786492237074, + "learning_rate": 2.0461090328790325e-06, + "loss": 0.741, + "step": 4505 + }, + { + "epoch": 2.2353233213354846, + "grad_norm": 0.12630369808853714, + "learning_rate": 2.0451479609846847e-06, + "loss": 0.6303, + "step": 4506 + }, + { + "epoch": 2.235819784038724, + "grad_norm": 0.1334775735012497, + "learning_rate": 2.0441869586071784e-06, + "loss": 0.7161, + "step": 4507 + }, + { + "epoch": 2.2363162467419633, + "grad_norm": 0.1243537069117441, + "learning_rate": 2.043226025893387e-06, + "loss": 0.7083, + "step": 4508 + }, + { + "epoch": 2.236812709445203, + "grad_norm": 0.1362055506738069, + "learning_rate": 2.0422651629901743e-06, + "loss": 0.7073, + "step": 4509 + }, + { + "epoch": 2.2373091721484424, + "grad_norm": 0.13381191349240082, + "learning_rate": 2.041304370044391e-06, + "loss": 0.715, + "step": 4510 + }, + { + "epoch": 2.2378056348516817, + "grad_norm": 0.13008180169114836, + "learning_rate": 2.0403436472028807e-06, + "loss": 0.6912, + "step": 4511 + }, + { + "epoch": 2.238302097554921, + "grad_norm": 0.13665438512065606, + "learning_rate": 2.0393829946124737e-06, + "loss": 0.7122, + "step": 4512 + }, + { + "epoch": 2.238798560258161, + "grad_norm": 0.1306034467966443, + "learning_rate": 2.0384224124199918e-06, + "loss": 0.6518, + "step": 4513 + }, + { + "epoch": 2.2392950229614, + "grad_norm": 0.14766441708462488, + "learning_rate": 2.0374619007722423e-06, + "loss": 0.75, + "step": 4514 + }, + { + "epoch": 2.2397914856646395, + "grad_norm": 0.14199023614242443, + "learning_rate": 2.036501459816025e-06, + "loss": 0.7057, + "step": 4515 + }, + { + "epoch": 2.240287948367879, + "grad_norm": 0.13183647650911845, + "learning_rate": 2.0355410896981285e-06, + "loss": 0.7167, + "step": 4516 + }, + { + "epoch": 2.240784411071118, + "grad_norm": 0.12935480831259677, + "learning_rate": 2.034580790565329e-06, + "loss": 0.6945, + "step": 4517 + }, + { + "epoch": 2.2412808737743575, + "grad_norm": 0.123741363188653, + "learning_rate": 2.033620562564393e-06, + "loss": 0.6593, + "step": 4518 + }, + { + "epoch": 2.2417773364775972, + "grad_norm": 0.1324876830108556, + "learning_rate": 2.032660405842076e-06, + "loss": 0.7795, + "step": 4519 + }, + { + "epoch": 2.2422737991808366, + "grad_norm": 0.13082736167618114, + "learning_rate": 2.031700320545123e-06, + "loss": 0.6666, + "step": 4520 + }, + { + "epoch": 2.242770261884076, + "grad_norm": 0.13454831662707822, + "learning_rate": 2.0307403068202677e-06, + "loss": 0.6773, + "step": 4521 + }, + { + "epoch": 2.2432667245873152, + "grad_norm": 0.13082635983605484, + "learning_rate": 2.0297803648142324e-06, + "loss": 0.6761, + "step": 4522 + }, + { + "epoch": 2.243763187290555, + "grad_norm": 0.1300274112305257, + "learning_rate": 2.0288204946737283e-06, + "loss": 0.6994, + "step": 4523 + }, + { + "epoch": 2.2442596499937943, + "grad_norm": 0.13137498359275115, + "learning_rate": 2.0278606965454573e-06, + "loss": 0.7014, + "step": 4524 + }, + { + "epoch": 2.2447561126970337, + "grad_norm": 0.1380783519062254, + "learning_rate": 2.026900970576109e-06, + "loss": 0.7264, + "step": 4525 + }, + { + "epoch": 2.245252575400273, + "grad_norm": 0.1334871254288439, + "learning_rate": 2.0259413169123615e-06, + "loss": 0.7035, + "step": 4526 + }, + { + "epoch": 2.2457490381035123, + "grad_norm": 0.13467702526258202, + "learning_rate": 2.0249817357008825e-06, + "loss": 0.7435, + "step": 4527 + }, + { + "epoch": 2.2462455008067517, + "grad_norm": 0.13343539696190834, + "learning_rate": 2.024022227088329e-06, + "loss": 0.766, + "step": 4528 + }, + { + "epoch": 2.2467419635099914, + "grad_norm": 0.1336433964166616, + "learning_rate": 2.0230627912213475e-06, + "loss": 0.7278, + "step": 4529 + }, + { + "epoch": 2.2472384262132308, + "grad_norm": 0.13069507120960716, + "learning_rate": 2.02210342824657e-06, + "loss": 0.6983, + "step": 4530 + }, + { + "epoch": 2.24773488891647, + "grad_norm": 0.13107224337831874, + "learning_rate": 2.0211441383106208e-06, + "loss": 0.677, + "step": 4531 + }, + { + "epoch": 2.2482313516197094, + "grad_norm": 0.13776105234734956, + "learning_rate": 2.020184921560113e-06, + "loss": 0.7045, + "step": 4532 + }, + { + "epoch": 2.248727814322949, + "grad_norm": 0.12525227813051776, + "learning_rate": 2.019225778141646e-06, + "loss": 0.6817, + "step": 4533 + }, + { + "epoch": 2.2492242770261885, + "grad_norm": 0.13625075248045249, + "learning_rate": 2.0182667082018104e-06, + "loss": 0.7373, + "step": 4534 + }, + { + "epoch": 2.249720739729428, + "grad_norm": 0.13175367979094055, + "learning_rate": 2.0173077118871847e-06, + "loss": 0.7571, + "step": 4535 + }, + { + "epoch": 2.250217202432667, + "grad_norm": 0.1313325217650813, + "learning_rate": 2.016348789344335e-06, + "loss": 0.6996, + "step": 4536 + }, + { + "epoch": 2.250217202432667, + "eval_loss": 0.7270435690879822, + "eval_runtime": 141.8878, + "eval_samples_per_second": 213.923, + "eval_steps_per_second": 26.746, + "step": 4536 + }, + { + "epoch": 2.2507136651359065, + "grad_norm": 0.130395751964868, + "learning_rate": 2.0153899407198193e-06, + "loss": 0.7365, + "step": 4537 + }, + { + "epoch": 2.251210127839146, + "grad_norm": 0.133483489650425, + "learning_rate": 2.01443116616018e-06, + "loss": 0.7063, + "step": 4538 + }, + { + "epoch": 2.2517065905423856, + "grad_norm": 0.12958543122285368, + "learning_rate": 2.0134724658119525e-06, + "loss": 0.7304, + "step": 4539 + }, + { + "epoch": 2.252203053245625, + "grad_norm": 0.13021997063309637, + "learning_rate": 2.012513839821657e-06, + "loss": 0.6835, + "step": 4540 + }, + { + "epoch": 2.2526995159488643, + "grad_norm": 0.1327109634903606, + "learning_rate": 2.011555288335805e-06, + "loss": 0.7004, + "step": 4541 + }, + { + "epoch": 2.2531959786521036, + "grad_norm": 0.13418960633883067, + "learning_rate": 2.0105968115008957e-06, + "loss": 0.6589, + "step": 4542 + }, + { + "epoch": 2.2536924413553434, + "grad_norm": 0.134302800194372, + "learning_rate": 2.0096384094634165e-06, + "loss": 0.7305, + "step": 4543 + }, + { + "epoch": 2.2541889040585827, + "grad_norm": 0.1313733378397713, + "learning_rate": 2.0086800823698437e-06, + "loss": 0.7205, + "step": 4544 + }, + { + "epoch": 2.254685366761822, + "grad_norm": 0.13297045310339728, + "learning_rate": 2.007721830366644e-06, + "loss": 0.7046, + "step": 4545 + }, + { + "epoch": 2.2551818294650614, + "grad_norm": 0.1336572863632231, + "learning_rate": 2.0067636536002687e-06, + "loss": 0.6846, + "step": 4546 + }, + { + "epoch": 2.2556782921683007, + "grad_norm": 0.13982170058768661, + "learning_rate": 2.00580555221716e-06, + "loss": 0.7349, + "step": 4547 + }, + { + "epoch": 2.25617475487154, + "grad_norm": 0.13758235182829331, + "learning_rate": 2.0048475263637495e-06, + "loss": 0.7076, + "step": 4548 + }, + { + "epoch": 2.25667121757478, + "grad_norm": 0.13023416929452136, + "learning_rate": 2.003889576186455e-06, + "loss": 0.7703, + "step": 4549 + }, + { + "epoch": 2.257167680278019, + "grad_norm": 0.12888207678478222, + "learning_rate": 2.002931701831684e-06, + "loss": 0.7119, + "step": 4550 + }, + { + "epoch": 2.2576641429812585, + "grad_norm": 0.12816010732415842, + "learning_rate": 2.0019739034458328e-06, + "loss": 0.725, + "step": 4551 + }, + { + "epoch": 2.258160605684498, + "grad_norm": 0.14248392567472656, + "learning_rate": 2.0010161811752856e-06, + "loss": 0.7778, + "step": 4552 + }, + { + "epoch": 2.2586570683877376, + "grad_norm": 0.13798620466349681, + "learning_rate": 2.000058535166414e-06, + "loss": 0.6932, + "step": 4553 + }, + { + "epoch": 2.259153531090977, + "grad_norm": 0.1329997986135555, + "learning_rate": 1.9991009655655796e-06, + "loss": 0.6758, + "step": 4554 + }, + { + "epoch": 2.2596499937942163, + "grad_norm": 0.13463323485659834, + "learning_rate": 1.9981434725191314e-06, + "loss": 0.7121, + "step": 4555 + }, + { + "epoch": 2.2601464564974556, + "grad_norm": 0.12854539155868666, + "learning_rate": 1.9971860561734062e-06, + "loss": 0.6769, + "step": 4556 + }, + { + "epoch": 2.260642919200695, + "grad_norm": 0.1282085935483695, + "learning_rate": 1.9962287166747304e-06, + "loss": 0.6794, + "step": 4557 + }, + { + "epoch": 2.2611393819039343, + "grad_norm": 0.1294147737679884, + "learning_rate": 1.9952714541694186e-06, + "loss": 0.7308, + "step": 4558 + }, + { + "epoch": 2.261635844607174, + "grad_norm": 0.13473387061029163, + "learning_rate": 1.994314268803772e-06, + "loss": 0.7255, + "step": 4559 + }, + { + "epoch": 2.2621323073104134, + "grad_norm": 0.1325041645843652, + "learning_rate": 1.993357160724081e-06, + "loss": 0.697, + "step": 4560 + }, + { + "epoch": 2.2626287700136527, + "grad_norm": 0.1342544420441968, + "learning_rate": 1.9924001300766256e-06, + "loss": 0.7283, + "step": 4561 + }, + { + "epoch": 2.263125232716892, + "grad_norm": 0.13379243253594394, + "learning_rate": 1.9914431770076707e-06, + "loss": 0.7161, + "step": 4562 + }, + { + "epoch": 2.263621695420132, + "grad_norm": 0.1319578992058116, + "learning_rate": 1.9904863016634724e-06, + "loss": 0.7133, + "step": 4563 + }, + { + "epoch": 2.264118158123371, + "grad_norm": 0.13825130015364384, + "learning_rate": 1.9895295041902733e-06, + "loss": 0.7064, + "step": 4564 + }, + { + "epoch": 2.2646146208266105, + "grad_norm": 0.1261519713290881, + "learning_rate": 1.988572784734305e-06, + "loss": 0.7139, + "step": 4565 + }, + { + "epoch": 2.26511108352985, + "grad_norm": 0.13132799813077256, + "learning_rate": 1.9876161434417857e-06, + "loss": 0.6944, + "step": 4566 + }, + { + "epoch": 2.265607546233089, + "grad_norm": 0.13312127226666143, + "learning_rate": 1.986659580458924e-06, + "loss": 0.7534, + "step": 4567 + }, + { + "epoch": 2.2661040089363285, + "grad_norm": 0.13300727555740405, + "learning_rate": 1.9857030959319143e-06, + "loss": 0.6843, + "step": 4568 + }, + { + "epoch": 2.2666004716395682, + "grad_norm": 0.12803205729134937, + "learning_rate": 1.984746690006941e-06, + "loss": 0.6661, + "step": 4569 + }, + { + "epoch": 2.2670969343428076, + "grad_norm": 0.13134108543944287, + "learning_rate": 1.983790362830174e-06, + "loss": 0.715, + "step": 4570 + }, + { + "epoch": 2.267593397046047, + "grad_norm": 0.1310214195071098, + "learning_rate": 1.982834114547773e-06, + "loss": 0.6808, + "step": 4571 + }, + { + "epoch": 2.2680898597492862, + "grad_norm": 0.12868216227851317, + "learning_rate": 1.981877945305886e-06, + "loss": 0.6859, + "step": 4572 + }, + { + "epoch": 2.2685863224525256, + "grad_norm": 0.1308307615447908, + "learning_rate": 1.980921855250647e-06, + "loss": 0.706, + "step": 4573 + }, + { + "epoch": 2.2690827851557653, + "grad_norm": 0.13070485382531827, + "learning_rate": 1.979965844528181e-06, + "loss": 0.7233, + "step": 4574 + }, + { + "epoch": 2.2695792478590047, + "grad_norm": 0.13488405502043913, + "learning_rate": 1.979009913284596e-06, + "loss": 0.6707, + "step": 4575 + }, + { + "epoch": 2.270075710562244, + "grad_norm": 0.1357162099459169, + "learning_rate": 1.978054061665993e-06, + "loss": 0.7506, + "step": 4576 + }, + { + "epoch": 2.2705721732654833, + "grad_norm": 0.1324174585525689, + "learning_rate": 1.977098289818459e-06, + "loss": 0.7365, + "step": 4577 + }, + { + "epoch": 2.2710686359687227, + "grad_norm": 0.13116743586662247, + "learning_rate": 1.976142597888066e-06, + "loss": 0.6689, + "step": 4578 + }, + { + "epoch": 2.2715650986719624, + "grad_norm": 0.13835608552718603, + "learning_rate": 1.9751869860208774e-06, + "loss": 0.706, + "step": 4579 + }, + { + "epoch": 2.2720615613752018, + "grad_norm": 0.13412470397613815, + "learning_rate": 1.974231454362944e-06, + "loss": 0.7028, + "step": 4580 + }, + { + "epoch": 2.272558024078441, + "grad_norm": 0.12973612445352137, + "learning_rate": 1.973276003060302e-06, + "loss": 0.7142, + "step": 4581 + }, + { + "epoch": 2.2730544867816804, + "grad_norm": 0.12782910807584713, + "learning_rate": 1.972320632258978e-06, + "loss": 0.6797, + "step": 4582 + }, + { + "epoch": 2.2735509494849198, + "grad_norm": 0.12805462086856045, + "learning_rate": 1.9713653421049844e-06, + "loss": 0.6942, + "step": 4583 + }, + { + "epoch": 2.2740474121881595, + "grad_norm": 0.13882829796209556, + "learning_rate": 1.970410132744322e-06, + "loss": 0.6987, + "step": 4584 + }, + { + "epoch": 2.274543874891399, + "grad_norm": 0.13480442463376907, + "learning_rate": 1.9694550043229802e-06, + "loss": 0.7136, + "step": 4585 + }, + { + "epoch": 2.275040337594638, + "grad_norm": 0.12813305978151937, + "learning_rate": 1.968499956986934e-06, + "loss": 0.6925, + "step": 4586 + }, + { + "epoch": 2.2755368002978775, + "grad_norm": 0.13565940409117114, + "learning_rate": 1.9675449908821473e-06, + "loss": 0.7073, + "step": 4587 + }, + { + "epoch": 2.276033263001117, + "grad_norm": 0.13297564484436258, + "learning_rate": 1.9665901061545715e-06, + "loss": 0.7146, + "step": 4588 + }, + { + "epoch": 2.2765297257043566, + "grad_norm": 0.1292833330312642, + "learning_rate": 1.965635302950145e-06, + "loss": 0.6681, + "step": 4589 + }, + { + "epoch": 2.277026188407596, + "grad_norm": 0.13161230553838985, + "learning_rate": 1.9646805814147956e-06, + "loss": 0.7131, + "step": 4590 + }, + { + "epoch": 2.2775226511108353, + "grad_norm": 0.13259185019815198, + "learning_rate": 1.9637259416944352e-06, + "loss": 0.7156, + "step": 4591 + }, + { + "epoch": 2.2780191138140746, + "grad_norm": 0.13109212966194678, + "learning_rate": 1.9627713839349665e-06, + "loss": 0.7343, + "step": 4592 + }, + { + "epoch": 2.278515576517314, + "grad_norm": 0.13887226544683762, + "learning_rate": 1.961816908282279e-06, + "loss": 0.7579, + "step": 4593 + }, + { + "epoch": 2.2790120392205537, + "grad_norm": 0.13379712126091625, + "learning_rate": 1.960862514882247e-06, + "loss": 0.741, + "step": 4594 + }, + { + "epoch": 2.279508501923793, + "grad_norm": 0.12731411318156655, + "learning_rate": 1.959908203880735e-06, + "loss": 0.6717, + "step": 4595 + }, + { + "epoch": 2.2800049646270324, + "grad_norm": 0.13191309058764128, + "learning_rate": 1.9589539754235938e-06, + "loss": 0.6887, + "step": 4596 + }, + { + "epoch": 2.2805014273302717, + "grad_norm": 0.13477829801291258, + "learning_rate": 1.957999829656664e-06, + "loss": 0.7024, + "step": 4597 + }, + { + "epoch": 2.280997890033511, + "grad_norm": 0.1408048819606926, + "learning_rate": 1.9570457667257686e-06, + "loss": 0.736, + "step": 4598 + }, + { + "epoch": 2.281494352736751, + "grad_norm": 0.13409166712691484, + "learning_rate": 1.9560917867767223e-06, + "loss": 0.7182, + "step": 4599 + }, + { + "epoch": 2.28199081543999, + "grad_norm": 0.1323033246480032, + "learning_rate": 1.9551378899553255e-06, + "loss": 0.6757, + "step": 4600 + }, + { + "epoch": 2.2824872781432295, + "grad_norm": 0.1279575053978906, + "learning_rate": 1.9541840764073666e-06, + "loss": 0.6639, + "step": 4601 + }, + { + "epoch": 2.282983740846469, + "grad_norm": 0.1284865860394091, + "learning_rate": 1.9532303462786196e-06, + "loss": 0.7076, + "step": 4602 + }, + { + "epoch": 2.283480203549708, + "grad_norm": 0.13236730912378264, + "learning_rate": 1.9522766997148473e-06, + "loss": 0.7177, + "step": 4603 + }, + { + "epoch": 2.283976666252948, + "grad_norm": 0.13334477268216768, + "learning_rate": 1.9513231368617996e-06, + "loss": 0.7152, + "step": 4604 + }, + { + "epoch": 2.2844731289561873, + "grad_norm": 0.129554016821733, + "learning_rate": 1.950369657865213e-06, + "loss": 0.7274, + "step": 4605 + }, + { + "epoch": 2.2849695916594266, + "grad_norm": 0.12809936242000544, + "learning_rate": 1.949416262870812e-06, + "loss": 0.6868, + "step": 4606 + }, + { + "epoch": 2.285466054362666, + "grad_norm": 0.12619361467512472, + "learning_rate": 1.948462952024307e-06, + "loss": 0.6536, + "step": 4607 + }, + { + "epoch": 2.2859625170659053, + "grad_norm": 0.13019492596083582, + "learning_rate": 1.9475097254713963e-06, + "loss": 0.6921, + "step": 4608 + }, + { + "epoch": 2.286458979769145, + "grad_norm": 0.12712933900402623, + "learning_rate": 1.9465565833577667e-06, + "loss": 0.7071, + "step": 4609 + }, + { + "epoch": 2.2869554424723844, + "grad_norm": 0.13073678098550098, + "learning_rate": 1.9456035258290886e-06, + "loss": 0.7274, + "step": 4610 + }, + { + "epoch": 2.2874519051756237, + "grad_norm": 0.13476232848198935, + "learning_rate": 1.944650553031022e-06, + "loss": 0.7651, + "step": 4611 + }, + { + "epoch": 2.287948367878863, + "grad_norm": 0.12940407399602424, + "learning_rate": 1.9436976651092143e-06, + "loss": 0.7052, + "step": 4612 + }, + { + "epoch": 2.2884448305821024, + "grad_norm": 0.12521819355840508, + "learning_rate": 1.9427448622092997e-06, + "loss": 0.6643, + "step": 4613 + }, + { + "epoch": 2.288941293285342, + "grad_norm": 0.1364514191356369, + "learning_rate": 1.941792144476897e-06, + "loss": 0.6871, + "step": 4614 + }, + { + "epoch": 2.2894377559885815, + "grad_norm": 0.12501502708363635, + "learning_rate": 1.940839512057615e-06, + "loss": 0.6713, + "step": 4615 + }, + { + "epoch": 2.289934218691821, + "grad_norm": 0.13288530127304785, + "learning_rate": 1.9398869650970483e-06, + "loss": 0.7355, + "step": 4616 + }, + { + "epoch": 2.29043068139506, + "grad_norm": 0.1356585375859948, + "learning_rate": 1.9389345037407787e-06, + "loss": 0.7225, + "step": 4617 + }, + { + "epoch": 2.2909271440982995, + "grad_norm": 0.13053848389346934, + "learning_rate": 1.9379821281343736e-06, + "loss": 0.7053, + "step": 4618 + }, + { + "epoch": 2.2914236068015392, + "grad_norm": 0.13397921868792434, + "learning_rate": 1.937029838423389e-06, + "loss": 0.7573, + "step": 4619 + }, + { + "epoch": 2.2919200695047786, + "grad_norm": 0.13057634121314804, + "learning_rate": 1.9360776347533667e-06, + "loss": 0.7348, + "step": 4620 + }, + { + "epoch": 2.292416532208018, + "grad_norm": 0.12556161832630441, + "learning_rate": 1.9351255172698368e-06, + "loss": 0.6724, + "step": 4621 + }, + { + "epoch": 2.292912994911257, + "grad_norm": 0.13237700928498475, + "learning_rate": 1.9341734861183146e-06, + "loss": 0.7015, + "step": 4622 + }, + { + "epoch": 2.2934094576144965, + "grad_norm": 0.12996320795796523, + "learning_rate": 1.9332215414443023e-06, + "loss": 0.6838, + "step": 4623 + }, + { + "epoch": 2.2939059203177363, + "grad_norm": 0.1264053811415136, + "learning_rate": 1.9322696833932896e-06, + "loss": 0.672, + "step": 4624 + }, + { + "epoch": 2.2944023830209757, + "grad_norm": 0.13696458123198366, + "learning_rate": 1.931317912110754e-06, + "loss": 0.7194, + "step": 4625 + }, + { + "epoch": 2.294898845724215, + "grad_norm": 0.12935669893699142, + "learning_rate": 1.930366227742157e-06, + "loss": 0.7196, + "step": 4626 + }, + { + "epoch": 2.2953953084274543, + "grad_norm": 0.13011349995194715, + "learning_rate": 1.9294146304329482e-06, + "loss": 0.7146, + "step": 4627 + }, + { + "epoch": 2.2958917711306936, + "grad_norm": 0.132257641760658, + "learning_rate": 1.9284631203285644e-06, + "loss": 0.7539, + "step": 4628 + }, + { + "epoch": 2.296388233833933, + "grad_norm": 0.12618209390437157, + "learning_rate": 1.927511697574429e-06, + "loss": 0.6542, + "step": 4629 + }, + { + "epoch": 2.2968846965371728, + "grad_norm": 0.13076964633199217, + "learning_rate": 1.9265603623159517e-06, + "loss": 0.736, + "step": 4630 + }, + { + "epoch": 2.297381159240412, + "grad_norm": 0.1278597050601475, + "learning_rate": 1.9256091146985282e-06, + "loss": 0.7204, + "step": 4631 + }, + { + "epoch": 2.2978776219436514, + "grad_norm": 0.12737458954653222, + "learning_rate": 1.924657954867542e-06, + "loss": 0.6605, + "step": 4632 + }, + { + "epoch": 2.2983740846468907, + "grad_norm": 0.12824874951751916, + "learning_rate": 1.923706882968362e-06, + "loss": 0.6611, + "step": 4633 + }, + { + "epoch": 2.2988705473501305, + "grad_norm": 0.13788032532644687, + "learning_rate": 1.922755899146346e-06, + "loss": 0.7261, + "step": 4634 + }, + { + "epoch": 2.29936701005337, + "grad_norm": 0.13925532124868073, + "learning_rate": 1.9218050035468343e-06, + "loss": 0.6746, + "step": 4635 + }, + { + "epoch": 2.299863472756609, + "grad_norm": 0.13354164221145085, + "learning_rate": 1.9208541963151576e-06, + "loss": 0.6896, + "step": 4636 + }, + { + "epoch": 2.3003599354598485, + "grad_norm": 0.13161140898185, + "learning_rate": 1.919903477596631e-06, + "loss": 0.6969, + "step": 4637 + }, + { + "epoch": 2.300856398163088, + "grad_norm": 0.13142493984817077, + "learning_rate": 1.918952847536557e-06, + "loss": 0.6899, + "step": 4638 + }, + { + "epoch": 2.301352860866327, + "grad_norm": 0.12827685841442193, + "learning_rate": 1.9180023062802237e-06, + "loss": 0.6423, + "step": 4639 + }, + { + "epoch": 2.301849323569567, + "grad_norm": 0.13137125453834245, + "learning_rate": 1.9170518539729063e-06, + "loss": 0.6769, + "step": 4640 + }, + { + "epoch": 2.3023457862728063, + "grad_norm": 0.1256889202189166, + "learning_rate": 1.9161014907598668e-06, + "loss": 0.6618, + "step": 4641 + }, + { + "epoch": 2.3028422489760456, + "grad_norm": 0.12999406754723933, + "learning_rate": 1.915151216786352e-06, + "loss": 0.7263, + "step": 4642 + }, + { + "epoch": 2.303338711679285, + "grad_norm": 0.12802875535724298, + "learning_rate": 1.9142010321975956e-06, + "loss": 0.6686, + "step": 4643 + }, + { + "epoch": 2.3038351743825247, + "grad_norm": 0.13208418783479883, + "learning_rate": 1.9132509371388187e-06, + "loss": 0.7076, + "step": 4644 + }, + { + "epoch": 2.304331637085764, + "grad_norm": 0.13301405076047154, + "learning_rate": 1.9123009317552294e-06, + "loss": 0.7435, + "step": 4645 + }, + { + "epoch": 2.3048280997890034, + "grad_norm": 0.13096465156829554, + "learning_rate": 1.911351016192019e-06, + "loss": 0.6746, + "step": 4646 + }, + { + "epoch": 2.3053245624922427, + "grad_norm": 0.1282318000984796, + "learning_rate": 1.9104011905943675e-06, + "loss": 0.6925, + "step": 4647 + }, + { + "epoch": 2.305821025195482, + "grad_norm": 0.1374326520419337, + "learning_rate": 1.9094514551074405e-06, + "loss": 0.721, + "step": 4648 + }, + { + "epoch": 2.3063174878987214, + "grad_norm": 0.13296090301658972, + "learning_rate": 1.9085018098763895e-06, + "loss": 0.7127, + "step": 4649 + }, + { + "epoch": 2.306813950601961, + "grad_norm": 0.12863266739621676, + "learning_rate": 1.9075522550463538e-06, + "loss": 0.6811, + "step": 4650 + }, + { + "epoch": 2.3073104133052005, + "grad_norm": 0.1292808442724112, + "learning_rate": 1.9066027907624563e-06, + "loss": 0.6921, + "step": 4651 + }, + { + "epoch": 2.30780687600844, + "grad_norm": 0.13198793172057285, + "learning_rate": 1.9056534171698076e-06, + "loss": 0.6872, + "step": 4652 + }, + { + "epoch": 2.308303338711679, + "grad_norm": 0.1308224331561388, + "learning_rate": 1.9047041344135045e-06, + "loss": 0.6865, + "step": 4653 + }, + { + "epoch": 2.308799801414919, + "grad_norm": 0.130383003378401, + "learning_rate": 1.9037549426386304e-06, + "loss": 0.7286, + "step": 4654 + }, + { + "epoch": 2.3092962641181582, + "grad_norm": 0.13067038581766985, + "learning_rate": 1.9028058419902524e-06, + "loss": 0.6786, + "step": 4655 + }, + { + "epoch": 2.3097927268213976, + "grad_norm": 0.1312758611395035, + "learning_rate": 1.9018568326134262e-06, + "loss": 0.6942, + "step": 4656 + }, + { + "epoch": 2.310289189524637, + "grad_norm": 0.13012405788871972, + "learning_rate": 1.900907914653194e-06, + "loss": 0.6997, + "step": 4657 + }, + { + "epoch": 2.3107856522278762, + "grad_norm": 0.13238886769700214, + "learning_rate": 1.8999590882545798e-06, + "loss": 0.7042, + "step": 4658 + }, + { + "epoch": 2.3112821149311156, + "grad_norm": 0.13003113363469201, + "learning_rate": 1.8990103535625983e-06, + "loss": 0.7026, + "step": 4659 + }, + { + "epoch": 2.3117785776343553, + "grad_norm": 0.12888802864649773, + "learning_rate": 1.8980617107222482e-06, + "loss": 0.6841, + "step": 4660 + }, + { + "epoch": 2.3122750403375947, + "grad_norm": 0.13431432633019263, + "learning_rate": 1.8971131598785148e-06, + "loss": 0.7537, + "step": 4661 + }, + { + "epoch": 2.312771503040834, + "grad_norm": 0.12849233342145183, + "learning_rate": 1.8961647011763676e-06, + "loss": 0.6768, + "step": 4662 + }, + { + "epoch": 2.3132679657440733, + "grad_norm": 0.13345147046633807, + "learning_rate": 1.8952163347607642e-06, + "loss": 0.7059, + "step": 4663 + }, + { + "epoch": 2.313764428447313, + "grad_norm": 0.13053349945942708, + "learning_rate": 1.894268060776647e-06, + "loss": 0.7291, + "step": 4664 + }, + { + "epoch": 2.3142608911505524, + "grad_norm": 0.1343964907723944, + "learning_rate": 1.8933198793689444e-06, + "loss": 0.7698, + "step": 4665 + }, + { + "epoch": 2.3147573538537918, + "grad_norm": 0.14226824478362488, + "learning_rate": 1.8923717906825718e-06, + "loss": 0.7851, + "step": 4666 + }, + { + "epoch": 2.315253816557031, + "grad_norm": 0.12925409867242452, + "learning_rate": 1.8914237948624275e-06, + "loss": 0.655, + "step": 4667 + }, + { + "epoch": 2.3157502792602704, + "grad_norm": 0.12816123194872267, + "learning_rate": 1.8904758920533988e-06, + "loss": 0.6711, + "step": 4668 + }, + { + "epoch": 2.3162467419635098, + "grad_norm": 0.1294204482164869, + "learning_rate": 1.8895280824003568e-06, + "loss": 0.6621, + "step": 4669 + }, + { + "epoch": 2.3167432046667495, + "grad_norm": 0.12776210547198066, + "learning_rate": 1.888580366048159e-06, + "loss": 0.7061, + "step": 4670 + }, + { + "epoch": 2.317239667369989, + "grad_norm": 0.12704629242381982, + "learning_rate": 1.8876327431416498e-06, + "loss": 0.7007, + "step": 4671 + }, + { + "epoch": 2.317736130073228, + "grad_norm": 0.1316157139453224, + "learning_rate": 1.8866852138256565e-06, + "loss": 0.7049, + "step": 4672 + }, + { + "epoch": 2.3182325927764675, + "grad_norm": 0.13375751981024753, + "learning_rate": 1.8857377782449956e-06, + "loss": 0.7209, + "step": 4673 + }, + { + "epoch": 2.3187290554797073, + "grad_norm": 0.13344576229065264, + "learning_rate": 1.8847904365444653e-06, + "loss": 0.7304, + "step": 4674 + }, + { + "epoch": 2.3192255181829466, + "grad_norm": 0.13003917367562956, + "learning_rate": 1.8838431888688528e-06, + "loss": 0.6972, + "step": 4675 + }, + { + "epoch": 2.319721980886186, + "grad_norm": 0.14062745708919816, + "learning_rate": 1.8828960353629294e-06, + "loss": 0.738, + "step": 4676 + }, + { + "epoch": 2.3202184435894253, + "grad_norm": 0.12901976125458806, + "learning_rate": 1.881948976171453e-06, + "loss": 0.6849, + "step": 4677 + }, + { + "epoch": 2.3207149062926646, + "grad_norm": 0.1277269250608724, + "learning_rate": 1.8810020114391653e-06, + "loss": 0.6756, + "step": 4678 + }, + { + "epoch": 2.321211368995904, + "grad_norm": 0.1291372202878994, + "learning_rate": 1.8800551413107955e-06, + "loss": 0.6776, + "step": 4679 + }, + { + "epoch": 2.3217078316991437, + "grad_norm": 0.12757692002915727, + "learning_rate": 1.8791083659310568e-06, + "loss": 0.6778, + "step": 4680 + }, + { + "epoch": 2.322204294402383, + "grad_norm": 0.1396567920307466, + "learning_rate": 1.8781616854446496e-06, + "loss": 0.7325, + "step": 4681 + }, + { + "epoch": 2.3227007571056224, + "grad_norm": 0.13455039781184047, + "learning_rate": 1.8772150999962588e-06, + "loss": 0.7038, + "step": 4682 + }, + { + "epoch": 2.3231972198088617, + "grad_norm": 0.13252407557746912, + "learning_rate": 1.8762686097305537e-06, + "loss": 0.6989, + "step": 4683 + }, + { + "epoch": 2.3236936825121015, + "grad_norm": 0.12477814641502248, + "learning_rate": 1.875322214792191e-06, + "loss": 0.6746, + "step": 4684 + }, + { + "epoch": 2.324190145215341, + "grad_norm": 0.1295558006823391, + "learning_rate": 1.8743759153258118e-06, + "loss": 0.7017, + "step": 4685 + }, + { + "epoch": 2.32468660791858, + "grad_norm": 0.13196900667132647, + "learning_rate": 1.8734297114760427e-06, + "loss": 0.7403, + "step": 4686 + }, + { + "epoch": 2.3251830706218195, + "grad_norm": 0.13187534712299806, + "learning_rate": 1.8724836033874966e-06, + "loss": 0.7402, + "step": 4687 + }, + { + "epoch": 2.325679533325059, + "grad_norm": 0.12450825043027855, + "learning_rate": 1.8715375912047695e-06, + "loss": 0.6719, + "step": 4688 + }, + { + "epoch": 2.326175996028298, + "grad_norm": 0.1264511706987313, + "learning_rate": 1.8705916750724462e-06, + "loss": 0.7006, + "step": 4689 + }, + { + "epoch": 2.326672458731538, + "grad_norm": 0.13114832252468836, + "learning_rate": 1.8696458551350927e-06, + "loss": 0.7447, + "step": 4690 + }, + { + "epoch": 2.3271689214347773, + "grad_norm": 0.1354210750717035, + "learning_rate": 1.868700131537263e-06, + "loss": 0.7688, + "step": 4691 + }, + { + "epoch": 2.3276653841380166, + "grad_norm": 0.1301186261261761, + "learning_rate": 1.8677545044234962e-06, + "loss": 0.7138, + "step": 4692 + }, + { + "epoch": 2.328161846841256, + "grad_norm": 0.13060257839716047, + "learning_rate": 1.8668089739383165e-06, + "loss": 0.7208, + "step": 4693 + }, + { + "epoch": 2.3286583095444957, + "grad_norm": 0.12633300107009157, + "learning_rate": 1.865863540226232e-06, + "loss": 0.6773, + "step": 4694 + }, + { + "epoch": 2.329154772247735, + "grad_norm": 0.12498631011162595, + "learning_rate": 1.8649182034317382e-06, + "loss": 0.693, + "step": 4695 + }, + { + "epoch": 2.3296512349509744, + "grad_norm": 0.13096791761906182, + "learning_rate": 1.863972963699314e-06, + "loss": 0.7156, + "step": 4696 + }, + { + "epoch": 2.3301476976542137, + "grad_norm": 0.1302613925721332, + "learning_rate": 1.8630278211734243e-06, + "loss": 0.7099, + "step": 4697 + }, + { + "epoch": 2.330644160357453, + "grad_norm": 0.13213767324984668, + "learning_rate": 1.86208277599852e-06, + "loss": 0.7447, + "step": 4698 + }, + { + "epoch": 2.3311406230606924, + "grad_norm": 0.1307800597646881, + "learning_rate": 1.861137828319034e-06, + "loss": 0.7238, + "step": 4699 + }, + { + "epoch": 2.331637085763932, + "grad_norm": 0.12668332112462782, + "learning_rate": 1.8601929782793882e-06, + "loss": 0.663, + "step": 4700 + }, + { + "epoch": 2.3321335484671715, + "grad_norm": 0.1299102574958521, + "learning_rate": 1.8592482260239869e-06, + "loss": 0.7101, + "step": 4701 + }, + { + "epoch": 2.332630011170411, + "grad_norm": 0.1263931699115363, + "learning_rate": 1.8583035716972203e-06, + "loss": 0.6929, + "step": 4702 + }, + { + "epoch": 2.33312647387365, + "grad_norm": 0.12904444206327081, + "learning_rate": 1.8573590154434648e-06, + "loss": 0.7177, + "step": 4703 + }, + { + "epoch": 2.33362293657689, + "grad_norm": 0.12940234345889146, + "learning_rate": 1.856414557407079e-06, + "loss": 0.6601, + "step": 4704 + }, + { + "epoch": 2.3341193992801292, + "grad_norm": 0.12912909817222648, + "learning_rate": 1.8554701977324104e-06, + "loss": 0.7058, + "step": 4705 + }, + { + "epoch": 2.3346158619833686, + "grad_norm": 0.13166188626901182, + "learning_rate": 1.8545259365637869e-06, + "loss": 0.6971, + "step": 4706 + }, + { + "epoch": 2.335112324686608, + "grad_norm": 0.13412107252812872, + "learning_rate": 1.8535817740455243e-06, + "loss": 0.7281, + "step": 4707 + }, + { + "epoch": 2.3356087873898472, + "grad_norm": 0.14219099357241585, + "learning_rate": 1.8526377103219228e-06, + "loss": 0.7635, + "step": 4708 + }, + { + "epoch": 2.3361052500930866, + "grad_norm": 0.12785200595648355, + "learning_rate": 1.851693745537268e-06, + "loss": 0.6974, + "step": 4709 + }, + { + "epoch": 2.3366017127963263, + "grad_norm": 0.1281849252246767, + "learning_rate": 1.8507498798358298e-06, + "loss": 0.7275, + "step": 4710 + }, + { + "epoch": 2.3370981754995657, + "grad_norm": 0.13346008136791995, + "learning_rate": 1.849806113361862e-06, + "loss": 0.7424, + "step": 4711 + }, + { + "epoch": 2.337594638202805, + "grad_norm": 0.12589143302285735, + "learning_rate": 1.8488624462596045e-06, + "loss": 0.7088, + "step": 4712 + }, + { + "epoch": 2.3380911009060443, + "grad_norm": 0.12714926877182972, + "learning_rate": 1.8479188786732821e-06, + "loss": 0.7187, + "step": 4713 + }, + { + "epoch": 2.3385875636092837, + "grad_norm": 0.1279972258903848, + "learning_rate": 1.8469754107471045e-06, + "loss": 0.6718, + "step": 4714 + }, + { + "epoch": 2.3390840263125234, + "grad_norm": 0.1312317942984953, + "learning_rate": 1.846032042625264e-06, + "loss": 0.7033, + "step": 4715 + }, + { + "epoch": 2.3395804890157628, + "grad_norm": 0.12873451062526245, + "learning_rate": 1.8450887744519402e-06, + "loss": 0.702, + "step": 4716 + }, + { + "epoch": 2.340076951719002, + "grad_norm": 0.12882507141223737, + "learning_rate": 1.844145606371297e-06, + "loss": 0.7278, + "step": 4717 + }, + { + "epoch": 2.3405734144222414, + "grad_norm": 0.12746855121924552, + "learning_rate": 1.8432025385274816e-06, + "loss": 0.6702, + "step": 4718 + }, + { + "epoch": 2.3410698771254808, + "grad_norm": 0.1279698618848009, + "learning_rate": 1.8422595710646279e-06, + "loss": 0.6851, + "step": 4719 + }, + { + "epoch": 2.3415663398287205, + "grad_norm": 0.13439282233196687, + "learning_rate": 1.841316704126852e-06, + "loss": 0.7056, + "step": 4720 + }, + { + "epoch": 2.34206280253196, + "grad_norm": 0.13075250402139188, + "learning_rate": 1.840373937858257e-06, + "loss": 0.7028, + "step": 4721 + }, + { + "epoch": 2.342559265235199, + "grad_norm": 0.13576682060101108, + "learning_rate": 1.8394312724029302e-06, + "loss": 0.715, + "step": 4722 + }, + { + "epoch": 2.3430557279384385, + "grad_norm": 0.13095475394049344, + "learning_rate": 1.838488707904941e-06, + "loss": 0.7263, + "step": 4723 + }, + { + "epoch": 2.343552190641678, + "grad_norm": 0.12252662103848144, + "learning_rate": 1.8375462445083464e-06, + "loss": 0.6212, + "step": 4724 + }, + { + "epoch": 2.3440486533449176, + "grad_norm": 0.1321014567093405, + "learning_rate": 1.8366038823571864e-06, + "loss": 0.6869, + "step": 4725 + }, + { + "epoch": 2.344545116048157, + "grad_norm": 0.12839430799368412, + "learning_rate": 1.835661621595487e-06, + "loss": 0.7022, + "step": 4726 + }, + { + "epoch": 2.3450415787513963, + "grad_norm": 0.13572608728421834, + "learning_rate": 1.834719462367256e-06, + "loss": 0.7124, + "step": 4727 + }, + { + "epoch": 2.3455380414546356, + "grad_norm": 0.1327198212125845, + "learning_rate": 1.8337774048164886e-06, + "loss": 0.7145, + "step": 4728 + }, + { + "epoch": 2.346034504157875, + "grad_norm": 0.13233453890145816, + "learning_rate": 1.8328354490871624e-06, + "loss": 0.723, + "step": 4729 + }, + { + "epoch": 2.3465309668611147, + "grad_norm": 0.1275819850450933, + "learning_rate": 1.8318935953232417e-06, + "loss": 0.6562, + "step": 4730 + }, + { + "epoch": 2.347027429564354, + "grad_norm": 0.12862371806632422, + "learning_rate": 1.8309518436686718e-06, + "loss": 0.6557, + "step": 4731 + }, + { + "epoch": 2.3475238922675934, + "grad_norm": 0.13020123965864694, + "learning_rate": 1.8300101942673854e-06, + "loss": 0.7, + "step": 4732 + }, + { + "epoch": 2.3480203549708327, + "grad_norm": 0.13526821353378768, + "learning_rate": 1.829068647263298e-06, + "loss": 0.7129, + "step": 4733 + }, + { + "epoch": 2.348516817674072, + "grad_norm": 0.12778442163510337, + "learning_rate": 1.8281272028003105e-06, + "loss": 0.6919, + "step": 4734 + }, + { + "epoch": 2.349013280377312, + "grad_norm": 0.12730442338186904, + "learning_rate": 1.827185861022308e-06, + "loss": 0.6737, + "step": 4735 + }, + { + "epoch": 2.349509743080551, + "grad_norm": 0.13135300464883143, + "learning_rate": 1.8262446220731583e-06, + "loss": 0.6902, + "step": 4736 + }, + { + "epoch": 2.3500062057837905, + "grad_norm": 0.13633030408853347, + "learning_rate": 1.8253034860967156e-06, + "loss": 0.7665, + "step": 4737 + }, + { + "epoch": 2.35050266848703, + "grad_norm": 0.12967623071202264, + "learning_rate": 1.8243624532368176e-06, + "loss": 0.6925, + "step": 4738 + }, + { + "epoch": 2.350999131190269, + "grad_norm": 0.12823110926047526, + "learning_rate": 1.823421523637285e-06, + "loss": 0.6446, + "step": 4739 + }, + { + "epoch": 2.351495593893509, + "grad_norm": 0.13140489998321872, + "learning_rate": 1.822480697441924e-06, + "loss": 0.736, + "step": 4740 + }, + { + "epoch": 2.3519920565967483, + "grad_norm": 0.12898773935905025, + "learning_rate": 1.8215399747945256e-06, + "loss": 0.7298, + "step": 4741 + }, + { + "epoch": 2.3524885192999876, + "grad_norm": 0.139264930199382, + "learning_rate": 1.820599355838864e-06, + "loss": 0.716, + "step": 4742 + }, + { + "epoch": 2.352984982003227, + "grad_norm": 0.12812055474218081, + "learning_rate": 1.8196588407186972e-06, + "loss": 0.7316, + "step": 4743 + }, + { + "epoch": 2.3534814447064663, + "grad_norm": 0.1305997310787121, + "learning_rate": 1.8187184295777676e-06, + "loss": 0.6861, + "step": 4744 + }, + { + "epoch": 2.353977907409706, + "grad_norm": 0.1353100470985911, + "learning_rate": 1.8177781225598033e-06, + "loss": 0.7478, + "step": 4745 + }, + { + "epoch": 2.3544743701129454, + "grad_norm": 0.13303661646904097, + "learning_rate": 1.816837919808514e-06, + "loss": 0.7119, + "step": 4746 + }, + { + "epoch": 2.3549708328161847, + "grad_norm": 0.13363770660484806, + "learning_rate": 1.8158978214675953e-06, + "loss": 0.673, + "step": 4747 + }, + { + "epoch": 2.355467295519424, + "grad_norm": 0.13089630550011988, + "learning_rate": 1.8149578276807257e-06, + "loss": 0.6883, + "step": 4748 + }, + { + "epoch": 2.3559637582226634, + "grad_norm": 0.13168358638427374, + "learning_rate": 1.814017938591568e-06, + "loss": 0.6873, + "step": 4749 + }, + { + "epoch": 2.356460220925903, + "grad_norm": 0.13484692782487573, + "learning_rate": 1.8130781543437698e-06, + "loss": 0.7583, + "step": 4750 + }, + { + "epoch": 2.3569566836291425, + "grad_norm": 0.1306126749166303, + "learning_rate": 1.8121384750809623e-06, + "loss": 0.6905, + "step": 4751 + }, + { + "epoch": 2.357453146332382, + "grad_norm": 0.12784958982539274, + "learning_rate": 1.811198900946759e-06, + "loss": 0.6999, + "step": 4752 + }, + { + "epoch": 2.357949609035621, + "grad_norm": 0.13515705011245394, + "learning_rate": 1.81025943208476e-06, + "loss": 0.7064, + "step": 4753 + }, + { + "epoch": 2.3584460717388605, + "grad_norm": 0.13046415278522383, + "learning_rate": 1.8093200686385484e-06, + "loss": 0.7217, + "step": 4754 + }, + { + "epoch": 2.3589425344421002, + "grad_norm": 0.12991942766461034, + "learning_rate": 1.8083808107516892e-06, + "loss": 0.71, + "step": 4755 + }, + { + "epoch": 2.3594389971453396, + "grad_norm": 0.13073051110349773, + "learning_rate": 1.8074416585677335e-06, + "loss": 0.6924, + "step": 4756 + }, + { + "epoch": 2.359935459848579, + "grad_norm": 0.13719409511910485, + "learning_rate": 1.8065026122302165e-06, + "loss": 0.7551, + "step": 4757 + }, + { + "epoch": 2.360431922551818, + "grad_norm": 0.12947943146773364, + "learning_rate": 1.8055636718826557e-06, + "loss": 0.6755, + "step": 4758 + }, + { + "epoch": 2.3609283852550575, + "grad_norm": 0.12841550971350973, + "learning_rate": 1.8046248376685532e-06, + "loss": 0.703, + "step": 4759 + }, + { + "epoch": 2.3614248479582973, + "grad_norm": 0.1311479945588234, + "learning_rate": 1.8036861097313946e-06, + "loss": 0.7219, + "step": 4760 + }, + { + "epoch": 2.3619213106615367, + "grad_norm": 0.12849688100852175, + "learning_rate": 1.8027474882146498e-06, + "loss": 0.6653, + "step": 4761 + }, + { + "epoch": 2.362417773364776, + "grad_norm": 0.13113427313547274, + "learning_rate": 1.8018089732617716e-06, + "loss": 0.7302, + "step": 4762 + }, + { + "epoch": 2.3629142360680153, + "grad_norm": 0.12964668689183, + "learning_rate": 1.8008705650161979e-06, + "loss": 0.7078, + "step": 4763 + }, + { + "epoch": 2.3634106987712546, + "grad_norm": 0.13260160889826755, + "learning_rate": 1.799932263621348e-06, + "loss": 0.7268, + "step": 4764 + }, + { + "epoch": 2.3639071614744944, + "grad_norm": 0.13006599121273307, + "learning_rate": 1.7989940692206267e-06, + "loss": 0.7147, + "step": 4765 + }, + { + "epoch": 2.3644036241777338, + "grad_norm": 0.13478322856677002, + "learning_rate": 1.7980559819574222e-06, + "loss": 0.7086, + "step": 4766 + }, + { + "epoch": 2.364900086880973, + "grad_norm": 0.1278201414701354, + "learning_rate": 1.7971180019751068e-06, + "loss": 0.6734, + "step": 4767 + }, + { + "epoch": 2.3653965495842124, + "grad_norm": 0.12854256872198694, + "learning_rate": 1.7961801294170344e-06, + "loss": 0.7037, + "step": 4768 + }, + { + "epoch": 2.3658930122874517, + "grad_norm": 0.1326157442892243, + "learning_rate": 1.7952423644265441e-06, + "loss": 0.7093, + "step": 4769 + }, + { + "epoch": 2.366389474990691, + "grad_norm": 0.13759009456859725, + "learning_rate": 1.7943047071469597e-06, + "loss": 0.7401, + "step": 4770 + }, + { + "epoch": 2.366885937693931, + "grad_norm": 0.13497019322133594, + "learning_rate": 1.7933671577215846e-06, + "loss": 0.7085, + "step": 4771 + }, + { + "epoch": 2.36738240039717, + "grad_norm": 0.12878040498696178, + "learning_rate": 1.7924297162937095e-06, + "loss": 0.6994, + "step": 4772 + }, + { + "epoch": 2.3678788631004095, + "grad_norm": 0.12913566113987018, + "learning_rate": 1.7914923830066074e-06, + "loss": 0.7142, + "step": 4773 + }, + { + "epoch": 2.368375325803649, + "grad_norm": 0.13029016007788516, + "learning_rate": 1.790555158003535e-06, + "loss": 0.6907, + "step": 4774 + }, + { + "epoch": 2.3688717885068886, + "grad_norm": 0.1317576623479519, + "learning_rate": 1.7896180414277308e-06, + "loss": 0.7187, + "step": 4775 + }, + { + "epoch": 2.369368251210128, + "grad_norm": 0.13402471815823666, + "learning_rate": 1.7886810334224192e-06, + "loss": 0.6727, + "step": 4776 + }, + { + "epoch": 2.3698647139133673, + "grad_norm": 0.12910099553816476, + "learning_rate": 1.7877441341308065e-06, + "loss": 0.6883, + "step": 4777 + }, + { + "epoch": 2.3703611766166066, + "grad_norm": 0.12857345243673593, + "learning_rate": 1.7868073436960826e-06, + "loss": 0.7037, + "step": 4778 + }, + { + "epoch": 2.370857639319846, + "grad_norm": 0.13050987325503174, + "learning_rate": 1.7858706622614217e-06, + "loss": 0.7294, + "step": 4779 + }, + { + "epoch": 2.3713541020230853, + "grad_norm": 0.12825871824045287, + "learning_rate": 1.784934089969979e-06, + "loss": 0.6861, + "step": 4780 + }, + { + "epoch": 2.371850564726325, + "grad_norm": 0.13089580339165302, + "learning_rate": 1.783997626964896e-06, + "loss": 0.7541, + "step": 4781 + }, + { + "epoch": 2.3723470274295644, + "grad_norm": 0.1334628464244765, + "learning_rate": 1.783061273389295e-06, + "loss": 0.7078, + "step": 4782 + }, + { + "epoch": 2.3728434901328037, + "grad_norm": 0.1340782941366848, + "learning_rate": 1.7821250293862836e-06, + "loss": 0.7718, + "step": 4783 + }, + { + "epoch": 2.373339952836043, + "grad_norm": 0.13228953216571118, + "learning_rate": 1.7811888950989515e-06, + "loss": 0.7161, + "step": 4784 + }, + { + "epoch": 2.373836415539283, + "grad_norm": 0.1351301578501817, + "learning_rate": 1.780252870670371e-06, + "loss": 0.6803, + "step": 4785 + }, + { + "epoch": 2.374332878242522, + "grad_norm": 0.12686143379738357, + "learning_rate": 1.7793169562435996e-06, + "loss": 0.6786, + "step": 4786 + }, + { + "epoch": 2.3748293409457615, + "grad_norm": 0.12977295444856893, + "learning_rate": 1.7783811519616756e-06, + "loss": 0.7315, + "step": 4787 + }, + { + "epoch": 2.375325803649001, + "grad_norm": 0.13124111123654278, + "learning_rate": 1.777445457967622e-06, + "loss": 0.7298, + "step": 4788 + }, + { + "epoch": 2.37582226635224, + "grad_norm": 0.13452592431133958, + "learning_rate": 1.7765098744044452e-06, + "loss": 0.6963, + "step": 4789 + }, + { + "epoch": 2.3763187290554795, + "grad_norm": 0.13429719254562902, + "learning_rate": 1.7755744014151338e-06, + "loss": 0.6756, + "step": 4790 + }, + { + "epoch": 2.3768151917587192, + "grad_norm": 0.12721035894910274, + "learning_rate": 1.7746390391426598e-06, + "loss": 0.6617, + "step": 4791 + }, + { + "epoch": 2.3773116544619586, + "grad_norm": 0.12936319272862273, + "learning_rate": 1.7737037877299784e-06, + "loss": 0.683, + "step": 4792 + }, + { + "epoch": 2.377808117165198, + "grad_norm": 0.13813116770259098, + "learning_rate": 1.7727686473200276e-06, + "loss": 0.7706, + "step": 4793 + }, + { + "epoch": 2.3783045798684372, + "grad_norm": 0.13184854493613757, + "learning_rate": 1.771833618055729e-06, + "loss": 0.7047, + "step": 4794 + }, + { + "epoch": 2.378801042571677, + "grad_norm": 0.1326136257727315, + "learning_rate": 1.7708987000799866e-06, + "loss": 0.6991, + "step": 4795 + }, + { + "epoch": 2.3792975052749163, + "grad_norm": 0.13583743329061757, + "learning_rate": 1.7699638935356876e-06, + "loss": 0.7719, + "step": 4796 + }, + { + "epoch": 2.3797939679781557, + "grad_norm": 0.13008368245130267, + "learning_rate": 1.7690291985657021e-06, + "loss": 0.6925, + "step": 4797 + }, + { + "epoch": 2.380290430681395, + "grad_norm": 0.12928279258704853, + "learning_rate": 1.7680946153128833e-06, + "loss": 0.6964, + "step": 4798 + }, + { + "epoch": 2.3807868933846343, + "grad_norm": 0.126896632250804, + "learning_rate": 1.7671601439200675e-06, + "loss": 0.7119, + "step": 4799 + }, + { + "epoch": 2.3812833560878737, + "grad_norm": 0.1343279347693961, + "learning_rate": 1.766225784530074e-06, + "loss": 0.7165, + "step": 4800 + }, + { + "epoch": 2.3817798187911134, + "grad_norm": 0.12647325466363368, + "learning_rate": 1.7652915372857035e-06, + "loss": 0.6949, + "step": 4801 + }, + { + "epoch": 2.3822762814943528, + "grad_norm": 0.1300158284756839, + "learning_rate": 1.7643574023297424e-06, + "loss": 0.7007, + "step": 4802 + }, + { + "epoch": 2.382772744197592, + "grad_norm": 0.1303069401818913, + "learning_rate": 1.7634233798049563e-06, + "loss": 0.6973, + "step": 4803 + }, + { + "epoch": 2.3832692069008314, + "grad_norm": 0.1250185249073528, + "learning_rate": 1.7624894698540964e-06, + "loss": 0.6759, + "step": 4804 + }, + { + "epoch": 2.383765669604071, + "grad_norm": 0.12683133498041957, + "learning_rate": 1.7615556726198963e-06, + "loss": 0.6577, + "step": 4805 + }, + { + "epoch": 2.3842621323073105, + "grad_norm": 0.13065274070747337, + "learning_rate": 1.7606219882450713e-06, + "loss": 0.7088, + "step": 4806 + }, + { + "epoch": 2.38475859501055, + "grad_norm": 0.13340077639503192, + "learning_rate": 1.7596884168723209e-06, + "loss": 0.7117, + "step": 4807 + }, + { + "epoch": 2.385255057713789, + "grad_norm": 0.12846892823423328, + "learning_rate": 1.7587549586443253e-06, + "loss": 0.7508, + "step": 4808 + }, + { + "epoch": 2.3857515204170285, + "grad_norm": 0.133668088002894, + "learning_rate": 1.7578216137037496e-06, + "loss": 0.7036, + "step": 4809 + }, + { + "epoch": 2.386247983120268, + "grad_norm": 0.1297358762173912, + "learning_rate": 1.75688838219324e-06, + "loss": 0.6972, + "step": 4810 + }, + { + "epoch": 2.3867444458235076, + "grad_norm": 0.12907419259927128, + "learning_rate": 1.755955264255427e-06, + "loss": 0.6749, + "step": 4811 + }, + { + "epoch": 2.387240908526747, + "grad_norm": 0.14320560575199254, + "learning_rate": 1.7550222600329214e-06, + "loss": 0.7493, + "step": 4812 + }, + { + "epoch": 2.3877373712299863, + "grad_norm": 0.13107914192089937, + "learning_rate": 1.7540893696683187e-06, + "loss": 0.6801, + "step": 4813 + }, + { + "epoch": 2.3882338339332256, + "grad_norm": 0.1299399190627306, + "learning_rate": 1.753156593304196e-06, + "loss": 0.6749, + "step": 4814 + }, + { + "epoch": 2.3887302966364654, + "grad_norm": 0.12936932954561534, + "learning_rate": 1.7522239310831134e-06, + "loss": 0.7269, + "step": 4815 + }, + { + "epoch": 2.3892267593397047, + "grad_norm": 0.13265135552744697, + "learning_rate": 1.7512913831476135e-06, + "loss": 0.7383, + "step": 4816 + }, + { + "epoch": 2.389723222042944, + "grad_norm": 0.13764593191752025, + "learning_rate": 1.750358949640221e-06, + "loss": 0.7176, + "step": 4817 + }, + { + "epoch": 2.3902196847461834, + "grad_norm": 0.12796750916884678, + "learning_rate": 1.749426630703444e-06, + "loss": 0.6821, + "step": 4818 + }, + { + "epoch": 2.3907161474494227, + "grad_norm": 0.13180747695825185, + "learning_rate": 1.7484944264797713e-06, + "loss": 0.7342, + "step": 4819 + }, + { + "epoch": 2.391212610152662, + "grad_norm": 0.14126658198697672, + "learning_rate": 1.7475623371116759e-06, + "loss": 0.7135, + "step": 4820 + }, + { + "epoch": 2.391709072855902, + "grad_norm": 0.1305260276629293, + "learning_rate": 1.7466303627416129e-06, + "loss": 0.7204, + "step": 4821 + }, + { + "epoch": 2.392205535559141, + "grad_norm": 0.12966913916891723, + "learning_rate": 1.7456985035120194e-06, + "loss": 0.6982, + "step": 4822 + }, + { + "epoch": 2.3927019982623805, + "grad_norm": 0.1254585082749373, + "learning_rate": 1.7447667595653162e-06, + "loss": 0.6747, + "step": 4823 + }, + { + "epoch": 2.39319846096562, + "grad_norm": 0.15094163056194543, + "learning_rate": 1.7438351310439036e-06, + "loss": 0.7751, + "step": 4824 + }, + { + "epoch": 2.3936949236688596, + "grad_norm": 0.12776045500839703, + "learning_rate": 1.7429036180901674e-06, + "loss": 0.7159, + "step": 4825 + }, + { + "epoch": 2.394191386372099, + "grad_norm": 0.13208737547487745, + "learning_rate": 1.741972220846474e-06, + "loss": 0.67, + "step": 4826 + }, + { + "epoch": 2.3946878490753383, + "grad_norm": 0.12814416225344802, + "learning_rate": 1.741040939455173e-06, + "loss": 0.6881, + "step": 4827 + }, + { + "epoch": 2.3951843117785776, + "grad_norm": 0.1329316175712077, + "learning_rate": 1.7401097740585947e-06, + "loss": 0.7058, + "step": 4828 + }, + { + "epoch": 2.395680774481817, + "grad_norm": 0.1314293899989232, + "learning_rate": 1.7391787247990537e-06, + "loss": 0.6882, + "step": 4829 + }, + { + "epoch": 2.3961772371850563, + "grad_norm": 0.13031161109059494, + "learning_rate": 1.7382477918188462e-06, + "loss": 0.7541, + "step": 4830 + }, + { + "epoch": 2.396673699888296, + "grad_norm": 0.12930814709648858, + "learning_rate": 1.7373169752602493e-06, + "loss": 0.7058, + "step": 4831 + }, + { + "epoch": 2.3971701625915354, + "grad_norm": 0.1291944574432241, + "learning_rate": 1.7363862752655248e-06, + "loss": 0.6674, + "step": 4832 + }, + { + "epoch": 2.3976666252947747, + "grad_norm": 0.1304229911398871, + "learning_rate": 1.735455691976914e-06, + "loss": 0.6796, + "step": 4833 + }, + { + "epoch": 2.398163087998014, + "grad_norm": 0.13562249887328157, + "learning_rate": 1.7345252255366434e-06, + "loss": 0.7102, + "step": 4834 + }, + { + "epoch": 2.398659550701254, + "grad_norm": 0.13155960598330008, + "learning_rate": 1.7335948760869175e-06, + "loss": 0.6952, + "step": 4835 + }, + { + "epoch": 2.399156013404493, + "grad_norm": 0.12956566494989916, + "learning_rate": 1.7326646437699262e-06, + "loss": 0.7399, + "step": 4836 + }, + { + "epoch": 2.3996524761077325, + "grad_norm": 0.129991409065886, + "learning_rate": 1.7317345287278408e-06, + "loss": 0.6942, + "step": 4837 + }, + { + "epoch": 2.400148938810972, + "grad_norm": 0.13550941541419204, + "learning_rate": 1.7308045311028148e-06, + "loss": 0.7386, + "step": 4838 + }, + { + "epoch": 2.400645401514211, + "grad_norm": 0.12897072490235445, + "learning_rate": 1.7298746510369836e-06, + "loss": 0.6888, + "step": 4839 + }, + { + "epoch": 2.4011418642174505, + "grad_norm": 0.12791367362789313, + "learning_rate": 1.728944888672463e-06, + "loss": 0.6921, + "step": 4840 + }, + { + "epoch": 2.4016383269206902, + "grad_norm": 0.12980510973949744, + "learning_rate": 1.7280152441513536e-06, + "loss": 0.6914, + "step": 4841 + }, + { + "epoch": 2.4021347896239296, + "grad_norm": 0.12698243358765066, + "learning_rate": 1.7270857176157363e-06, + "loss": 0.7243, + "step": 4842 + }, + { + "epoch": 2.402631252327169, + "grad_norm": 0.12943217000030752, + "learning_rate": 1.7261563092076742e-06, + "loss": 0.6894, + "step": 4843 + }, + { + "epoch": 2.4031277150304082, + "grad_norm": 0.14555273425787743, + "learning_rate": 1.7252270190692133e-06, + "loss": 0.7761, + "step": 4844 + }, + { + "epoch": 2.403624177733648, + "grad_norm": 0.13293572550265437, + "learning_rate": 1.724297847342379e-06, + "loss": 0.7029, + "step": 4845 + }, + { + "epoch": 2.4041206404368873, + "grad_norm": 0.12653010232213735, + "learning_rate": 1.7233687941691819e-06, + "loss": 0.6353, + "step": 4846 + }, + { + "epoch": 2.4046171031401267, + "grad_norm": 0.1314378535643785, + "learning_rate": 1.722439859691612e-06, + "loss": 0.698, + "step": 4847 + }, + { + "epoch": 2.405113565843366, + "grad_norm": 0.12696993067028972, + "learning_rate": 1.7215110440516427e-06, + "loss": 0.6477, + "step": 4848 + }, + { + "epoch": 2.4056100285466053, + "grad_norm": 0.1293761765904413, + "learning_rate": 1.720582347391228e-06, + "loss": 0.6863, + "step": 4849 + }, + { + "epoch": 2.4061064912498447, + "grad_norm": 0.12977487418429212, + "learning_rate": 1.7196537698523052e-06, + "loss": 0.6806, + "step": 4850 + }, + { + "epoch": 2.4066029539530844, + "grad_norm": 0.13188762511672084, + "learning_rate": 1.718725311576791e-06, + "loss": 0.7124, + "step": 4851 + }, + { + "epoch": 2.4070994166563238, + "grad_norm": 0.1290870421059128, + "learning_rate": 1.7177969727065861e-06, + "loss": 0.7406, + "step": 4852 + }, + { + "epoch": 2.407595879359563, + "grad_norm": 0.1324826867302518, + "learning_rate": 1.716868753383572e-06, + "loss": 0.7023, + "step": 4853 + }, + { + "epoch": 2.4080923420628024, + "grad_norm": 0.13190718413667324, + "learning_rate": 1.7159406537496127e-06, + "loss": 0.698, + "step": 4854 + }, + { + "epoch": 2.4085888047660418, + "grad_norm": 0.12929265660157932, + "learning_rate": 1.7150126739465534e-06, + "loss": 0.6906, + "step": 4855 + }, + { + "epoch": 2.4090852674692815, + "grad_norm": 0.12998790137705218, + "learning_rate": 1.7140848141162201e-06, + "loss": 0.663, + "step": 4856 + }, + { + "epoch": 2.409581730172521, + "grad_norm": 0.13078521035118826, + "learning_rate": 1.7131570744004216e-06, + "loss": 0.6851, + "step": 4857 + }, + { + "epoch": 2.41007819287576, + "grad_norm": 0.12843707540780416, + "learning_rate": 1.7122294549409486e-06, + "loss": 0.6911, + "step": 4858 + }, + { + "epoch": 2.4105746555789995, + "grad_norm": 0.13192234560191793, + "learning_rate": 1.7113019558795722e-06, + "loss": 0.7878, + "step": 4859 + }, + { + "epoch": 2.411071118282239, + "grad_norm": 0.1319340861365238, + "learning_rate": 1.7103745773580465e-06, + "loss": 0.7045, + "step": 4860 + }, + { + "epoch": 2.4115675809854786, + "grad_norm": 0.13307582097518272, + "learning_rate": 1.7094473195181054e-06, + "loss": 0.7248, + "step": 4861 + }, + { + "epoch": 2.412064043688718, + "grad_norm": 0.13305909648819517, + "learning_rate": 1.7085201825014663e-06, + "loss": 0.7069, + "step": 4862 + }, + { + "epoch": 2.4125605063919573, + "grad_norm": 0.12564507225345678, + "learning_rate": 1.7075931664498265e-06, + "loss": 0.6419, + "step": 4863 + }, + { + "epoch": 2.4130569690951966, + "grad_norm": 0.1353169682104955, + "learning_rate": 1.7066662715048668e-06, + "loss": 0.74, + "step": 4864 + }, + { + "epoch": 2.413553431798436, + "grad_norm": 0.12985183063208408, + "learning_rate": 1.7057394978082465e-06, + "loss": 0.6654, + "step": 4865 + }, + { + "epoch": 2.4140498945016757, + "grad_norm": 0.13177829235442678, + "learning_rate": 1.7048128455016105e-06, + "loss": 0.7362, + "step": 4866 + }, + { + "epoch": 2.414546357204915, + "grad_norm": 0.13043257360310376, + "learning_rate": 1.7038863147265796e-06, + "loss": 0.7091, + "step": 4867 + }, + { + "epoch": 2.4150428199081544, + "grad_norm": 0.12897077992222986, + "learning_rate": 1.702959905624761e-06, + "loss": 0.7194, + "step": 4868 + }, + { + "epoch": 2.4155392826113937, + "grad_norm": 0.129946311887381, + "learning_rate": 1.7020336183377414e-06, + "loss": 0.7313, + "step": 4869 + }, + { + "epoch": 2.416035745314633, + "grad_norm": 0.13239014442468938, + "learning_rate": 1.7011074530070888e-06, + "loss": 0.7355, + "step": 4870 + }, + { + "epoch": 2.416532208017873, + "grad_norm": 0.13601224242524987, + "learning_rate": 1.700181409774353e-06, + "loss": 0.7279, + "step": 4871 + }, + { + "epoch": 2.417028670721112, + "grad_norm": 0.13833920681618717, + "learning_rate": 1.6992554887810642e-06, + "loss": 0.7629, + "step": 4872 + }, + { + "epoch": 2.4175251334243515, + "grad_norm": 0.13419638193226116, + "learning_rate": 1.6983296901687351e-06, + "loss": 0.7244, + "step": 4873 + }, + { + "epoch": 2.418021596127591, + "grad_norm": 0.12689180227839011, + "learning_rate": 1.697404014078859e-06, + "loss": 0.6595, + "step": 4874 + }, + { + "epoch": 2.41851805883083, + "grad_norm": 0.13100363032562112, + "learning_rate": 1.6964784606529106e-06, + "loss": 0.6761, + "step": 4875 + }, + { + "epoch": 2.41901452153407, + "grad_norm": 0.13288360240949995, + "learning_rate": 1.6955530300323467e-06, + "loss": 0.7103, + "step": 4876 + }, + { + "epoch": 2.4195109842373093, + "grad_norm": 0.13073699217986937, + "learning_rate": 1.6946277223586033e-06, + "loss": 0.688, + "step": 4877 + }, + { + "epoch": 2.4200074469405486, + "grad_norm": 0.13046075666698376, + "learning_rate": 1.6937025377730992e-06, + "loss": 0.6997, + "step": 4878 + }, + { + "epoch": 2.420503909643788, + "grad_norm": 0.1375868918604195, + "learning_rate": 1.6927774764172344e-06, + "loss": 0.7279, + "step": 4879 + }, + { + "epoch": 2.4210003723470273, + "grad_norm": 0.1290611075411292, + "learning_rate": 1.6918525384323892e-06, + "loss": 0.6923, + "step": 4880 + }, + { + "epoch": 2.421496835050267, + "grad_norm": 0.12719530065201395, + "learning_rate": 1.6909277239599266e-06, + "loss": 0.7026, + "step": 4881 + }, + { + "epoch": 2.4219932977535064, + "grad_norm": 0.12404152026479529, + "learning_rate": 1.690003033141189e-06, + "loss": 0.6793, + "step": 4882 + }, + { + "epoch": 2.4224897604567457, + "grad_norm": 0.1281425281209562, + "learning_rate": 1.6890784661175003e-06, + "loss": 0.6684, + "step": 4883 + }, + { + "epoch": 2.422986223159985, + "grad_norm": 0.13530606073217435, + "learning_rate": 1.6881540230301654e-06, + "loss": 0.7593, + "step": 4884 + }, + { + "epoch": 2.4234826858632244, + "grad_norm": 0.13469127543994983, + "learning_rate": 1.687229704020471e-06, + "loss": 0.7071, + "step": 4885 + }, + { + "epoch": 2.423979148566464, + "grad_norm": 0.13832168876093476, + "learning_rate": 1.6863055092296849e-06, + "loss": 0.7108, + "step": 4886 + }, + { + "epoch": 2.4244756112697035, + "grad_norm": 0.13226395995413887, + "learning_rate": 1.6853814387990553e-06, + "loss": 0.7019, + "step": 4887 + }, + { + "epoch": 2.424972073972943, + "grad_norm": 0.13201504974645803, + "learning_rate": 1.684457492869811e-06, + "loss": 0.7436, + "step": 4888 + }, + { + "epoch": 2.425468536676182, + "grad_norm": 0.12931429802413458, + "learning_rate": 1.683533671583163e-06, + "loss": 0.7012, + "step": 4889 + }, + { + "epoch": 2.4259649993794214, + "grad_norm": 0.13156592559267707, + "learning_rate": 1.6826099750803015e-06, + "loss": 0.7484, + "step": 4890 + }, + { + "epoch": 2.4264614620826612, + "grad_norm": 0.12950517255649976, + "learning_rate": 1.6816864035024e-06, + "loss": 0.7049, + "step": 4891 + }, + { + "epoch": 2.4269579247859006, + "grad_norm": 0.12953295412900734, + "learning_rate": 1.6807629569906113e-06, + "loss": 0.7068, + "step": 4892 + }, + { + "epoch": 2.42745438748914, + "grad_norm": 0.13090036376760572, + "learning_rate": 1.679839635686069e-06, + "loss": 0.705, + "step": 4893 + }, + { + "epoch": 2.427950850192379, + "grad_norm": 0.12975357279758154, + "learning_rate": 1.678916439729888e-06, + "loss": 0.6817, + "step": 4894 + }, + { + "epoch": 2.4284473128956185, + "grad_norm": 0.13799080774336994, + "learning_rate": 1.6779933692631639e-06, + "loss": 0.7595, + "step": 4895 + }, + { + "epoch": 2.4289437755988583, + "grad_norm": 0.1350060724395047, + "learning_rate": 1.6770704244269737e-06, + "loss": 0.7284, + "step": 4896 + }, + { + "epoch": 2.4294402383020977, + "grad_norm": 0.1303139802763453, + "learning_rate": 1.6761476053623748e-06, + "loss": 0.7219, + "step": 4897 + }, + { + "epoch": 2.429936701005337, + "grad_norm": 0.1273561658716899, + "learning_rate": 1.675224912210405e-06, + "loss": 0.7254, + "step": 4898 + }, + { + "epoch": 2.4304331637085763, + "grad_norm": 0.13457224415207078, + "learning_rate": 1.6743023451120831e-06, + "loss": 0.7421, + "step": 4899 + }, + { + "epoch": 2.4309296264118156, + "grad_norm": 0.13264396885754928, + "learning_rate": 1.673379904208408e-06, + "loss": 0.7332, + "step": 4900 + }, + { + "epoch": 2.4314260891150554, + "grad_norm": 0.1349589055432028, + "learning_rate": 1.672457589640361e-06, + "loss": 0.7287, + "step": 4901 + }, + { + "epoch": 2.4319225518182948, + "grad_norm": 0.13262022984457753, + "learning_rate": 1.6715354015489028e-06, + "loss": 0.7355, + "step": 4902 + }, + { + "epoch": 2.432419014521534, + "grad_norm": 0.13029145762137417, + "learning_rate": 1.6706133400749752e-06, + "loss": 0.7185, + "step": 4903 + }, + { + "epoch": 2.4329154772247734, + "grad_norm": 0.12961039350323814, + "learning_rate": 1.6696914053595004e-06, + "loss": 0.7257, + "step": 4904 + }, + { + "epoch": 2.4334119399280127, + "grad_norm": 0.13202507745720699, + "learning_rate": 1.668769597543381e-06, + "loss": 0.6851, + "step": 4905 + }, + { + "epoch": 2.4339084026312525, + "grad_norm": 0.13315852105173265, + "learning_rate": 1.6678479167675005e-06, + "loss": 0.7019, + "step": 4906 + }, + { + "epoch": 2.434404865334492, + "grad_norm": 0.1407425970112104, + "learning_rate": 1.6669263631727239e-06, + "loss": 0.7146, + "step": 4907 + }, + { + "epoch": 2.434901328037731, + "grad_norm": 0.13143176907968773, + "learning_rate": 1.6660049368998958e-06, + "loss": 0.7233, + "step": 4908 + }, + { + "epoch": 2.4353977907409705, + "grad_norm": 0.12643932478055675, + "learning_rate": 1.6650836380898402e-06, + "loss": 0.6687, + "step": 4909 + }, + { + "epoch": 2.43589425344421, + "grad_norm": 0.12854404310842657, + "learning_rate": 1.664162466883364e-06, + "loss": 0.7138, + "step": 4910 + }, + { + "epoch": 2.436390716147449, + "grad_norm": 0.1334591427590283, + "learning_rate": 1.663241423421253e-06, + "loss": 0.6816, + "step": 4911 + }, + { + "epoch": 2.436887178850689, + "grad_norm": 0.1293428646101715, + "learning_rate": 1.6623205078442739e-06, + "loss": 0.7063, + "step": 4912 + }, + { + "epoch": 2.4373836415539283, + "grad_norm": 0.13844369579181617, + "learning_rate": 1.6613997202931745e-06, + "loss": 0.7307, + "step": 4913 + }, + { + "epoch": 2.4378801042571676, + "grad_norm": 0.13252378360013214, + "learning_rate": 1.6604790609086818e-06, + "loss": 0.6987, + "step": 4914 + }, + { + "epoch": 2.438376566960407, + "grad_norm": 0.13126753203447522, + "learning_rate": 1.6595585298315043e-06, + "loss": 0.6512, + "step": 4915 + }, + { + "epoch": 2.4388730296636467, + "grad_norm": 0.13183951523709572, + "learning_rate": 1.6586381272023295e-06, + "loss": 0.7062, + "step": 4916 + }, + { + "epoch": 2.439369492366886, + "grad_norm": 0.13000918489742638, + "learning_rate": 1.6577178531618266e-06, + "loss": 0.6794, + "step": 4917 + }, + { + "epoch": 2.4398659550701254, + "grad_norm": 0.13272390835266695, + "learning_rate": 1.6567977078506447e-06, + "loss": 0.6733, + "step": 4918 + }, + { + "epoch": 2.4403624177733647, + "grad_norm": 0.13196802532399263, + "learning_rate": 1.6558776914094138e-06, + "loss": 0.6988, + "step": 4919 + }, + { + "epoch": 2.440858880476604, + "grad_norm": 0.13711894757545962, + "learning_rate": 1.6549578039787436e-06, + "loss": 0.7248, + "step": 4920 + }, + { + "epoch": 2.4413553431798434, + "grad_norm": 0.13100192331000934, + "learning_rate": 1.6540380456992234e-06, + "loss": 0.7116, + "step": 4921 + }, + { + "epoch": 2.441851805883083, + "grad_norm": 0.13106084542326454, + "learning_rate": 1.653118416711424e-06, + "loss": 0.6839, + "step": 4922 + }, + { + "epoch": 2.4423482685863225, + "grad_norm": 0.13151486257585226, + "learning_rate": 1.6521989171558958e-06, + "loss": 0.7221, + "step": 4923 + }, + { + "epoch": 2.442844731289562, + "grad_norm": 0.13197614090048165, + "learning_rate": 1.6512795471731702e-06, + "loss": 0.7504, + "step": 4924 + }, + { + "epoch": 2.443341193992801, + "grad_norm": 0.12679261823216995, + "learning_rate": 1.6503603069037572e-06, + "loss": 0.6952, + "step": 4925 + }, + { + "epoch": 2.443837656696041, + "grad_norm": 0.12936331957688477, + "learning_rate": 1.6494411964881482e-06, + "loss": 0.6724, + "step": 4926 + }, + { + "epoch": 2.4443341193992802, + "grad_norm": 0.1342595649174414, + "learning_rate": 1.6485222160668147e-06, + "loss": 0.721, + "step": 4927 + }, + { + "epoch": 2.4448305821025196, + "grad_norm": 0.13228023991573945, + "learning_rate": 1.6476033657802081e-06, + "loss": 0.7077, + "step": 4928 + }, + { + "epoch": 2.445327044805759, + "grad_norm": 0.13098966056504466, + "learning_rate": 1.6466846457687603e-06, + "loss": 0.6618, + "step": 4929 + }, + { + "epoch": 2.4458235075089982, + "grad_norm": 0.1292930748407539, + "learning_rate": 1.6457660561728827e-06, + "loss": 0.6647, + "step": 4930 + }, + { + "epoch": 2.4463199702122376, + "grad_norm": 0.13636772124500665, + "learning_rate": 1.6448475971329667e-06, + "loss": 0.7434, + "step": 4931 + }, + { + "epoch": 2.4468164329154773, + "grad_norm": 0.13511041258014103, + "learning_rate": 1.6439292687893838e-06, + "loss": 0.723, + "step": 4932 + }, + { + "epoch": 2.4473128956187167, + "grad_norm": 0.13305991113837243, + "learning_rate": 1.6430110712824857e-06, + "loss": 0.7419, + "step": 4933 + }, + { + "epoch": 2.447809358321956, + "grad_norm": 0.1284621762342019, + "learning_rate": 1.6420930047526048e-06, + "loss": 0.703, + "step": 4934 + }, + { + "epoch": 2.4483058210251953, + "grad_norm": 0.13208141055635828, + "learning_rate": 1.6411750693400527e-06, + "loss": 0.7114, + "step": 4935 + }, + { + "epoch": 2.448802283728435, + "grad_norm": 0.1334427283256501, + "learning_rate": 1.6402572651851217e-06, + "loss": 0.7029, + "step": 4936 + }, + { + "epoch": 2.4492987464316744, + "grad_norm": 0.13331062356506063, + "learning_rate": 1.639339592428082e-06, + "loss": 0.7153, + "step": 4937 + }, + { + "epoch": 2.4497952091349138, + "grad_norm": 0.13759074892380946, + "learning_rate": 1.638422051209186e-06, + "loss": 0.7888, + "step": 4938 + }, + { + "epoch": 2.450291671838153, + "grad_norm": 0.1283084309158128, + "learning_rate": 1.6375046416686652e-06, + "loss": 0.6724, + "step": 4939 + }, + { + "epoch": 2.4507881345413924, + "grad_norm": 0.13133438114077706, + "learning_rate": 1.6365873639467314e-06, + "loss": 0.7411, + "step": 4940 + }, + { + "epoch": 2.4512845972446318, + "grad_norm": 0.13217904256740817, + "learning_rate": 1.635670218183575e-06, + "loss": 0.685, + "step": 4941 + }, + { + "epoch": 2.4517810599478715, + "grad_norm": 0.13190452013229165, + "learning_rate": 1.6347532045193664e-06, + "loss": 0.7293, + "step": 4942 + }, + { + "epoch": 2.452277522651111, + "grad_norm": 0.13055191375790906, + "learning_rate": 1.6338363230942583e-06, + "loss": 0.7185, + "step": 4943 + }, + { + "epoch": 2.45277398535435, + "grad_norm": 0.13277302356989085, + "learning_rate": 1.6329195740483797e-06, + "loss": 0.746, + "step": 4944 + }, + { + "epoch": 2.4532704480575895, + "grad_norm": 0.13175605359588108, + "learning_rate": 1.6320029575218424e-06, + "loss": 0.7421, + "step": 4945 + }, + { + "epoch": 2.4537669107608293, + "grad_norm": 0.13186597595104096, + "learning_rate": 1.6310864736547352e-06, + "loss": 0.707, + "step": 4946 + }, + { + "epoch": 2.4542633734640686, + "grad_norm": 0.12766107544270525, + "learning_rate": 1.6301701225871297e-06, + "loss": 0.6824, + "step": 4947 + }, + { + "epoch": 2.454759836167308, + "grad_norm": 0.13352247929313615, + "learning_rate": 1.629253904459073e-06, + "loss": 0.716, + "step": 4948 + }, + { + "epoch": 2.4552562988705473, + "grad_norm": 0.12517152600846954, + "learning_rate": 1.6283378194105958e-06, + "loss": 0.688, + "step": 4949 + }, + { + "epoch": 2.4557527615737866, + "grad_norm": 0.1281114060367298, + "learning_rate": 1.627421867581707e-06, + "loss": 0.6663, + "step": 4950 + }, + { + "epoch": 2.456249224277026, + "grad_norm": 0.12627705318778099, + "learning_rate": 1.6265060491123945e-06, + "loss": 0.7013, + "step": 4951 + }, + { + "epoch": 2.4567456869802657, + "grad_norm": 0.1324199896128721, + "learning_rate": 1.6255903641426282e-06, + "loss": 0.725, + "step": 4952 + }, + { + "epoch": 2.457242149683505, + "grad_norm": 0.13004158500704271, + "learning_rate": 1.6246748128123537e-06, + "loss": 0.6722, + "step": 4953 + }, + { + "epoch": 2.4577386123867444, + "grad_norm": 0.13174097008700347, + "learning_rate": 1.6237593952614994e-06, + "loss": 0.6889, + "step": 4954 + }, + { + "epoch": 2.4582350750899837, + "grad_norm": 0.13318866205780763, + "learning_rate": 1.622844111629972e-06, + "loss": 0.7379, + "step": 4955 + }, + { + "epoch": 2.4587315377932235, + "grad_norm": 0.1337606359354983, + "learning_rate": 1.6219289620576583e-06, + "loss": 0.7546, + "step": 4956 + }, + { + "epoch": 2.459228000496463, + "grad_norm": 0.12844627082050522, + "learning_rate": 1.6210139466844244e-06, + "loss": 0.6996, + "step": 4957 + }, + { + "epoch": 2.459724463199702, + "grad_norm": 0.12849347327276872, + "learning_rate": 1.6200990656501146e-06, + "loss": 0.7006, + "step": 4958 + }, + { + "epoch": 2.4602209259029415, + "grad_norm": 0.12716771704033275, + "learning_rate": 1.6191843190945544e-06, + "loss": 0.7079, + "step": 4959 + }, + { + "epoch": 2.460717388606181, + "grad_norm": 0.1313359040978842, + "learning_rate": 1.618269707157548e-06, + "loss": 0.705, + "step": 4960 + }, + { + "epoch": 2.46121385130942, + "grad_norm": 0.1332264821711248, + "learning_rate": 1.6173552299788805e-06, + "loss": 0.7429, + "step": 4961 + }, + { + "epoch": 2.46171031401266, + "grad_norm": 0.12976033946371668, + "learning_rate": 1.616440887698313e-06, + "loss": 0.6886, + "step": 4962 + }, + { + "epoch": 2.4622067767158993, + "grad_norm": 0.1275893351046771, + "learning_rate": 1.6155266804555903e-06, + "loss": 0.7263, + "step": 4963 + }, + { + "epoch": 2.4627032394191386, + "grad_norm": 0.13229798618783215, + "learning_rate": 1.614612608390432e-06, + "loss": 0.7365, + "step": 4964 + }, + { + "epoch": 2.463199702122378, + "grad_norm": 0.12896939980843639, + "learning_rate": 1.6136986716425404e-06, + "loss": 0.6556, + "step": 4965 + }, + { + "epoch": 2.4636961648256177, + "grad_norm": 0.1288248754058537, + "learning_rate": 1.6127848703515962e-06, + "loss": 0.6835, + "step": 4966 + }, + { + "epoch": 2.464192627528857, + "grad_norm": 0.13426674389004808, + "learning_rate": 1.6118712046572587e-06, + "loss": 0.7412, + "step": 4967 + }, + { + "epoch": 2.4646890902320964, + "grad_norm": 0.13038176699779896, + "learning_rate": 1.6109576746991684e-06, + "loss": 0.6967, + "step": 4968 + }, + { + "epoch": 2.4651855529353357, + "grad_norm": 0.13301186253518696, + "learning_rate": 1.6100442806169423e-06, + "loss": 0.7605, + "step": 4969 + }, + { + "epoch": 2.465682015638575, + "grad_norm": 0.13134857714155998, + "learning_rate": 1.6091310225501782e-06, + "loss": 0.7556, + "step": 4970 + }, + { + "epoch": 2.4661784783418144, + "grad_norm": 0.1332594834907538, + "learning_rate": 1.6082179006384535e-06, + "loss": 0.7556, + "step": 4971 + }, + { + "epoch": 2.466674941045054, + "grad_norm": 0.1250597053615197, + "learning_rate": 1.6073049150213243e-06, + "loss": 0.6891, + "step": 4972 + }, + { + "epoch": 2.4671714037482935, + "grad_norm": 0.1321067771404992, + "learning_rate": 1.6063920658383258e-06, + "loss": 0.6851, + "step": 4973 + }, + { + "epoch": 2.467667866451533, + "grad_norm": 0.12621805146652318, + "learning_rate": 1.605479353228972e-06, + "loss": 0.6641, + "step": 4974 + }, + { + "epoch": 2.468164329154772, + "grad_norm": 0.12940412380904856, + "learning_rate": 1.6045667773327562e-06, + "loss": 0.6976, + "step": 4975 + }, + { + "epoch": 2.468660791858012, + "grad_norm": 0.13197779767436546, + "learning_rate": 1.6036543382891512e-06, + "loss": 0.7368, + "step": 4976 + }, + { + "epoch": 2.4691572545612512, + "grad_norm": 0.1322531297862413, + "learning_rate": 1.6027420362376092e-06, + "loss": 0.7481, + "step": 4977 + }, + { + "epoch": 2.4696537172644906, + "grad_norm": 0.1269834035874267, + "learning_rate": 1.6018298713175602e-06, + "loss": 0.6829, + "step": 4978 + }, + { + "epoch": 2.47015017996773, + "grad_norm": 0.13354786856232534, + "learning_rate": 1.6009178436684153e-06, + "loss": 0.7016, + "step": 4979 + }, + { + "epoch": 2.4706466426709692, + "grad_norm": 0.13062340464990232, + "learning_rate": 1.6000059534295614e-06, + "loss": 0.7193, + "step": 4980 + }, + { + "epoch": 2.4711431053742086, + "grad_norm": 0.1306702771269536, + "learning_rate": 1.599094200740367e-06, + "loss": 0.7059, + "step": 4981 + }, + { + "epoch": 2.4716395680774483, + "grad_norm": 0.13005823715313075, + "learning_rate": 1.5981825857401796e-06, + "loss": 0.6991, + "step": 4982 + }, + { + "epoch": 2.4721360307806877, + "grad_norm": 0.12974971867119356, + "learning_rate": 1.5972711085683241e-06, + "loss": 0.7102, + "step": 4983 + }, + { + "epoch": 2.472632493483927, + "grad_norm": 0.12960690860093224, + "learning_rate": 1.596359769364106e-06, + "loss": 0.6814, + "step": 4984 + }, + { + "epoch": 2.4731289561871663, + "grad_norm": 0.1264988539550879, + "learning_rate": 1.5954485682668075e-06, + "loss": 0.6807, + "step": 4985 + }, + { + "epoch": 2.473625418890406, + "grad_norm": 0.13011994137414026, + "learning_rate": 1.5945375054156926e-06, + "loss": 0.7104, + "step": 4986 + }, + { + "epoch": 2.4741218815936454, + "grad_norm": 0.14027792550347323, + "learning_rate": 1.5936265809500012e-06, + "loss": 0.7041, + "step": 4987 + }, + { + "epoch": 2.4746183442968848, + "grad_norm": 0.13203696744805918, + "learning_rate": 1.592715795008955e-06, + "loss": 0.7102, + "step": 4988 + }, + { + "epoch": 2.475114807000124, + "grad_norm": 0.1318975498818065, + "learning_rate": 1.5918051477317525e-06, + "loss": 0.7065, + "step": 4989 + }, + { + "epoch": 2.4756112697033634, + "grad_norm": 0.13322263973824197, + "learning_rate": 1.5908946392575713e-06, + "loss": 0.7667, + "step": 4990 + }, + { + "epoch": 2.4761077324066028, + "grad_norm": 0.1275784020608823, + "learning_rate": 1.5899842697255676e-06, + "loss": 0.7045, + "step": 4991 + }, + { + "epoch": 2.4766041951098425, + "grad_norm": 0.12621490198653326, + "learning_rate": 1.5890740392748778e-06, + "loss": 0.6955, + "step": 4992 + }, + { + "epoch": 2.477100657813082, + "grad_norm": 0.13147433530074085, + "learning_rate": 1.588163948044615e-06, + "loss": 0.6624, + "step": 4993 + }, + { + "epoch": 2.477597120516321, + "grad_norm": 0.12864338251047736, + "learning_rate": 1.587253996173873e-06, + "loss": 0.6439, + "step": 4994 + }, + { + "epoch": 2.4780935832195605, + "grad_norm": 0.12915557397838578, + "learning_rate": 1.5863441838017235e-06, + "loss": 0.6478, + "step": 4995 + }, + { + "epoch": 2.4785900459228, + "grad_norm": 0.13124819867045107, + "learning_rate": 1.585434511067216e-06, + "loss": 0.763, + "step": 4996 + }, + { + "epoch": 2.4790865086260396, + "grad_norm": 0.13444097971021127, + "learning_rate": 1.5845249781093786e-06, + "loss": 0.7284, + "step": 4997 + }, + { + "epoch": 2.479582971329279, + "grad_norm": 0.12597972778964714, + "learning_rate": 1.5836155850672202e-06, + "loss": 0.6775, + "step": 4998 + }, + { + "epoch": 2.4800794340325183, + "grad_norm": 0.13275658755510444, + "learning_rate": 1.5827063320797266e-06, + "loss": 0.7051, + "step": 4999 + }, + { + "epoch": 2.4805758967357576, + "grad_norm": 0.12509208961914398, + "learning_rate": 1.5817972192858624e-06, + "loss": 0.7111, + "step": 5000 + }, + { + "epoch": 2.481072359438997, + "grad_norm": 0.12842493642077443, + "learning_rate": 1.5808882468245706e-06, + "loss": 0.6731, + "step": 5001 + }, + { + "epoch": 2.4815688221422367, + "grad_norm": 0.13994016275679041, + "learning_rate": 1.5799794148347738e-06, + "loss": 0.7332, + "step": 5002 + }, + { + "epoch": 2.482065284845476, + "grad_norm": 0.13160007453858913, + "learning_rate": 1.579070723455372e-06, + "loss": 0.7184, + "step": 5003 + }, + { + "epoch": 2.4825617475487154, + "grad_norm": 0.12476420674542148, + "learning_rate": 1.5781621728252439e-06, + "loss": 0.7323, + "step": 5004 + }, + { + "epoch": 2.4830582102519547, + "grad_norm": 0.13342172172903713, + "learning_rate": 1.5772537630832477e-06, + "loss": 0.6979, + "step": 5005 + }, + { + "epoch": 2.483554672955194, + "grad_norm": 0.13261519543470868, + "learning_rate": 1.5763454943682183e-06, + "loss": 0.6798, + "step": 5006 + }, + { + "epoch": 2.484051135658434, + "grad_norm": 0.13196930690487815, + "learning_rate": 1.5754373668189703e-06, + "loss": 0.6926, + "step": 5007 + }, + { + "epoch": 2.484547598361673, + "grad_norm": 0.1284437605438022, + "learning_rate": 1.5745293805742968e-06, + "loss": 0.7104, + "step": 5008 + }, + { + "epoch": 2.4850440610649125, + "grad_norm": 0.1271584061825038, + "learning_rate": 1.5736215357729683e-06, + "loss": 0.7149, + "step": 5009 + }, + { + "epoch": 2.485540523768152, + "grad_norm": 0.12986340829886878, + "learning_rate": 1.5727138325537355e-06, + "loss": 0.7077, + "step": 5010 + }, + { + "epoch": 2.486036986471391, + "grad_norm": 0.13788400838359663, + "learning_rate": 1.5718062710553253e-06, + "loss": 0.751, + "step": 5011 + }, + { + "epoch": 2.486533449174631, + "grad_norm": 0.1348639652151765, + "learning_rate": 1.5708988514164442e-06, + "loss": 0.7636, + "step": 5012 + }, + { + "epoch": 2.4870299118778703, + "grad_norm": 0.13129936769800069, + "learning_rate": 1.569991573775776e-06, + "loss": 0.6946, + "step": 5013 + }, + { + "epoch": 2.4875263745811096, + "grad_norm": 0.131016067710159, + "learning_rate": 1.5690844382719844e-06, + "loss": 0.7299, + "step": 5014 + }, + { + "epoch": 2.488022837284349, + "grad_norm": 0.13469638317810959, + "learning_rate": 1.5681774450437104e-06, + "loss": 0.7123, + "step": 5015 + }, + { + "epoch": 2.4885192999875883, + "grad_norm": 0.12704079453926578, + "learning_rate": 1.5672705942295735e-06, + "loss": 0.6537, + "step": 5016 + }, + { + "epoch": 2.489015762690828, + "grad_norm": 0.12753576550951762, + "learning_rate": 1.5663638859681706e-06, + "loss": 0.7195, + "step": 5017 + }, + { + "epoch": 2.4895122253940674, + "grad_norm": 0.13049316211920853, + "learning_rate": 1.5654573203980782e-06, + "loss": 0.7029, + "step": 5018 + }, + { + "epoch": 2.4900086880973067, + "grad_norm": 0.12674348845812947, + "learning_rate": 1.5645508976578501e-06, + "loss": 0.6749, + "step": 5019 + }, + { + "epoch": 2.490505150800546, + "grad_norm": 0.13119282880613511, + "learning_rate": 1.5636446178860188e-06, + "loss": 0.7071, + "step": 5020 + }, + { + "epoch": 2.4910016135037854, + "grad_norm": 0.12857653431579513, + "learning_rate": 1.5627384812210945e-06, + "loss": 0.7098, + "step": 5021 + }, + { + "epoch": 2.491498076207025, + "grad_norm": 0.1299396609011369, + "learning_rate": 1.561832487801565e-06, + "loss": 0.7165, + "step": 5022 + }, + { + "epoch": 2.4919945389102645, + "grad_norm": 0.13187761577665433, + "learning_rate": 1.560926637765897e-06, + "loss": 0.6788, + "step": 5023 + }, + { + "epoch": 2.492491001613504, + "grad_norm": 0.13676658429929037, + "learning_rate": 1.560020931252536e-06, + "loss": 0.774, + "step": 5024 + }, + { + "epoch": 2.492987464316743, + "grad_norm": 0.2391281093395671, + "learning_rate": 1.5591153683999045e-06, + "loss": 0.7162, + "step": 5025 + }, + { + "epoch": 2.4934839270199824, + "grad_norm": 0.1321280807778348, + "learning_rate": 1.5582099493464032e-06, + "loss": 0.7193, + "step": 5026 + }, + { + "epoch": 2.4939803897232222, + "grad_norm": 0.12836015141870605, + "learning_rate": 1.5573046742304104e-06, + "loss": 0.7098, + "step": 5027 + }, + { + "epoch": 2.4944768524264616, + "grad_norm": 0.12731919945775716, + "learning_rate": 1.5563995431902834e-06, + "loss": 0.7006, + "step": 5028 + }, + { + "epoch": 2.494973315129701, + "grad_norm": 0.129683691750222, + "learning_rate": 1.5554945563643564e-06, + "loss": 0.7271, + "step": 5029 + }, + { + "epoch": 2.49546977783294, + "grad_norm": 0.12619265071805652, + "learning_rate": 1.5545897138909423e-06, + "loss": 0.7269, + "step": 5030 + }, + { + "epoch": 2.4959662405361795, + "grad_norm": 0.12864219160660795, + "learning_rate": 1.5536850159083319e-06, + "loss": 0.7173, + "step": 5031 + }, + { + "epoch": 2.4964627032394193, + "grad_norm": 0.12745661281840148, + "learning_rate": 1.5527804625547937e-06, + "loss": 0.6913, + "step": 5032 + }, + { + "epoch": 2.4969591659426587, + "grad_norm": 0.12858809922709444, + "learning_rate": 1.5518760539685752e-06, + "loss": 0.7134, + "step": 5033 + }, + { + "epoch": 2.497455628645898, + "grad_norm": 0.12827890445554876, + "learning_rate": 1.550971790287899e-06, + "loss": 0.6648, + "step": 5034 + }, + { + "epoch": 2.4979520913491373, + "grad_norm": 0.13300762899595023, + "learning_rate": 1.5500676716509683e-06, + "loss": 0.7096, + "step": 5035 + }, + { + "epoch": 2.4984485540523766, + "grad_norm": 0.13329585467541116, + "learning_rate": 1.549163698195963e-06, + "loss": 0.739, + "step": 5036 + }, + { + "epoch": 2.4989450167556164, + "grad_norm": 0.13351068526158444, + "learning_rate": 1.5482598700610412e-06, + "loss": 0.7359, + "step": 5037 + }, + { + "epoch": 2.4994414794588558, + "grad_norm": 0.13746974435710943, + "learning_rate": 1.5473561873843378e-06, + "loss": 0.7429, + "step": 5038 + }, + { + "epoch": 2.499937942162095, + "grad_norm": 0.1311946484491049, + "learning_rate": 1.5464526503039668e-06, + "loss": 0.7333, + "step": 5039 + }, + { + "epoch": 2.5004344048653344, + "grad_norm": 0.13003936186444456, + "learning_rate": 1.5455492589580195e-06, + "loss": 0.6862, + "step": 5040 + }, + { + "epoch": 2.5004344048653344, + "eval_loss": 0.7251803278923035, + "eval_runtime": 135.8229, + "eval_samples_per_second": 223.475, + "eval_steps_per_second": 27.941, + "step": 5040 + }, + { + "epoch": 2.5009308675685737, + "grad_norm": 0.12908028509507846, + "learning_rate": 1.5446460134845642e-06, + "loss": 0.6778, + "step": 5041 + }, + { + "epoch": 2.501427330271813, + "grad_norm": 0.13166843080491383, + "learning_rate": 1.543742914021648e-06, + "loss": 0.7092, + "step": 5042 + }, + { + "epoch": 2.501923792975053, + "grad_norm": 0.13088243606617905, + "learning_rate": 1.5428399607072956e-06, + "loss": 0.7373, + "step": 5043 + }, + { + "epoch": 2.502420255678292, + "grad_norm": 0.12629275270854762, + "learning_rate": 1.541937153679508e-06, + "loss": 0.708, + "step": 5044 + }, + { + "epoch": 2.5029167183815315, + "grad_norm": 0.13225143684011295, + "learning_rate": 1.5410344930762648e-06, + "loss": 0.701, + "step": 5045 + }, + { + "epoch": 2.503413181084771, + "grad_norm": 0.13340939597155863, + "learning_rate": 1.5401319790355232e-06, + "loss": 0.7098, + "step": 5046 + }, + { + "epoch": 2.5039096437880106, + "grad_norm": 0.13043769011930573, + "learning_rate": 1.5392296116952181e-06, + "loss": 0.6952, + "step": 5047 + }, + { + "epoch": 2.50440610649125, + "grad_norm": 0.13682071863988435, + "learning_rate": 1.5383273911932627e-06, + "loss": 0.7542, + "step": 5048 + }, + { + "epoch": 2.5049025691944893, + "grad_norm": 0.13032047556352128, + "learning_rate": 1.5374253176675464e-06, + "loss": 0.6926, + "step": 5049 + }, + { + "epoch": 2.5053990318977286, + "grad_norm": 0.13244019431180845, + "learning_rate": 1.5365233912559364e-06, + "loss": 0.7175, + "step": 5050 + }, + { + "epoch": 2.505895494600968, + "grad_norm": 0.13817001240133622, + "learning_rate": 1.5356216120962774e-06, + "loss": 0.7282, + "step": 5051 + }, + { + "epoch": 2.5063919573042073, + "grad_norm": 0.13140272145786022, + "learning_rate": 1.5347199803263927e-06, + "loss": 0.6855, + "step": 5052 + }, + { + "epoch": 2.506888420007447, + "grad_norm": 0.1360933004526906, + "learning_rate": 1.5338184960840824e-06, + "loss": 0.7409, + "step": 5053 + }, + { + "epoch": 2.5073848827106864, + "grad_norm": 0.13091513330256985, + "learning_rate": 1.5329171595071227e-06, + "loss": 0.6919, + "step": 5054 + }, + { + "epoch": 2.5078813454139257, + "grad_norm": 0.1265533074222129, + "learning_rate": 1.5320159707332695e-06, + "loss": 0.6243, + "step": 5055 + }, + { + "epoch": 2.508377808117165, + "grad_norm": 0.1252099518570422, + "learning_rate": 1.5311149299002542e-06, + "loss": 0.6715, + "step": 5056 + }, + { + "epoch": 2.508874270820405, + "grad_norm": 0.13609272644415313, + "learning_rate": 1.5302140371457875e-06, + "loss": 0.732, + "step": 5057 + }, + { + "epoch": 2.509370733523644, + "grad_norm": 0.13323577694421218, + "learning_rate": 1.529313292607556e-06, + "loss": 0.6702, + "step": 5058 + }, + { + "epoch": 2.5098671962268835, + "grad_norm": 0.13041020040639237, + "learning_rate": 1.5284126964232244e-06, + "loss": 0.7071, + "step": 5059 + }, + { + "epoch": 2.510363658930123, + "grad_norm": 0.1304162413787588, + "learning_rate": 1.5275122487304337e-06, + "loss": 0.7312, + "step": 5060 + }, + { + "epoch": 2.510860121633362, + "grad_norm": 0.13021570150444117, + "learning_rate": 1.5266119496668025e-06, + "loss": 0.7283, + "step": 5061 + }, + { + "epoch": 2.5113565843366015, + "grad_norm": 0.1279339703409767, + "learning_rate": 1.5257117993699276e-06, + "loss": 0.696, + "step": 5062 + }, + { + "epoch": 2.5118530470398412, + "grad_norm": 0.13240644628046436, + "learning_rate": 1.524811797977383e-06, + "loss": 0.6864, + "step": 5063 + }, + { + "epoch": 2.5123495097430806, + "grad_norm": 0.1400762626467057, + "learning_rate": 1.523911945626719e-06, + "loss": 0.7678, + "step": 5064 + }, + { + "epoch": 2.51284597244632, + "grad_norm": 0.13014151678535776, + "learning_rate": 1.5230122424554644e-06, + "loss": 0.7036, + "step": 5065 + }, + { + "epoch": 2.5133424351495592, + "grad_norm": 0.12985990176866047, + "learning_rate": 1.5221126886011228e-06, + "loss": 0.7224, + "step": 5066 + }, + { + "epoch": 2.513838897852799, + "grad_norm": 0.1317976170535492, + "learning_rate": 1.5212132842011778e-06, + "loss": 0.6924, + "step": 5067 + }, + { + "epoch": 2.5143353605560383, + "grad_norm": 0.12970897949094737, + "learning_rate": 1.5203140293930888e-06, + "loss": 0.6995, + "step": 5068 + }, + { + "epoch": 2.5148318232592777, + "grad_norm": 0.1303319284929338, + "learning_rate": 1.519414924314292e-06, + "loss": 0.7018, + "step": 5069 + }, + { + "epoch": 2.515328285962517, + "grad_norm": 0.12850755690334908, + "learning_rate": 1.5185159691022023e-06, + "loss": 0.7186, + "step": 5070 + }, + { + "epoch": 2.5158247486657563, + "grad_norm": 0.12718389440933345, + "learning_rate": 1.517617163894209e-06, + "loss": 0.6397, + "step": 5071 + }, + { + "epoch": 2.5163212113689957, + "grad_norm": 0.13884217395526113, + "learning_rate": 1.5167185088276815e-06, + "loss": 0.665, + "step": 5072 + }, + { + "epoch": 2.5168176740722354, + "grad_norm": 0.12603973119926584, + "learning_rate": 1.5158200040399635e-06, + "loss": 0.6631, + "step": 5073 + }, + { + "epoch": 2.5173141367754748, + "grad_norm": 0.1293877404461229, + "learning_rate": 1.5149216496683788e-06, + "loss": 0.7018, + "step": 5074 + }, + { + "epoch": 2.517810599478714, + "grad_norm": 0.12959977324157554, + "learning_rate": 1.514023445850225e-06, + "loss": 0.6999, + "step": 5075 + }, + { + "epoch": 2.5183070621819534, + "grad_norm": 0.13078603859075733, + "learning_rate": 1.513125392722779e-06, + "loss": 0.6854, + "step": 5076 + }, + { + "epoch": 2.518803524885193, + "grad_norm": 0.13366544771971153, + "learning_rate": 1.5122274904232925e-06, + "loss": 0.7107, + "step": 5077 + }, + { + "epoch": 2.5192999875884325, + "grad_norm": 0.1332349279647867, + "learning_rate": 1.5113297390889963e-06, + "loss": 0.7199, + "step": 5078 + }, + { + "epoch": 2.519796450291672, + "grad_norm": 0.12791951008145022, + "learning_rate": 1.5104321388570977e-06, + "loss": 0.6795, + "step": 5079 + }, + { + "epoch": 2.520292912994911, + "grad_norm": 0.1356457836131563, + "learning_rate": 1.50953468986478e-06, + "loss": 0.6902, + "step": 5080 + }, + { + "epoch": 2.5207893756981505, + "grad_norm": 0.12588538763738047, + "learning_rate": 1.5086373922492049e-06, + "loss": 0.6544, + "step": 5081 + }, + { + "epoch": 2.52128583840139, + "grad_norm": 0.1330568928734858, + "learning_rate": 1.5077402461475083e-06, + "loss": 0.6666, + "step": 5082 + }, + { + "epoch": 2.5217823011046296, + "grad_norm": 0.1225633830068647, + "learning_rate": 1.506843251696805e-06, + "loss": 0.6217, + "step": 5083 + }, + { + "epoch": 2.522278763807869, + "grad_norm": 0.12995958837521615, + "learning_rate": 1.505946409034187e-06, + "loss": 0.6922, + "step": 5084 + }, + { + "epoch": 2.5227752265111083, + "grad_norm": 0.12735849858705797, + "learning_rate": 1.5050497182967221e-06, + "loss": 0.6844, + "step": 5085 + }, + { + "epoch": 2.5232716892143476, + "grad_norm": 0.128231127592946, + "learning_rate": 1.5041531796214554e-06, + "loss": 0.7062, + "step": 5086 + }, + { + "epoch": 2.5237681519175874, + "grad_norm": 0.12711845362503602, + "learning_rate": 1.5032567931454073e-06, + "loss": 0.6898, + "step": 5087 + }, + { + "epoch": 2.5242646146208267, + "grad_norm": 0.13608076036992192, + "learning_rate": 1.5023605590055768e-06, + "loss": 0.691, + "step": 5088 + }, + { + "epoch": 2.524761077324066, + "grad_norm": 0.1301368936095168, + "learning_rate": 1.5014644773389391e-06, + "loss": 0.7156, + "step": 5089 + }, + { + "epoch": 2.5252575400273054, + "grad_norm": 0.13293944449573997, + "learning_rate": 1.5005685482824462e-06, + "loss": 0.7482, + "step": 5090 + }, + { + "epoch": 2.5257540027305447, + "grad_norm": 0.12733134898732534, + "learning_rate": 1.499672771973026e-06, + "loss": 0.6956, + "step": 5091 + }, + { + "epoch": 2.526250465433784, + "grad_norm": 0.12354114386385891, + "learning_rate": 1.4987771485475836e-06, + "loss": 0.6575, + "step": 5092 + }, + { + "epoch": 2.526746928137024, + "grad_norm": 0.1256909063925675, + "learning_rate": 1.497881678143e-06, + "loss": 0.6983, + "step": 5093 + }, + { + "epoch": 2.527243390840263, + "grad_norm": 0.12995383983928524, + "learning_rate": 1.4969863608961343e-06, + "loss": 0.7182, + "step": 5094 + }, + { + "epoch": 2.5277398535435025, + "grad_norm": 0.13104868294903146, + "learning_rate": 1.4960911969438213e-06, + "loss": 0.7503, + "step": 5095 + }, + { + "epoch": 2.528236316246742, + "grad_norm": 0.13569502975102957, + "learning_rate": 1.495196186422872e-06, + "loss": 0.6806, + "step": 5096 + }, + { + "epoch": 2.5287327789499816, + "grad_norm": 0.13339083267038424, + "learning_rate": 1.4943013294700758e-06, + "loss": 0.7056, + "step": 5097 + }, + { + "epoch": 2.529229241653221, + "grad_norm": 0.13361409013339606, + "learning_rate": 1.4934066262221954e-06, + "loss": 0.7391, + "step": 5098 + }, + { + "epoch": 2.5297257043564603, + "grad_norm": 0.12680487158505377, + "learning_rate": 1.492512076815973e-06, + "loss": 0.6482, + "step": 5099 + }, + { + "epoch": 2.5302221670596996, + "grad_norm": 0.12746892302880983, + "learning_rate": 1.4916176813881257e-06, + "loss": 0.6522, + "step": 5100 + }, + { + "epoch": 2.530718629762939, + "grad_norm": 0.13139044491664464, + "learning_rate": 1.4907234400753473e-06, + "loss": 0.712, + "step": 5101 + }, + { + "epoch": 2.5312150924661783, + "grad_norm": 0.12799076572044088, + "learning_rate": 1.4898293530143095e-06, + "loss": 0.6722, + "step": 5102 + }, + { + "epoch": 2.531711555169418, + "grad_norm": 0.12797705381984992, + "learning_rate": 1.4889354203416575e-06, + "loss": 0.624, + "step": 5103 + }, + { + "epoch": 2.5322080178726574, + "grad_norm": 0.1355225972612351, + "learning_rate": 1.4880416421940155e-06, + "loss": 0.7376, + "step": 5104 + }, + { + "epoch": 2.5327044805758967, + "grad_norm": 0.1294579579051531, + "learning_rate": 1.4871480187079828e-06, + "loss": 0.7096, + "step": 5105 + }, + { + "epoch": 2.533200943279136, + "grad_norm": 0.13381112874931977, + "learning_rate": 1.4862545500201358e-06, + "loss": 0.7277, + "step": 5106 + }, + { + "epoch": 2.533697405982376, + "grad_norm": 0.1246715734533751, + "learning_rate": 1.4853612362670271e-06, + "loss": 0.6442, + "step": 5107 + }, + { + "epoch": 2.534193868685615, + "grad_norm": 0.12830907556578108, + "learning_rate": 1.4844680775851846e-06, + "loss": 0.6337, + "step": 5108 + }, + { + "epoch": 2.5346903313888545, + "grad_norm": 0.1278448286390337, + "learning_rate": 1.4835750741111139e-06, + "loss": 0.7132, + "step": 5109 + }, + { + "epoch": 2.535186794092094, + "grad_norm": 0.1335432788354155, + "learning_rate": 1.4826822259812957e-06, + "loss": 0.6988, + "step": 5110 + }, + { + "epoch": 2.535683256795333, + "grad_norm": 0.134293152215964, + "learning_rate": 1.4817895333321875e-06, + "loss": 0.7336, + "step": 5111 + }, + { + "epoch": 2.5361797194985725, + "grad_norm": 0.13777460933447475, + "learning_rate": 1.4808969963002234e-06, + "loss": 0.7112, + "step": 5112 + }, + { + "epoch": 2.5366761822018122, + "grad_norm": 0.12897145890973283, + "learning_rate": 1.480004615021814e-06, + "loss": 0.6908, + "step": 5113 + }, + { + "epoch": 2.5371726449050516, + "grad_norm": 0.13400133466081912, + "learning_rate": 1.4791123896333438e-06, + "loss": 0.7064, + "step": 5114 + }, + { + "epoch": 2.537669107608291, + "grad_norm": 0.13211791906308365, + "learning_rate": 1.4782203202711764e-06, + "loss": 0.7026, + "step": 5115 + }, + { + "epoch": 2.5381655703115302, + "grad_norm": 0.13139779720721778, + "learning_rate": 1.4773284070716504e-06, + "loss": 0.7138, + "step": 5116 + }, + { + "epoch": 2.53866203301477, + "grad_norm": 0.13308362459519374, + "learning_rate": 1.4764366501710798e-06, + "loss": 0.7263, + "step": 5117 + }, + { + "epoch": 2.5391584957180093, + "grad_norm": 0.13141118126729773, + "learning_rate": 1.4755450497057563e-06, + "loss": 0.678, + "step": 5118 + }, + { + "epoch": 2.5396549584212487, + "grad_norm": 0.13186502701805516, + "learning_rate": 1.4746536058119454e-06, + "loss": 0.7602, + "step": 5119 + }, + { + "epoch": 2.540151421124488, + "grad_norm": 0.12869479855496058, + "learning_rate": 1.473762318625891e-06, + "loss": 0.7195, + "step": 5120 + }, + { + "epoch": 2.5406478838277273, + "grad_norm": 0.13445753218214146, + "learning_rate": 1.4728711882838115e-06, + "loss": 0.7307, + "step": 5121 + }, + { + "epoch": 2.5411443465309667, + "grad_norm": 0.12705563245150678, + "learning_rate": 1.471980214921902e-06, + "loss": 0.6385, + "step": 5122 + }, + { + "epoch": 2.5416408092342064, + "grad_norm": 0.13028590027975537, + "learning_rate": 1.4710893986763347e-06, + "loss": 0.705, + "step": 5123 + }, + { + "epoch": 2.5421372719374458, + "grad_norm": 0.12817908251977725, + "learning_rate": 1.4701987396832546e-06, + "loss": 0.7177, + "step": 5124 + }, + { + "epoch": 2.542633734640685, + "grad_norm": 0.12496892532709507, + "learning_rate": 1.4693082380787858e-06, + "loss": 0.6705, + "step": 5125 + }, + { + "epoch": 2.5431301973439244, + "grad_norm": 0.13059250706457962, + "learning_rate": 1.4684178939990264e-06, + "loss": 0.691, + "step": 5126 + }, + { + "epoch": 2.543626660047164, + "grad_norm": 0.1292904883407009, + "learning_rate": 1.467527707580052e-06, + "loss": 0.7285, + "step": 5127 + }, + { + "epoch": 2.5441231227504035, + "grad_norm": 0.12910095210438902, + "learning_rate": 1.466637678957913e-06, + "loss": 0.6785, + "step": 5128 + }, + { + "epoch": 2.544619585453643, + "grad_norm": 0.13231894500519423, + "learning_rate": 1.4657478082686363e-06, + "loss": 0.734, + "step": 5129 + }, + { + "epoch": 2.545116048156882, + "grad_norm": 0.13354137515536624, + "learning_rate": 1.4648580956482238e-06, + "loss": 0.6792, + "step": 5130 + }, + { + "epoch": 2.5456125108601215, + "grad_norm": 0.13437657602685885, + "learning_rate": 1.4639685412326543e-06, + "loss": 0.6869, + "step": 5131 + }, + { + "epoch": 2.546108973563361, + "grad_norm": 0.13419591499955405, + "learning_rate": 1.463079145157882e-06, + "loss": 0.7086, + "step": 5132 + }, + { + "epoch": 2.5466054362666006, + "grad_norm": 0.1317550238055129, + "learning_rate": 1.462189907559836e-06, + "loss": 0.7011, + "step": 5133 + }, + { + "epoch": 2.54710189896984, + "grad_norm": 0.12919424179545982, + "learning_rate": 1.4613008285744234e-06, + "loss": 0.7132, + "step": 5134 + }, + { + "epoch": 2.5475983616730793, + "grad_norm": 0.13156727464529705, + "learning_rate": 1.4604119083375242e-06, + "loss": 0.6697, + "step": 5135 + }, + { + "epoch": 2.5480948243763186, + "grad_norm": 0.13198996730417278, + "learning_rate": 1.4595231469849963e-06, + "loss": 0.6547, + "step": 5136 + }, + { + "epoch": 2.5485912870795584, + "grad_norm": 0.13504489395934224, + "learning_rate": 1.4586345446526735e-06, + "loss": 0.7082, + "step": 5137 + }, + { + "epoch": 2.5490877497827977, + "grad_norm": 0.13159429183393376, + "learning_rate": 1.457746101476362e-06, + "loss": 0.7077, + "step": 5138 + }, + { + "epoch": 2.549584212486037, + "grad_norm": 0.1324983434375749, + "learning_rate": 1.4568578175918502e-06, + "loss": 0.6928, + "step": 5139 + }, + { + "epoch": 2.5500806751892764, + "grad_norm": 0.13854603478769614, + "learning_rate": 1.455969693134893e-06, + "loss": 0.7396, + "step": 5140 + }, + { + "epoch": 2.5505771378925157, + "grad_norm": 0.12842830146725112, + "learning_rate": 1.4550817282412293e-06, + "loss": 0.7549, + "step": 5141 + }, + { + "epoch": 2.551073600595755, + "grad_norm": 0.129204728717719, + "learning_rate": 1.454193923046569e-06, + "loss": 0.7016, + "step": 5142 + }, + { + "epoch": 2.551570063298995, + "grad_norm": 0.12750868562867726, + "learning_rate": 1.4533062776866002e-06, + "loss": 0.6743, + "step": 5143 + }, + { + "epoch": 2.552066526002234, + "grad_norm": 0.12654769018231987, + "learning_rate": 1.452418792296984e-06, + "loss": 0.7008, + "step": 5144 + }, + { + "epoch": 2.5525629887054735, + "grad_norm": 0.13141605016356414, + "learning_rate": 1.4515314670133582e-06, + "loss": 0.7169, + "step": 5145 + }, + { + "epoch": 2.553059451408713, + "grad_norm": 0.13379502140501737, + "learning_rate": 1.4506443019713374e-06, + "loss": 0.6779, + "step": 5146 + }, + { + "epoch": 2.5535559141119526, + "grad_norm": 0.13256467196234833, + "learning_rate": 1.4497572973065091e-06, + "loss": 0.7206, + "step": 5147 + }, + { + "epoch": 2.554052376815192, + "grad_norm": 0.13184270552834443, + "learning_rate": 1.4488704531544396e-06, + "loss": 0.6414, + "step": 5148 + }, + { + "epoch": 2.5545488395184313, + "grad_norm": 0.12861101671614525, + "learning_rate": 1.4479837696506677e-06, + "loss": 0.7362, + "step": 5149 + }, + { + "epoch": 2.5550453022216706, + "grad_norm": 0.1262895343147098, + "learning_rate": 1.4470972469307076e-06, + "loss": 0.7061, + "step": 5150 + }, + { + "epoch": 2.55554176492491, + "grad_norm": 0.12872834020768845, + "learning_rate": 1.4462108851300524e-06, + "loss": 0.7172, + "step": 5151 + }, + { + "epoch": 2.5560382276281493, + "grad_norm": 0.13269323839535846, + "learning_rate": 1.4453246843841662e-06, + "loss": 0.7117, + "step": 5152 + }, + { + "epoch": 2.5565346903313886, + "grad_norm": 0.12885571431373338, + "learning_rate": 1.4444386448284925e-06, + "loss": 0.6573, + "step": 5153 + }, + { + "epoch": 2.5570311530346284, + "grad_norm": 0.13280071499296228, + "learning_rate": 1.4435527665984474e-06, + "loss": 0.6934, + "step": 5154 + }, + { + "epoch": 2.5575276157378677, + "grad_norm": 0.13454797554586437, + "learning_rate": 1.442667049829422e-06, + "loss": 0.7246, + "step": 5155 + }, + { + "epoch": 2.558024078441107, + "grad_norm": 0.12697947714667507, + "learning_rate": 1.4417814946567873e-06, + "loss": 0.7333, + "step": 5156 + }, + { + "epoch": 2.558520541144347, + "grad_norm": 0.12516155676293964, + "learning_rate": 1.4408961012158818e-06, + "loss": 0.6683, + "step": 5157 + }, + { + "epoch": 2.559017003847586, + "grad_norm": 0.1302587030682515, + "learning_rate": 1.4400108696420265e-06, + "loss": 0.6953, + "step": 5158 + }, + { + "epoch": 2.5595134665508255, + "grad_norm": 0.12649071492393144, + "learning_rate": 1.4391258000705143e-06, + "loss": 0.6955, + "step": 5159 + }, + { + "epoch": 2.560009929254065, + "grad_norm": 0.1319214562476525, + "learning_rate": 1.4382408926366125e-06, + "loss": 0.7035, + "step": 5160 + }, + { + "epoch": 2.560506391957304, + "grad_norm": 0.12479301266102043, + "learning_rate": 1.4373561474755675e-06, + "loss": 0.6584, + "step": 5161 + }, + { + "epoch": 2.5610028546605434, + "grad_norm": 0.12302337495146001, + "learning_rate": 1.4364715647225963e-06, + "loss": 0.6872, + "step": 5162 + }, + { + "epoch": 2.561499317363783, + "grad_norm": 0.1313391656279115, + "learning_rate": 1.435587144512895e-06, + "loss": 0.7152, + "step": 5163 + }, + { + "epoch": 2.5619957800670226, + "grad_norm": 0.13419103025976656, + "learning_rate": 1.434702886981632e-06, + "loss": 0.707, + "step": 5164 + }, + { + "epoch": 2.562492242770262, + "grad_norm": 0.12781224085601264, + "learning_rate": 1.4338187922639506e-06, + "loss": 0.6512, + "step": 5165 + }, + { + "epoch": 2.562988705473501, + "grad_norm": 0.1269467241970245, + "learning_rate": 1.4329348604949733e-06, + "loss": 0.7062, + "step": 5166 + }, + { + "epoch": 2.563485168176741, + "grad_norm": 0.13011475251943433, + "learning_rate": 1.4320510918097927e-06, + "loss": 0.7054, + "step": 5167 + }, + { + "epoch": 2.5639816308799803, + "grad_norm": 0.12562228736537093, + "learning_rate": 1.4311674863434803e-06, + "loss": 0.682, + "step": 5168 + }, + { + "epoch": 2.5644780935832197, + "grad_norm": 0.1280244371960763, + "learning_rate": 1.43028404423108e-06, + "loss": 0.701, + "step": 5169 + }, + { + "epoch": 2.564974556286459, + "grad_norm": 0.12628433888008067, + "learning_rate": 1.4294007656076108e-06, + "loss": 0.6687, + "step": 5170 + }, + { + "epoch": 2.5654710189896983, + "grad_norm": 0.12632914581609597, + "learning_rate": 1.42851765060807e-06, + "loss": 0.6794, + "step": 5171 + }, + { + "epoch": 2.5659674816929376, + "grad_norm": 0.12626167379165484, + "learning_rate": 1.4276346993674267e-06, + "loss": 0.6737, + "step": 5172 + }, + { + "epoch": 2.566463944396177, + "grad_norm": 0.13881256752440968, + "learning_rate": 1.4267519120206251e-06, + "loss": 0.7359, + "step": 5173 + }, + { + "epoch": 2.5669604070994168, + "grad_norm": 0.13129891241921918, + "learning_rate": 1.425869288702585e-06, + "loss": 0.72, + "step": 5174 + }, + { + "epoch": 2.567456869802656, + "grad_norm": 0.13010534065898433, + "learning_rate": 1.4249868295482021e-06, + "loss": 0.6765, + "step": 5175 + }, + { + "epoch": 2.5679533325058954, + "grad_norm": 0.13578139920958562, + "learning_rate": 1.4241045346923463e-06, + "loss": 0.7659, + "step": 5176 + }, + { + "epoch": 2.568449795209135, + "grad_norm": 0.12966367090684447, + "learning_rate": 1.4232224042698606e-06, + "loss": 0.6878, + "step": 5177 + }, + { + "epoch": 2.5689462579123745, + "grad_norm": 0.1295589151537278, + "learning_rate": 1.4223404384155665e-06, + "loss": 0.6967, + "step": 5178 + }, + { + "epoch": 2.569442720615614, + "grad_norm": 0.12606606232317147, + "learning_rate": 1.4214586372642563e-06, + "loss": 0.6885, + "step": 5179 + }, + { + "epoch": 2.569939183318853, + "grad_norm": 0.12952004740829784, + "learning_rate": 1.4205770009507013e-06, + "loss": 0.7703, + "step": 5180 + }, + { + "epoch": 2.5704356460220925, + "grad_norm": 0.13061476133482602, + "learning_rate": 1.4196955296096449e-06, + "loss": 0.7473, + "step": 5181 + }, + { + "epoch": 2.570932108725332, + "grad_norm": 0.12905289024380742, + "learning_rate": 1.418814223375804e-06, + "loss": 0.7198, + "step": 5182 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.13521590335808284, + "learning_rate": 1.4179330823838749e-06, + "loss": 0.6663, + "step": 5183 + }, + { + "epoch": 2.571925034131811, + "grad_norm": 0.1357147047734266, + "learning_rate": 1.4170521067685234e-06, + "loss": 0.7672, + "step": 5184 + }, + { + "epoch": 2.5724214968350503, + "grad_norm": 0.1420237272434203, + "learning_rate": 1.4161712966643942e-06, + "loss": 0.768, + "step": 5185 + }, + { + "epoch": 2.5729179595382896, + "grad_norm": 0.14090038663886145, + "learning_rate": 1.415290652206105e-06, + "loss": 0.7108, + "step": 5186 + }, + { + "epoch": 2.573414422241529, + "grad_norm": 0.130067394360935, + "learning_rate": 1.4144101735282465e-06, + "loss": 0.6818, + "step": 5187 + }, + { + "epoch": 2.5739108849447687, + "grad_norm": 0.13055899063930992, + "learning_rate": 1.4135298607653885e-06, + "loss": 0.7142, + "step": 5188 + }, + { + "epoch": 2.574407347648008, + "grad_norm": 0.12598384203770444, + "learning_rate": 1.4126497140520696e-06, + "loss": 0.6844, + "step": 5189 + }, + { + "epoch": 2.5749038103512474, + "grad_norm": 0.13099481254462542, + "learning_rate": 1.4117697335228082e-06, + "loss": 0.6484, + "step": 5190 + }, + { + "epoch": 2.5754002730544867, + "grad_norm": 0.12722812757473498, + "learning_rate": 1.410889919312094e-06, + "loss": 0.6863, + "step": 5191 + }, + { + "epoch": 2.575896735757726, + "grad_norm": 0.13021667807315976, + "learning_rate": 1.4100102715543934e-06, + "loss": 0.7062, + "step": 5192 + }, + { + "epoch": 2.5763931984609654, + "grad_norm": 0.13204286958959552, + "learning_rate": 1.4091307903841467e-06, + "loss": 0.7369, + "step": 5193 + }, + { + "epoch": 2.576889661164205, + "grad_norm": 0.12825087423564113, + "learning_rate": 1.4082514759357668e-06, + "loss": 0.6818, + "step": 5194 + }, + { + "epoch": 2.5773861238674445, + "grad_norm": 0.13195934566675152, + "learning_rate": 1.4073723283436447e-06, + "loss": 0.7205, + "step": 5195 + }, + { + "epoch": 2.577882586570684, + "grad_norm": 0.13298051618730106, + "learning_rate": 1.4064933477421435e-06, + "loss": 0.7092, + "step": 5196 + }, + { + "epoch": 2.578379049273923, + "grad_norm": 0.13045643130916357, + "learning_rate": 1.4056145342656002e-06, + "loss": 0.721, + "step": 5197 + }, + { + "epoch": 2.578875511977163, + "grad_norm": 0.1287770798648141, + "learning_rate": 1.4047358880483292e-06, + "loss": 0.6623, + "step": 5198 + }, + { + "epoch": 2.5793719746804022, + "grad_norm": 0.12703038628187402, + "learning_rate": 1.403857409224615e-06, + "loss": 0.6748, + "step": 5199 + }, + { + "epoch": 2.5798684373836416, + "grad_norm": 0.13086879765206233, + "learning_rate": 1.4029790979287217e-06, + "loss": 0.6923, + "step": 5200 + }, + { + "epoch": 2.580364900086881, + "grad_norm": 0.13490815408873744, + "learning_rate": 1.402100954294884e-06, + "loss": 0.7387, + "step": 5201 + }, + { + "epoch": 2.5808613627901202, + "grad_norm": 0.13403369332196752, + "learning_rate": 1.4012229784573111e-06, + "loss": 0.7194, + "step": 5202 + }, + { + "epoch": 2.5813578254933596, + "grad_norm": 0.13057102934580292, + "learning_rate": 1.400345170550189e-06, + "loss": 0.6975, + "step": 5203 + }, + { + "epoch": 2.5818542881965993, + "grad_norm": 0.13217032680650126, + "learning_rate": 1.3994675307076766e-06, + "loss": 0.6857, + "step": 5204 + }, + { + "epoch": 2.5823507508998387, + "grad_norm": 0.12870495240664498, + "learning_rate": 1.3985900590639058e-06, + "loss": 0.6732, + "step": 5205 + }, + { + "epoch": 2.582847213603078, + "grad_norm": 0.1248568293868238, + "learning_rate": 1.397712755752984e-06, + "loss": 0.6753, + "step": 5206 + }, + { + "epoch": 2.5833436763063173, + "grad_norm": 0.1286211773697058, + "learning_rate": 1.3968356209089944e-06, + "loss": 0.698, + "step": 5207 + }, + { + "epoch": 2.583840139009557, + "grad_norm": 0.12476162094736613, + "learning_rate": 1.3959586546659926e-06, + "loss": 0.6571, + "step": 5208 + }, + { + "epoch": 2.5843366017127964, + "grad_norm": 0.13256034900710464, + "learning_rate": 1.3950818571580071e-06, + "loss": 0.7052, + "step": 5209 + }, + { + "epoch": 2.5848330644160358, + "grad_norm": 0.14170229500379294, + "learning_rate": 1.3942052285190453e-06, + "loss": 0.7395, + "step": 5210 + }, + { + "epoch": 2.585329527119275, + "grad_norm": 0.13306092777696724, + "learning_rate": 1.3933287688830827e-06, + "loss": 0.7503, + "step": 5211 + }, + { + "epoch": 2.5858259898225144, + "grad_norm": 0.12983382658565437, + "learning_rate": 1.3924524783840748e-06, + "loss": 0.6702, + "step": 5212 + }, + { + "epoch": 2.5863224525257538, + "grad_norm": 0.12964608984313164, + "learning_rate": 1.3915763571559477e-06, + "loss": 0.7302, + "step": 5213 + }, + { + "epoch": 2.5868189152289935, + "grad_norm": 0.130514060454796, + "learning_rate": 1.3907004053326006e-06, + "loss": 0.7262, + "step": 5214 + }, + { + "epoch": 2.587315377932233, + "grad_norm": 0.1339908897788451, + "learning_rate": 1.3898246230479119e-06, + "loss": 0.7146, + "step": 5215 + }, + { + "epoch": 2.587811840635472, + "grad_norm": 0.12515165316595245, + "learning_rate": 1.3889490104357278e-06, + "loss": 0.6528, + "step": 5216 + }, + { + "epoch": 2.5883083033387115, + "grad_norm": 0.13700532883591948, + "learning_rate": 1.3880735676298743e-06, + "loss": 0.7071, + "step": 5217 + }, + { + "epoch": 2.5888047660419513, + "grad_norm": 0.130354785696436, + "learning_rate": 1.3871982947641478e-06, + "loss": 0.6935, + "step": 5218 + }, + { + "epoch": 2.5893012287451906, + "grad_norm": 0.13701897641660335, + "learning_rate": 1.386323191972318e-06, + "loss": 0.7427, + "step": 5219 + }, + { + "epoch": 2.58979769144843, + "grad_norm": 0.1320735550699063, + "learning_rate": 1.3854482593881342e-06, + "loss": 0.7064, + "step": 5220 + }, + { + "epoch": 2.5902941541516693, + "grad_norm": 0.12715230958238605, + "learning_rate": 1.3845734971453114e-06, + "loss": 0.6718, + "step": 5221 + }, + { + "epoch": 2.5907906168549086, + "grad_norm": 0.13018988366015433, + "learning_rate": 1.3836989053775462e-06, + "loss": 0.7509, + "step": 5222 + }, + { + "epoch": 2.591287079558148, + "grad_norm": 0.13221945422649725, + "learning_rate": 1.3828244842185034e-06, + "loss": 0.7166, + "step": 5223 + }, + { + "epoch": 2.5917835422613877, + "grad_norm": 0.1304668288729789, + "learning_rate": 1.381950233801827e-06, + "loss": 0.6849, + "step": 5224 + }, + { + "epoch": 2.592280004964627, + "grad_norm": 0.12573646819731626, + "learning_rate": 1.3810761542611306e-06, + "loss": 0.6589, + "step": 5225 + }, + { + "epoch": 2.5927764676678664, + "grad_norm": 0.13280192854756245, + "learning_rate": 1.380202245730003e-06, + "loss": 0.7207, + "step": 5226 + }, + { + "epoch": 2.5932729303711057, + "grad_norm": 0.13071113755387934, + "learning_rate": 1.3793285083420077e-06, + "loss": 0.7014, + "step": 5227 + }, + { + "epoch": 2.5937693930743455, + "grad_norm": 0.12860260319733735, + "learning_rate": 1.3784549422306808e-06, + "loss": 0.6795, + "step": 5228 + }, + { + "epoch": 2.594265855777585, + "grad_norm": 0.1270646298734641, + "learning_rate": 1.3775815475295343e-06, + "loss": 0.6719, + "step": 5229 + }, + { + "epoch": 2.594762318480824, + "grad_norm": 0.131427270856042, + "learning_rate": 1.3767083243720516e-06, + "loss": 0.7102, + "step": 5230 + }, + { + "epoch": 2.5952587811840635, + "grad_norm": 0.1291160109583672, + "learning_rate": 1.37583527289169e-06, + "loss": 0.6879, + "step": 5231 + }, + { + "epoch": 2.595755243887303, + "grad_norm": 0.12483862070084152, + "learning_rate": 1.374962393221883e-06, + "loss": 0.6806, + "step": 5232 + }, + { + "epoch": 2.596251706590542, + "grad_norm": 0.1265687299366481, + "learning_rate": 1.3740896854960361e-06, + "loss": 0.6648, + "step": 5233 + }, + { + "epoch": 2.596748169293782, + "grad_norm": 0.12551276631561484, + "learning_rate": 1.3732171498475269e-06, + "loss": 0.6796, + "step": 5234 + }, + { + "epoch": 2.5972446319970213, + "grad_norm": 0.1919668140078233, + "learning_rate": 1.3723447864097105e-06, + "loss": 0.7173, + "step": 5235 + }, + { + "epoch": 2.5977410947002606, + "grad_norm": 0.13071121709395, + "learning_rate": 1.3714725953159136e-06, + "loss": 0.725, + "step": 5236 + }, + { + "epoch": 2.5982375574035, + "grad_norm": 0.12628300917307875, + "learning_rate": 1.3706005766994354e-06, + "loss": 0.6583, + "step": 5237 + }, + { + "epoch": 2.5987340201067397, + "grad_norm": 0.1324112740619143, + "learning_rate": 1.3697287306935498e-06, + "loss": 0.7062, + "step": 5238 + }, + { + "epoch": 2.599230482809979, + "grad_norm": 0.13328148315750796, + "learning_rate": 1.3688570574315058e-06, + "loss": 0.7195, + "step": 5239 + }, + { + "epoch": 2.5997269455132184, + "grad_norm": 0.1392294677729809, + "learning_rate": 1.3679855570465244e-06, + "loss": 0.7264, + "step": 5240 + }, + { + "epoch": 2.6002234082164577, + "grad_norm": 0.1257272610807176, + "learning_rate": 1.367114229671799e-06, + "loss": 0.6915, + "step": 5241 + }, + { + "epoch": 2.600719870919697, + "grad_norm": 0.12811010695255046, + "learning_rate": 1.3662430754405004e-06, + "loss": 0.7125, + "step": 5242 + }, + { + "epoch": 2.6012163336229364, + "grad_norm": 0.1329892421572713, + "learning_rate": 1.365372094485768e-06, + "loss": 0.7393, + "step": 5243 + }, + { + "epoch": 2.601712796326176, + "grad_norm": 0.1293006710207882, + "learning_rate": 1.36450128694072e-06, + "loss": 0.6771, + "step": 5244 + }, + { + "epoch": 2.6022092590294155, + "grad_norm": 0.13954401243263015, + "learning_rate": 1.3636306529384432e-06, + "loss": 0.7291, + "step": 5245 + }, + { + "epoch": 2.602705721732655, + "grad_norm": 0.12768761269525258, + "learning_rate": 1.3627601926120005e-06, + "loss": 0.6687, + "step": 5246 + }, + { + "epoch": 2.603202184435894, + "grad_norm": 0.1329893303483065, + "learning_rate": 1.3618899060944286e-06, + "loss": 0.7366, + "step": 5247 + }, + { + "epoch": 2.603698647139134, + "grad_norm": 0.13009174728461212, + "learning_rate": 1.3610197935187358e-06, + "loss": 0.6713, + "step": 5248 + }, + { + "epoch": 2.6041951098423732, + "grad_norm": 0.12802975892687476, + "learning_rate": 1.3601498550179059e-06, + "loss": 0.7355, + "step": 5249 + }, + { + "epoch": 2.6046915725456126, + "grad_norm": 0.1297273904569148, + "learning_rate": 1.3592800907248949e-06, + "loss": 0.7098, + "step": 5250 + }, + { + "epoch": 2.605188035248852, + "grad_norm": 0.12806246500929, + "learning_rate": 1.3584105007726312e-06, + "loss": 0.6429, + "step": 5251 + }, + { + "epoch": 2.6056844979520912, + "grad_norm": 0.13041176910308638, + "learning_rate": 1.3575410852940202e-06, + "loss": 0.6807, + "step": 5252 + }, + { + "epoch": 2.6061809606553306, + "grad_norm": 0.12886597784723972, + "learning_rate": 1.3566718444219342e-06, + "loss": 0.7237, + "step": 5253 + }, + { + "epoch": 2.6066774233585703, + "grad_norm": 0.12916633911425887, + "learning_rate": 1.355802778289226e-06, + "loss": 0.6739, + "step": 5254 + }, + { + "epoch": 2.6071738860618097, + "grad_norm": 0.1287678914063443, + "learning_rate": 1.3549338870287165e-06, + "loss": 0.6716, + "step": 5255 + }, + { + "epoch": 2.607670348765049, + "grad_norm": 0.13212685032768234, + "learning_rate": 1.3540651707732036e-06, + "loss": 0.7382, + "step": 5256 + }, + { + "epoch": 2.6081668114682883, + "grad_norm": 0.12905546375739468, + "learning_rate": 1.3531966296554555e-06, + "loss": 0.6551, + "step": 5257 + }, + { + "epoch": 2.608663274171528, + "grad_norm": 0.12906147644682286, + "learning_rate": 1.3523282638082142e-06, + "loss": 0.7328, + "step": 5258 + }, + { + "epoch": 2.6091597368747674, + "grad_norm": 0.1313108855974763, + "learning_rate": 1.3514600733641969e-06, + "loss": 0.6417, + "step": 5259 + }, + { + "epoch": 2.6096561995780068, + "grad_norm": 0.12974355187493028, + "learning_rate": 1.3505920584560913e-06, + "loss": 0.6928, + "step": 5260 + }, + { + "epoch": 2.610152662281246, + "grad_norm": 0.12727154967902357, + "learning_rate": 1.349724219216561e-06, + "loss": 0.669, + "step": 5261 + }, + { + "epoch": 2.6106491249844854, + "grad_norm": 0.12535216696134888, + "learning_rate": 1.3488565557782407e-06, + "loss": 0.682, + "step": 5262 + }, + { + "epoch": 2.6111455876877248, + "grad_norm": 0.13133855825746554, + "learning_rate": 1.347989068273738e-06, + "loss": 0.7084, + "step": 5263 + }, + { + "epoch": 2.6116420503909645, + "grad_norm": 0.13110682521576933, + "learning_rate": 1.3471217568356354e-06, + "loss": 0.7168, + "step": 5264 + }, + { + "epoch": 2.612138513094204, + "grad_norm": 0.12162732379351464, + "learning_rate": 1.3462546215964867e-06, + "loss": 0.6539, + "step": 5265 + }, + { + "epoch": 2.612634975797443, + "grad_norm": 0.13230036317983876, + "learning_rate": 1.345387662688821e-06, + "loss": 0.7189, + "step": 5266 + }, + { + "epoch": 2.6131314385006825, + "grad_norm": 0.12335108832614605, + "learning_rate": 1.3445208802451383e-06, + "loss": 0.6612, + "step": 5267 + }, + { + "epoch": 2.6136279012039223, + "grad_norm": 0.13204613627760084, + "learning_rate": 1.3436542743979125e-06, + "loss": 0.7287, + "step": 5268 + }, + { + "epoch": 2.6141243639071616, + "grad_norm": 0.12765603511898485, + "learning_rate": 1.34278784527959e-06, + "loss": 0.6788, + "step": 5269 + }, + { + "epoch": 2.614620826610401, + "grad_norm": 0.13326553938549146, + "learning_rate": 1.3419215930225898e-06, + "loss": 0.6903, + "step": 5270 + }, + { + "epoch": 2.6151172893136403, + "grad_norm": 0.13006724151910573, + "learning_rate": 1.341055517759307e-06, + "loss": 0.6968, + "step": 5271 + }, + { + "epoch": 2.6156137520168796, + "grad_norm": 0.12782169012677588, + "learning_rate": 1.3401896196221061e-06, + "loss": 0.6402, + "step": 5272 + }, + { + "epoch": 2.616110214720119, + "grad_norm": 0.1298644050131224, + "learning_rate": 1.3393238987433247e-06, + "loss": 0.6786, + "step": 5273 + }, + { + "epoch": 2.6166066774233587, + "grad_norm": 0.13093853621883342, + "learning_rate": 1.338458355255276e-06, + "loss": 0.7008, + "step": 5274 + }, + { + "epoch": 2.617103140126598, + "grad_norm": 0.1251589926214746, + "learning_rate": 1.3375929892902435e-06, + "loss": 0.6642, + "step": 5275 + }, + { + "epoch": 2.6175996028298374, + "grad_norm": 0.1332356566343829, + "learning_rate": 1.3367278009804852e-06, + "loss": 0.6859, + "step": 5276 + }, + { + "epoch": 2.6180960655330767, + "grad_norm": 0.1351996093141888, + "learning_rate": 1.3358627904582308e-06, + "loss": 0.6886, + "step": 5277 + }, + { + "epoch": 2.6185925282363165, + "grad_norm": 0.12760964101813727, + "learning_rate": 1.3349979578556827e-06, + "loss": 0.6775, + "step": 5278 + }, + { + "epoch": 2.619088990939556, + "grad_norm": 0.13492506375570365, + "learning_rate": 1.334133303305018e-06, + "loss": 0.7233, + "step": 5279 + }, + { + "epoch": 2.619585453642795, + "grad_norm": 0.13103285327539846, + "learning_rate": 1.3332688269383842e-06, + "loss": 0.729, + "step": 5280 + }, + { + "epoch": 2.6200819163460345, + "grad_norm": 0.12874709415836544, + "learning_rate": 1.3324045288879034e-06, + "loss": 0.7383, + "step": 5281 + }, + { + "epoch": 2.620578379049274, + "grad_norm": 0.12537613283777926, + "learning_rate": 1.33154040928567e-06, + "loss": 0.6694, + "step": 5282 + }, + { + "epoch": 2.621074841752513, + "grad_norm": 0.1229814161755244, + "learning_rate": 1.3306764682637487e-06, + "loss": 0.642, + "step": 5283 + }, + { + "epoch": 2.621571304455753, + "grad_norm": 0.13180449979729172, + "learning_rate": 1.329812705954183e-06, + "loss": 0.7413, + "step": 5284 + }, + { + "epoch": 2.6220677671589923, + "grad_norm": 0.1299850828781537, + "learning_rate": 1.3289491224889805e-06, + "loss": 0.7245, + "step": 5285 + }, + { + "epoch": 2.6225642298622316, + "grad_norm": 0.12657003185254417, + "learning_rate": 1.328085718000129e-06, + "loss": 0.691, + "step": 5286 + }, + { + "epoch": 2.623060692565471, + "grad_norm": 0.12873747042196176, + "learning_rate": 1.3272224926195847e-06, + "loss": 0.6719, + "step": 5287 + }, + { + "epoch": 2.6235571552687107, + "grad_norm": 0.12986302258803353, + "learning_rate": 1.326359446479279e-06, + "loss": 0.6856, + "step": 5288 + }, + { + "epoch": 2.62405361797195, + "grad_norm": 0.13177548514946316, + "learning_rate": 1.325496579711114e-06, + "loss": 0.7408, + "step": 5289 + }, + { + "epoch": 2.6245500806751894, + "grad_norm": 0.13080981754895, + "learning_rate": 1.3246338924469646e-06, + "loss": 0.6891, + "step": 5290 + }, + { + "epoch": 2.6250465433784287, + "grad_norm": 0.1319670698468296, + "learning_rate": 1.3237713848186799e-06, + "loss": 0.7171, + "step": 5291 + }, + { + "epoch": 2.625543006081668, + "grad_norm": 0.1268286576276871, + "learning_rate": 1.3229090569580782e-06, + "loss": 0.7105, + "step": 5292 + }, + { + "epoch": 2.6260394687849073, + "grad_norm": 0.13031273872848315, + "learning_rate": 1.3220469089969556e-06, + "loss": 0.6647, + "step": 5293 + }, + { + "epoch": 2.6265359314881467, + "grad_norm": 0.12705509580073604, + "learning_rate": 1.3211849410670755e-06, + "loss": 0.6708, + "step": 5294 + }, + { + "epoch": 2.6270323941913865, + "grad_norm": 0.12662469871597498, + "learning_rate": 1.3203231533001753e-06, + "loss": 0.7167, + "step": 5295 + }, + { + "epoch": 2.627528856894626, + "grad_norm": 0.1351319977157379, + "learning_rate": 1.3194615458279675e-06, + "loss": 0.7027, + "step": 5296 + }, + { + "epoch": 2.628025319597865, + "grad_norm": 0.1293833244334666, + "learning_rate": 1.3186001187821328e-06, + "loss": 0.737, + "step": 5297 + }, + { + "epoch": 2.628521782301105, + "grad_norm": 0.12681644363417152, + "learning_rate": 1.317738872294329e-06, + "loss": 0.687, + "step": 5298 + }, + { + "epoch": 2.6290182450043442, + "grad_norm": 0.12782685543955466, + "learning_rate": 1.3168778064961824e-06, + "loss": 0.6944, + "step": 5299 + }, + { + "epoch": 2.6295147077075836, + "grad_norm": 0.1283732539871523, + "learning_rate": 1.3160169215192929e-06, + "loss": 0.6593, + "step": 5300 + }, + { + "epoch": 2.630011170410823, + "grad_norm": 0.12786728676839182, + "learning_rate": 1.315156217495233e-06, + "loss": 0.6914, + "step": 5301 + }, + { + "epoch": 2.630507633114062, + "grad_norm": 0.12640922642050675, + "learning_rate": 1.3142956945555474e-06, + "loss": 0.6319, + "step": 5302 + }, + { + "epoch": 2.6310040958173015, + "grad_norm": 0.12770955937417205, + "learning_rate": 1.3134353528317539e-06, + "loss": 0.6829, + "step": 5303 + }, + { + "epoch": 2.631500558520541, + "grad_norm": 0.12759140923876178, + "learning_rate": 1.312575192455341e-06, + "loss": 0.7206, + "step": 5304 + }, + { + "epoch": 2.6319970212237807, + "grad_norm": 0.13233654333615097, + "learning_rate": 1.3117152135577721e-06, + "loss": 0.7442, + "step": 5305 + }, + { + "epoch": 2.63249348392702, + "grad_norm": 0.13201851791859284, + "learning_rate": 1.3108554162704797e-06, + "loss": 0.7381, + "step": 5306 + }, + { + "epoch": 2.6329899466302593, + "grad_norm": 0.13006021868404868, + "learning_rate": 1.3099958007248698e-06, + "loss": 0.685, + "step": 5307 + }, + { + "epoch": 2.633486409333499, + "grad_norm": 0.1308979224400638, + "learning_rate": 1.3091363670523225e-06, + "loss": 0.6923, + "step": 5308 + }, + { + "epoch": 2.6339828720367384, + "grad_norm": 0.13246572766620834, + "learning_rate": 1.3082771153841872e-06, + "loss": 0.7535, + "step": 5309 + }, + { + "epoch": 2.6344793347399778, + "grad_norm": 0.13028781856699762, + "learning_rate": 1.307418045851786e-06, + "loss": 0.6802, + "step": 5310 + }, + { + "epoch": 2.634975797443217, + "grad_norm": 0.1291552670036102, + "learning_rate": 1.3065591585864161e-06, + "loss": 0.7137, + "step": 5311 + }, + { + "epoch": 2.6354722601464564, + "grad_norm": 0.1297984636190616, + "learning_rate": 1.3057004537193424e-06, + "loss": 0.6697, + "step": 5312 + }, + { + "epoch": 2.6359687228496957, + "grad_norm": 0.12833411800480857, + "learning_rate": 1.3048419313818062e-06, + "loss": 0.6836, + "step": 5313 + }, + { + "epoch": 2.636465185552935, + "grad_norm": 0.13042366934420158, + "learning_rate": 1.3039835917050177e-06, + "loss": 0.6944, + "step": 5314 + }, + { + "epoch": 2.636961648256175, + "grad_norm": 0.13642883453725888, + "learning_rate": 1.30312543482016e-06, + "loss": 0.7166, + "step": 5315 + }, + { + "epoch": 2.637458110959414, + "grad_norm": 0.130255734346025, + "learning_rate": 1.3022674608583907e-06, + "loss": 0.7537, + "step": 5316 + }, + { + "epoch": 2.6379545736626535, + "grad_norm": 0.13076117414943525, + "learning_rate": 1.3014096699508338e-06, + "loss": 0.7065, + "step": 5317 + }, + { + "epoch": 2.6384510363658933, + "grad_norm": 0.13062995849439213, + "learning_rate": 1.3005520622285922e-06, + "loss": 0.6623, + "step": 5318 + }, + { + "epoch": 2.6389474990691326, + "grad_norm": 0.12821638120486448, + "learning_rate": 1.2996946378227351e-06, + "loss": 0.6664, + "step": 5319 + }, + { + "epoch": 2.639443961772372, + "grad_norm": 0.133639093888335, + "learning_rate": 1.298837396864308e-06, + "loss": 0.75, + "step": 5320 + }, + { + "epoch": 2.6399404244756113, + "grad_norm": 0.12479238215751826, + "learning_rate": 1.297980339484326e-06, + "loss": 0.6877, + "step": 5321 + }, + { + "epoch": 2.6404368871788506, + "grad_norm": 0.13212736303956885, + "learning_rate": 1.297123465813775e-06, + "loss": 0.7034, + "step": 5322 + }, + { + "epoch": 2.64093334988209, + "grad_norm": 0.13499589181985006, + "learning_rate": 1.2962667759836166e-06, + "loss": 0.721, + "step": 5323 + }, + { + "epoch": 2.6414298125853293, + "grad_norm": 0.12490032994276366, + "learning_rate": 1.2954102701247801e-06, + "loss": 0.66, + "step": 5324 + }, + { + "epoch": 2.641926275288569, + "grad_norm": 0.14001714407558724, + "learning_rate": 1.2945539483681708e-06, + "loss": 0.6848, + "step": 5325 + }, + { + "epoch": 2.6424227379918084, + "grad_norm": 0.13601215073821113, + "learning_rate": 1.2936978108446624e-06, + "loss": 0.7245, + "step": 5326 + }, + { + "epoch": 2.6429192006950477, + "grad_norm": 0.12605012170458385, + "learning_rate": 1.292841857685101e-06, + "loss": 0.6667, + "step": 5327 + }, + { + "epoch": 2.643415663398287, + "grad_norm": 0.12872625874119725, + "learning_rate": 1.2919860890203073e-06, + "loss": 0.6868, + "step": 5328 + }, + { + "epoch": 2.643912126101527, + "grad_norm": 0.13008341539653537, + "learning_rate": 1.2911305049810701e-06, + "loss": 0.6629, + "step": 5329 + }, + { + "epoch": 2.644408588804766, + "grad_norm": 0.12691494726221766, + "learning_rate": 1.2902751056981533e-06, + "loss": 0.6905, + "step": 5330 + }, + { + "epoch": 2.6449050515080055, + "grad_norm": 0.1238746354104558, + "learning_rate": 1.2894198913022903e-06, + "loss": 0.6731, + "step": 5331 + }, + { + "epoch": 2.645401514211245, + "grad_norm": 0.13315315264555727, + "learning_rate": 1.2885648619241866e-06, + "loss": 0.699, + "step": 5332 + }, + { + "epoch": 2.645897976914484, + "grad_norm": 0.1313601573492306, + "learning_rate": 1.28771001769452e-06, + "loss": 0.6966, + "step": 5333 + }, + { + "epoch": 2.6463944396177235, + "grad_norm": 0.12904285233884105, + "learning_rate": 1.2868553587439386e-06, + "loss": 0.6657, + "step": 5334 + }, + { + "epoch": 2.6468909023209632, + "grad_norm": 0.1276652699366476, + "learning_rate": 1.2860008852030653e-06, + "loss": 0.7054, + "step": 5335 + }, + { + "epoch": 2.6473873650242026, + "grad_norm": 0.12726233644290474, + "learning_rate": 1.2851465972024908e-06, + "loss": 0.6983, + "step": 5336 + }, + { + "epoch": 2.647883827727442, + "grad_norm": 0.128524573267683, + "learning_rate": 1.2842924948727809e-06, + "loss": 0.6639, + "step": 5337 + }, + { + "epoch": 2.6483802904306812, + "grad_norm": 0.1354082209207073, + "learning_rate": 1.2834385783444708e-06, + "loss": 0.7063, + "step": 5338 + }, + { + "epoch": 2.648876753133921, + "grad_norm": 0.1286859619680868, + "learning_rate": 1.282584847748067e-06, + "loss": 0.6669, + "step": 5339 + }, + { + "epoch": 2.6493732158371603, + "grad_norm": 0.12527601100039995, + "learning_rate": 1.2817313032140504e-06, + "loss": 0.6949, + "step": 5340 + }, + { + "epoch": 2.6498696785403997, + "grad_norm": 0.13317900586790732, + "learning_rate": 1.2808779448728701e-06, + "loss": 0.6903, + "step": 5341 + }, + { + "epoch": 2.650366141243639, + "grad_norm": 0.1339289905433541, + "learning_rate": 1.2800247728549492e-06, + "loss": 0.707, + "step": 5342 + }, + { + "epoch": 2.6508626039468783, + "grad_norm": 0.13121373346042195, + "learning_rate": 1.2791717872906812e-06, + "loss": 0.7315, + "step": 5343 + }, + { + "epoch": 2.6513590666501177, + "grad_norm": 0.13089098565705848, + "learning_rate": 1.2783189883104301e-06, + "loss": 0.6792, + "step": 5344 + }, + { + "epoch": 2.6518555293533574, + "grad_norm": 0.12878111758105115, + "learning_rate": 1.2774663760445343e-06, + "loss": 0.6792, + "step": 5345 + }, + { + "epoch": 2.6523519920565968, + "grad_norm": 0.14725422361310098, + "learning_rate": 1.2766139506233012e-06, + "loss": 0.6749, + "step": 5346 + }, + { + "epoch": 2.652848454759836, + "grad_norm": 0.1301170341633137, + "learning_rate": 1.2757617121770093e-06, + "loss": 0.6623, + "step": 5347 + }, + { + "epoch": 2.6533449174630754, + "grad_norm": 0.1317723532692615, + "learning_rate": 1.2749096608359124e-06, + "loss": 0.7481, + "step": 5348 + }, + { + "epoch": 2.653841380166315, + "grad_norm": 0.12676960816569183, + "learning_rate": 1.2740577967302292e-06, + "loss": 0.6932, + "step": 5349 + }, + { + "epoch": 2.6543378428695545, + "grad_norm": 0.12671113631004585, + "learning_rate": 1.2732061199901563e-06, + "loss": 0.6807, + "step": 5350 + }, + { + "epoch": 2.654834305572794, + "grad_norm": 0.13126321692463977, + "learning_rate": 1.2723546307458564e-06, + "loss": 0.7329, + "step": 5351 + }, + { + "epoch": 2.655330768276033, + "grad_norm": 0.1264003258300388, + "learning_rate": 1.2715033291274686e-06, + "loss": 0.7149, + "step": 5352 + }, + { + "epoch": 2.6558272309792725, + "grad_norm": 0.13089334621865706, + "learning_rate": 1.2706522152650997e-06, + "loss": 0.7176, + "step": 5353 + }, + { + "epoch": 2.656323693682512, + "grad_norm": 0.13062906649658093, + "learning_rate": 1.2698012892888272e-06, + "loss": 0.6898, + "step": 5354 + }, + { + "epoch": 2.6568201563857516, + "grad_norm": 0.12660598434158638, + "learning_rate": 1.2689505513287042e-06, + "loss": 0.6998, + "step": 5355 + }, + { + "epoch": 2.657316619088991, + "grad_norm": 0.13240501373271096, + "learning_rate": 1.2681000015147505e-06, + "loss": 0.7185, + "step": 5356 + }, + { + "epoch": 2.6578130817922303, + "grad_norm": 0.13214361476854886, + "learning_rate": 1.2672496399769596e-06, + "loss": 0.717, + "step": 5357 + }, + { + "epoch": 2.6583095444954696, + "grad_norm": 0.12662065221064372, + "learning_rate": 1.2663994668452961e-06, + "loss": 0.6901, + "step": 5358 + }, + { + "epoch": 2.6588060071987094, + "grad_norm": 0.12807260004668625, + "learning_rate": 1.2655494822496938e-06, + "loss": 0.7518, + "step": 5359 + }, + { + "epoch": 2.6593024699019487, + "grad_norm": 0.12843060423085104, + "learning_rate": 1.2646996863200612e-06, + "loss": 0.6849, + "step": 5360 + }, + { + "epoch": 2.659798932605188, + "grad_norm": 0.13206054308179124, + "learning_rate": 1.263850079186274e-06, + "loss": 0.7003, + "step": 5361 + }, + { + "epoch": 2.6602953953084274, + "grad_norm": 0.1374628315367967, + "learning_rate": 1.2630006609781832e-06, + "loss": 0.7265, + "step": 5362 + }, + { + "epoch": 2.6607918580116667, + "grad_norm": 0.133031557785691, + "learning_rate": 1.2621514318256073e-06, + "loss": 0.728, + "step": 5363 + }, + { + "epoch": 2.661288320714906, + "grad_norm": 0.13243919879974533, + "learning_rate": 1.2613023918583379e-06, + "loss": 0.699, + "step": 5364 + }, + { + "epoch": 2.661784783418146, + "grad_norm": 0.13056450670934303, + "learning_rate": 1.2604535412061367e-06, + "loss": 0.6934, + "step": 5365 + }, + { + "epoch": 2.662281246121385, + "grad_norm": 0.1327556834023822, + "learning_rate": 1.259604879998736e-06, + "loss": 0.6961, + "step": 5366 + }, + { + "epoch": 2.6627777088246245, + "grad_norm": 0.12818630552200058, + "learning_rate": 1.2587564083658424e-06, + "loss": 0.7106, + "step": 5367 + }, + { + "epoch": 2.663274171527864, + "grad_norm": 0.1310766749031488, + "learning_rate": 1.257908126437129e-06, + "loss": 0.7148, + "step": 5368 + }, + { + "epoch": 2.6637706342311036, + "grad_norm": 0.127265232183585, + "learning_rate": 1.257060034342244e-06, + "loss": 0.6936, + "step": 5369 + }, + { + "epoch": 2.664267096934343, + "grad_norm": 0.13015485116988296, + "learning_rate": 1.2562121322108033e-06, + "loss": 0.6881, + "step": 5370 + }, + { + "epoch": 2.6647635596375823, + "grad_norm": 0.12987369619590036, + "learning_rate": 1.2553644201723953e-06, + "loss": 0.6782, + "step": 5371 + }, + { + "epoch": 2.6652600223408216, + "grad_norm": 0.1286930218309498, + "learning_rate": 1.25451689835658e-06, + "loss": 0.7157, + "step": 5372 + }, + { + "epoch": 2.665756485044061, + "grad_norm": 0.13613822958082458, + "learning_rate": 1.2536695668928861e-06, + "loss": 0.7231, + "step": 5373 + }, + { + "epoch": 2.6662529477473003, + "grad_norm": 0.13210551836350637, + "learning_rate": 1.2528224259108165e-06, + "loss": 0.7344, + "step": 5374 + }, + { + "epoch": 2.66674941045054, + "grad_norm": 0.12948567861798826, + "learning_rate": 1.2519754755398422e-06, + "loss": 0.7031, + "step": 5375 + }, + { + "epoch": 2.6672458731537794, + "grad_norm": 0.1267189214191555, + "learning_rate": 1.251128715909405e-06, + "loss": 0.7144, + "step": 5376 + }, + { + "epoch": 2.6677423358570187, + "grad_norm": 0.12858865733135458, + "learning_rate": 1.25028214714892e-06, + "loss": 0.6898, + "step": 5377 + }, + { + "epoch": 2.668238798560258, + "grad_norm": 0.1259741786554077, + "learning_rate": 1.2494357693877707e-06, + "loss": 0.6758, + "step": 5378 + }, + { + "epoch": 2.668735261263498, + "grad_norm": 0.13563486865602353, + "learning_rate": 1.2485895827553132e-06, + "loss": 0.7236, + "step": 5379 + }, + { + "epoch": 2.669231723966737, + "grad_norm": 0.13321717377807327, + "learning_rate": 1.2477435873808736e-06, + "loss": 0.7188, + "step": 5380 + }, + { + "epoch": 2.6697281866699765, + "grad_norm": 0.13087033023193448, + "learning_rate": 1.246897783393748e-06, + "loss": 0.7512, + "step": 5381 + }, + { + "epoch": 2.670224649373216, + "grad_norm": 0.13658879188594283, + "learning_rate": 1.2460521709232042e-06, + "loss": 0.6728, + "step": 5382 + }, + { + "epoch": 2.670721112076455, + "grad_norm": 0.13628616113751904, + "learning_rate": 1.2452067500984797e-06, + "loss": 0.6972, + "step": 5383 + }, + { + "epoch": 2.6712175747796945, + "grad_norm": 0.13793409627064301, + "learning_rate": 1.2443615210487853e-06, + "loss": 0.7301, + "step": 5384 + }, + { + "epoch": 2.6717140374829342, + "grad_norm": 0.12840778349061024, + "learning_rate": 1.2435164839032999e-06, + "loss": 0.6916, + "step": 5385 + }, + { + "epoch": 2.6722105001861736, + "grad_norm": 0.13033494926277364, + "learning_rate": 1.2426716387911728e-06, + "loss": 0.6811, + "step": 5386 + }, + { + "epoch": 2.672706962889413, + "grad_norm": 0.13130268215437454, + "learning_rate": 1.2418269858415267e-06, + "loss": 0.6996, + "step": 5387 + }, + { + "epoch": 2.6732034255926522, + "grad_norm": 0.12904504134003447, + "learning_rate": 1.2409825251834518e-06, + "loss": 0.6958, + "step": 5388 + }, + { + "epoch": 2.673699888295892, + "grad_norm": 0.12717110744137422, + "learning_rate": 1.2401382569460118e-06, + "loss": 0.6495, + "step": 5389 + }, + { + "epoch": 2.6741963509991313, + "grad_norm": 0.1318115465132516, + "learning_rate": 1.239294181258239e-06, + "loss": 0.6998, + "step": 5390 + }, + { + "epoch": 2.6746928137023707, + "grad_norm": 0.133476545289, + "learning_rate": 1.2384502982491359e-06, + "loss": 0.7364, + "step": 5391 + }, + { + "epoch": 2.67518927640561, + "grad_norm": 0.13316247822909816, + "learning_rate": 1.237606608047678e-06, + "loss": 0.7539, + "step": 5392 + }, + { + "epoch": 2.6756857391088493, + "grad_norm": 0.13762117719884678, + "learning_rate": 1.2367631107828086e-06, + "loss": 0.7731, + "step": 5393 + }, + { + "epoch": 2.6761822018120887, + "grad_norm": 0.1294653958734037, + "learning_rate": 1.2359198065834439e-06, + "loss": 0.7375, + "step": 5394 + }, + { + "epoch": 2.6766786645153284, + "grad_norm": 0.13185956190100087, + "learning_rate": 1.2350766955784688e-06, + "loss": 0.7211, + "step": 5395 + }, + { + "epoch": 2.6771751272185678, + "grad_norm": 0.13191116919803744, + "learning_rate": 1.2342337778967383e-06, + "loss": 0.6744, + "step": 5396 + }, + { + "epoch": 2.677671589921807, + "grad_norm": 0.12866861106797114, + "learning_rate": 1.2333910536670818e-06, + "loss": 0.7133, + "step": 5397 + }, + { + "epoch": 2.6781680526250464, + "grad_norm": 0.13443812133789382, + "learning_rate": 1.2325485230182923e-06, + "loss": 0.6642, + "step": 5398 + }, + { + "epoch": 2.678664515328286, + "grad_norm": 0.13194917442407558, + "learning_rate": 1.2317061860791402e-06, + "loss": 0.7635, + "step": 5399 + }, + { + "epoch": 2.6791609780315255, + "grad_norm": 0.12949501549978698, + "learning_rate": 1.230864042978361e-06, + "loss": 0.708, + "step": 5400 + }, + { + "epoch": 2.679657440734765, + "grad_norm": 0.12914002116089188, + "learning_rate": 1.230022093844664e-06, + "loss": 0.6673, + "step": 5401 + }, + { + "epoch": 2.680153903438004, + "grad_norm": 0.1288450806713303, + "learning_rate": 1.2291803388067284e-06, + "loss": 0.6895, + "step": 5402 + }, + { + "epoch": 2.6806503661412435, + "grad_norm": 0.13187999417119586, + "learning_rate": 1.2283387779932005e-06, + "loss": 0.7022, + "step": 5403 + }, + { + "epoch": 2.681146828844483, + "grad_norm": 0.12707167814369877, + "learning_rate": 1.2274974115327017e-06, + "loss": 0.7049, + "step": 5404 + }, + { + "epoch": 2.6816432915477226, + "grad_norm": 0.12903769384971894, + "learning_rate": 1.2266562395538198e-06, + "loss": 0.7194, + "step": 5405 + }, + { + "epoch": 2.682139754250962, + "grad_norm": 0.13367318620502683, + "learning_rate": 1.225815262185116e-06, + "loss": 0.6971, + "step": 5406 + }, + { + "epoch": 2.6826362169542013, + "grad_norm": 0.12972052003617227, + "learning_rate": 1.2249744795551198e-06, + "loss": 0.7434, + "step": 5407 + }, + { + "epoch": 2.6831326796574406, + "grad_norm": 0.1270398790488789, + "learning_rate": 1.2241338917923295e-06, + "loss": 0.6903, + "step": 5408 + }, + { + "epoch": 2.6836291423606804, + "grad_norm": 0.1273000126272403, + "learning_rate": 1.223293499025218e-06, + "loss": 0.6947, + "step": 5409 + }, + { + "epoch": 2.6841256050639197, + "grad_norm": 0.12789714754456988, + "learning_rate": 1.2224533013822237e-06, + "loss": 0.7394, + "step": 5410 + }, + { + "epoch": 2.684622067767159, + "grad_norm": 0.1284746698224543, + "learning_rate": 1.2216132989917592e-06, + "loss": 0.6935, + "step": 5411 + }, + { + "epoch": 2.6851185304703984, + "grad_norm": 0.128337900001136, + "learning_rate": 1.2207734919822047e-06, + "loss": 0.6772, + "step": 5412 + }, + { + "epoch": 2.6856149931736377, + "grad_norm": 0.1286919735265176, + "learning_rate": 1.2199338804819114e-06, + "loss": 0.696, + "step": 5413 + }, + { + "epoch": 2.686111455876877, + "grad_norm": 0.12513034231984738, + "learning_rate": 1.2190944646191999e-06, + "loss": 0.6521, + "step": 5414 + }, + { + "epoch": 2.686607918580117, + "grad_norm": 0.1371942316709031, + "learning_rate": 1.2182552445223609e-06, + "loss": 0.7441, + "step": 5415 + }, + { + "epoch": 2.687104381283356, + "grad_norm": 0.12633983584561082, + "learning_rate": 1.2174162203196575e-06, + "loss": 0.6826, + "step": 5416 + }, + { + "epoch": 2.6876008439865955, + "grad_norm": 0.1321860232959971, + "learning_rate": 1.216577392139319e-06, + "loss": 0.7607, + "step": 5417 + }, + { + "epoch": 2.688097306689835, + "grad_norm": 0.12852572536801632, + "learning_rate": 1.2157387601095492e-06, + "loss": 0.692, + "step": 5418 + }, + { + "epoch": 2.6885937693930746, + "grad_norm": 0.12900473130966916, + "learning_rate": 1.214900324358518e-06, + "loss": 0.7091, + "step": 5419 + }, + { + "epoch": 2.689090232096314, + "grad_norm": 0.12971154741512667, + "learning_rate": 1.2140620850143667e-06, + "loss": 0.7053, + "step": 5420 + }, + { + "epoch": 2.6895866947995533, + "grad_norm": 0.13521129476773464, + "learning_rate": 1.213224042205208e-06, + "loss": 0.7227, + "step": 5421 + }, + { + "epoch": 2.6900831575027926, + "grad_norm": 0.13226811800646973, + "learning_rate": 1.2123861960591224e-06, + "loss": 0.7147, + "step": 5422 + }, + { + "epoch": 2.690579620206032, + "grad_norm": 0.131271672560679, + "learning_rate": 1.2115485467041608e-06, + "loss": 0.7435, + "step": 5423 + }, + { + "epoch": 2.6910760829092712, + "grad_norm": 0.12939692136576958, + "learning_rate": 1.2107110942683459e-06, + "loss": 0.6961, + "step": 5424 + }, + { + "epoch": 2.691572545612511, + "grad_norm": 0.13543860120642587, + "learning_rate": 1.2098738388796668e-06, + "loss": 0.6976, + "step": 5425 + }, + { + "epoch": 2.6920690083157504, + "grad_norm": 0.1270996570143555, + "learning_rate": 1.2090367806660872e-06, + "loss": 0.693, + "step": 5426 + }, + { + "epoch": 2.6925654710189897, + "grad_norm": 0.13061886754156452, + "learning_rate": 1.2081999197555366e-06, + "loss": 0.6917, + "step": 5427 + }, + { + "epoch": 2.693061933722229, + "grad_norm": 0.13261693716322412, + "learning_rate": 1.2073632562759146e-06, + "loss": 0.7223, + "step": 5428 + }, + { + "epoch": 2.693558396425469, + "grad_norm": 0.12738184225408514, + "learning_rate": 1.2065267903550953e-06, + "loss": 0.6637, + "step": 5429 + }, + { + "epoch": 2.694054859128708, + "grad_norm": 0.12686598161535245, + "learning_rate": 1.2056905221209147e-06, + "loss": 0.6801, + "step": 5430 + }, + { + "epoch": 2.6945513218319475, + "grad_norm": 0.12651761385982638, + "learning_rate": 1.2048544517011863e-06, + "loss": 0.7168, + "step": 5431 + }, + { + "epoch": 2.695047784535187, + "grad_norm": 0.12731010103172005, + "learning_rate": 1.2040185792236874e-06, + "loss": 0.6676, + "step": 5432 + }, + { + "epoch": 2.695544247238426, + "grad_norm": 0.12581732529546605, + "learning_rate": 1.2031829048161705e-06, + "loss": 0.6796, + "step": 5433 + }, + { + "epoch": 2.6960407099416654, + "grad_norm": 0.13000955582734638, + "learning_rate": 1.2023474286063538e-06, + "loss": 0.708, + "step": 5434 + }, + { + "epoch": 2.696537172644905, + "grad_norm": 0.1320803282519354, + "learning_rate": 1.2015121507219254e-06, + "loss": 0.6949, + "step": 5435 + }, + { + "epoch": 2.6970336353481446, + "grad_norm": 0.1317763220856854, + "learning_rate": 1.2006770712905458e-06, + "loss": 0.6895, + "step": 5436 + }, + { + "epoch": 2.697530098051384, + "grad_norm": 0.12990531826150573, + "learning_rate": 1.1998421904398423e-06, + "loss": 0.7198, + "step": 5437 + }, + { + "epoch": 2.698026560754623, + "grad_norm": 0.1383883572643517, + "learning_rate": 1.199007508297414e-06, + "loss": 0.7659, + "step": 5438 + }, + { + "epoch": 2.698523023457863, + "grad_norm": 0.12440590803020435, + "learning_rate": 1.1981730249908282e-06, + "loss": 0.6478, + "step": 5439 + }, + { + "epoch": 2.6990194861611023, + "grad_norm": 0.12784078674498142, + "learning_rate": 1.1973387406476216e-06, + "loss": 0.6741, + "step": 5440 + }, + { + "epoch": 2.6995159488643417, + "grad_norm": 0.13239096818217247, + "learning_rate": 1.196504655395303e-06, + "loss": 0.7346, + "step": 5441 + }, + { + "epoch": 2.700012411567581, + "grad_norm": 0.12753915589030973, + "learning_rate": 1.1956707693613468e-06, + "loss": 0.6775, + "step": 5442 + }, + { + "epoch": 2.7005088742708203, + "grad_norm": 0.12968903089581899, + "learning_rate": 1.194837082673201e-06, + "loss": 0.7184, + "step": 5443 + }, + { + "epoch": 2.7010053369740596, + "grad_norm": 0.12351868593377198, + "learning_rate": 1.1940035954582803e-06, + "loss": 0.7085, + "step": 5444 + }, + { + "epoch": 2.701501799677299, + "grad_norm": 0.13181069411223056, + "learning_rate": 1.1931703078439705e-06, + "loss": 0.7328, + "step": 5445 + }, + { + "epoch": 2.7019982623805388, + "grad_norm": 0.1330437926631701, + "learning_rate": 1.1923372199576252e-06, + "loss": 0.7267, + "step": 5446 + }, + { + "epoch": 2.702494725083778, + "grad_norm": 0.12415515084900948, + "learning_rate": 1.1915043319265684e-06, + "loss": 0.6483, + "step": 5447 + }, + { + "epoch": 2.7029911877870174, + "grad_norm": 0.12773854918997632, + "learning_rate": 1.1906716438780952e-06, + "loss": 0.6881, + "step": 5448 + }, + { + "epoch": 2.703487650490257, + "grad_norm": 0.13125807339891024, + "learning_rate": 1.1898391559394668e-06, + "loss": 0.708, + "step": 5449 + }, + { + "epoch": 2.7039841131934965, + "grad_norm": 0.12144720415007257, + "learning_rate": 1.1890068682379175e-06, + "loss": 0.6287, + "step": 5450 + }, + { + "epoch": 2.704480575896736, + "grad_norm": 0.13122189521060085, + "learning_rate": 1.1881747809006483e-06, + "loss": 0.7784, + "step": 5451 + }, + { + "epoch": 2.704977038599975, + "grad_norm": 0.12593554383340527, + "learning_rate": 1.1873428940548293e-06, + "loss": 0.7264, + "step": 5452 + }, + { + "epoch": 2.7054735013032145, + "grad_norm": 0.13034835041902, + "learning_rate": 1.1865112078276032e-06, + "loss": 0.6816, + "step": 5453 + }, + { + "epoch": 2.705969964006454, + "grad_norm": 0.12793803176190882, + "learning_rate": 1.1856797223460776e-06, + "loss": 0.6852, + "step": 5454 + }, + { + "epoch": 2.706466426709693, + "grad_norm": 0.13001908847017454, + "learning_rate": 1.1848484377373336e-06, + "loss": 0.7136, + "step": 5455 + }, + { + "epoch": 2.706962889412933, + "grad_norm": 0.1244529234238611, + "learning_rate": 1.1840173541284198e-06, + "loss": 0.6583, + "step": 5456 + }, + { + "epoch": 2.7074593521161723, + "grad_norm": 0.12496088308943552, + "learning_rate": 1.1831864716463517e-06, + "loss": 0.6996, + "step": 5457 + }, + { + "epoch": 2.7079558148194116, + "grad_norm": 0.13009009109117492, + "learning_rate": 1.182355790418119e-06, + "loss": 0.7313, + "step": 5458 + }, + { + "epoch": 2.7084522775226514, + "grad_norm": 0.13302558329128752, + "learning_rate": 1.181525310570677e-06, + "loss": 0.7344, + "step": 5459 + }, + { + "epoch": 2.7089487402258907, + "grad_norm": 0.13255724713593725, + "learning_rate": 1.1806950322309503e-06, + "loss": 0.7073, + "step": 5460 + }, + { + "epoch": 2.70944520292913, + "grad_norm": 0.12830942629616623, + "learning_rate": 1.1798649555258359e-06, + "loss": 0.6774, + "step": 5461 + }, + { + "epoch": 2.7099416656323694, + "grad_norm": 0.13135455282779482, + "learning_rate": 1.1790350805821948e-06, + "loss": 0.6984, + "step": 5462 + }, + { + "epoch": 2.7104381283356087, + "grad_norm": 0.12869925272920885, + "learning_rate": 1.1782054075268626e-06, + "loss": 0.6924, + "step": 5463 + }, + { + "epoch": 2.710934591038848, + "grad_norm": 0.12635830105891932, + "learning_rate": 1.1773759364866394e-06, + "loss": 0.6704, + "step": 5464 + }, + { + "epoch": 2.7114310537420874, + "grad_norm": 0.1267057090577105, + "learning_rate": 1.1765466675882983e-06, + "loss": 0.6603, + "step": 5465 + }, + { + "epoch": 2.711927516445327, + "grad_norm": 0.1379598054720685, + "learning_rate": 1.1757176009585795e-06, + "loss": 0.7162, + "step": 5466 + }, + { + "epoch": 2.7124239791485665, + "grad_norm": 0.1267327103751104, + "learning_rate": 1.1748887367241913e-06, + "loss": 0.6841, + "step": 5467 + }, + { + "epoch": 2.712920441851806, + "grad_norm": 0.12581262787382652, + "learning_rate": 1.1740600750118136e-06, + "loss": 0.7009, + "step": 5468 + }, + { + "epoch": 2.713416904555045, + "grad_norm": 0.12485464413297796, + "learning_rate": 1.173231615948093e-06, + "loss": 0.6748, + "step": 5469 + }, + { + "epoch": 2.713913367258285, + "grad_norm": 0.12620028737405145, + "learning_rate": 1.1724033596596477e-06, + "loss": 0.688, + "step": 5470 + }, + { + "epoch": 2.7144098299615242, + "grad_norm": 0.12721872175292415, + "learning_rate": 1.1715753062730622e-06, + "loss": 0.7245, + "step": 5471 + }, + { + "epoch": 2.7149062926647636, + "grad_norm": 0.13289322932822414, + "learning_rate": 1.170747455914891e-06, + "loss": 0.7097, + "step": 5472 + }, + { + "epoch": 2.715402755368003, + "grad_norm": 0.12728705232851031, + "learning_rate": 1.169919808711659e-06, + "loss": 0.6834, + "step": 5473 + }, + { + "epoch": 2.7158992180712422, + "grad_norm": 0.1267143167247784, + "learning_rate": 1.169092364789857e-06, + "loss": 0.6926, + "step": 5474 + }, + { + "epoch": 2.7163956807744816, + "grad_norm": 0.13082421386045198, + "learning_rate": 1.1682651242759483e-06, + "loss": 0.7155, + "step": 5475 + }, + { + "epoch": 2.7168921434777213, + "grad_norm": 0.1313656554797384, + "learning_rate": 1.1674380872963629e-06, + "loss": 0.689, + "step": 5476 + }, + { + "epoch": 2.7173886061809607, + "grad_norm": 0.12776895465581478, + "learning_rate": 1.1666112539774998e-06, + "loss": 0.6669, + "step": 5477 + }, + { + "epoch": 2.7178850688842, + "grad_norm": 0.12847066772569427, + "learning_rate": 1.1657846244457272e-06, + "loss": 0.7357, + "step": 5478 + }, + { + "epoch": 2.7183815315874393, + "grad_norm": 0.13314280685450303, + "learning_rate": 1.1649581988273814e-06, + "loss": 0.7503, + "step": 5479 + }, + { + "epoch": 2.718877994290679, + "grad_norm": 0.13052451777587085, + "learning_rate": 1.16413197724877e-06, + "loss": 0.7349, + "step": 5480 + }, + { + "epoch": 2.7193744569939184, + "grad_norm": 0.1300407125579856, + "learning_rate": 1.163305959836166e-06, + "loss": 0.7083, + "step": 5481 + }, + { + "epoch": 2.7198709196971578, + "grad_norm": 0.1280550112986663, + "learning_rate": 1.1624801467158145e-06, + "loss": 0.6762, + "step": 5482 + }, + { + "epoch": 2.720367382400397, + "grad_norm": 0.12510160831325617, + "learning_rate": 1.1616545380139272e-06, + "loss": 0.6925, + "step": 5483 + }, + { + "epoch": 2.7208638451036364, + "grad_norm": 0.13196997248908202, + "learning_rate": 1.1608291338566841e-06, + "loss": 0.7083, + "step": 5484 + }, + { + "epoch": 2.7213603078068758, + "grad_norm": 0.1340590026668367, + "learning_rate": 1.1600039343702368e-06, + "loss": 0.6946, + "step": 5485 + }, + { + "epoch": 2.7218567705101155, + "grad_norm": 0.12829686594605474, + "learning_rate": 1.1591789396807021e-06, + "loss": 0.705, + "step": 5486 + }, + { + "epoch": 2.722353233213355, + "grad_norm": 0.12881800317843775, + "learning_rate": 1.158354149914169e-06, + "loss": 0.6883, + "step": 5487 + }, + { + "epoch": 2.722849695916594, + "grad_norm": 0.12960535888237937, + "learning_rate": 1.1575295651966926e-06, + "loss": 0.6959, + "step": 5488 + }, + { + "epoch": 2.7233461586198335, + "grad_norm": 0.12426299232658077, + "learning_rate": 1.156705185654296e-06, + "loss": 0.679, + "step": 5489 + }, + { + "epoch": 2.7238426213230733, + "grad_norm": 0.1267642649577807, + "learning_rate": 1.1558810114129746e-06, + "loss": 0.6957, + "step": 5490 + }, + { + "epoch": 2.7243390840263126, + "grad_norm": 0.13057816997613797, + "learning_rate": 1.1550570425986884e-06, + "loss": 0.6982, + "step": 5491 + }, + { + "epoch": 2.724835546729552, + "grad_norm": 0.13036966095415778, + "learning_rate": 1.1542332793373699e-06, + "loss": 0.7108, + "step": 5492 + }, + { + "epoch": 2.7253320094327913, + "grad_norm": 0.12845031666688522, + "learning_rate": 1.1534097217549167e-06, + "loss": 0.7045, + "step": 5493 + }, + { + "epoch": 2.7258284721360306, + "grad_norm": 0.13431116362677278, + "learning_rate": 1.1525863699771967e-06, + "loss": 0.7211, + "step": 5494 + }, + { + "epoch": 2.72632493483927, + "grad_norm": 0.12701692487109137, + "learning_rate": 1.1517632241300457e-06, + "loss": 0.6845, + "step": 5495 + }, + { + "epoch": 2.7268213975425097, + "grad_norm": 0.12491394627771943, + "learning_rate": 1.1509402843392681e-06, + "loss": 0.658, + "step": 5496 + }, + { + "epoch": 2.727317860245749, + "grad_norm": 0.12829490733233098, + "learning_rate": 1.150117550730638e-06, + "loss": 0.7134, + "step": 5497 + }, + { + "epoch": 2.7278143229489884, + "grad_norm": 0.127816041512245, + "learning_rate": 1.1492950234298965e-06, + "loss": 0.6658, + "step": 5498 + }, + { + "epoch": 2.7283107856522277, + "grad_norm": 0.12694117469603244, + "learning_rate": 1.1484727025627532e-06, + "loss": 0.7073, + "step": 5499 + }, + { + "epoch": 2.7288072483554675, + "grad_norm": 0.1273683408910965, + "learning_rate": 1.147650588254888e-06, + "loss": 0.7044, + "step": 5500 + }, + { + "epoch": 2.729303711058707, + "grad_norm": 0.12716535637041657, + "learning_rate": 1.1468286806319461e-06, + "loss": 0.7027, + "step": 5501 + }, + { + "epoch": 2.729800173761946, + "grad_norm": 0.13652878890308245, + "learning_rate": 1.146006979819545e-06, + "loss": 0.7242, + "step": 5502 + }, + { + "epoch": 2.7302966364651855, + "grad_norm": 0.13060886134753424, + "learning_rate": 1.1451854859432674e-06, + "loss": 0.6823, + "step": 5503 + }, + { + "epoch": 2.730793099168425, + "grad_norm": 0.12949358227740704, + "learning_rate": 1.1443641991286644e-06, + "loss": 0.6812, + "step": 5504 + }, + { + "epoch": 2.731289561871664, + "grad_norm": 0.1484208011714529, + "learning_rate": 1.1435431195012586e-06, + "loss": 0.7169, + "step": 5505 + }, + { + "epoch": 2.731786024574904, + "grad_norm": 0.12279528726656044, + "learning_rate": 1.1427222471865368e-06, + "loss": 0.6499, + "step": 5506 + }, + { + "epoch": 2.7322824872781433, + "grad_norm": 0.12686401827050103, + "learning_rate": 1.1419015823099582e-06, + "loss": 0.6969, + "step": 5507 + }, + { + "epoch": 2.7327789499813826, + "grad_norm": 0.1304912606677775, + "learning_rate": 1.1410811249969475e-06, + "loss": 0.7185, + "step": 5508 + }, + { + "epoch": 2.733275412684622, + "grad_norm": 0.1307385183998749, + "learning_rate": 1.140260875372898e-06, + "loss": 0.7018, + "step": 5509 + }, + { + "epoch": 2.7337718753878617, + "grad_norm": 0.12879059404077767, + "learning_rate": 1.1394408335631721e-06, + "loss": 0.7036, + "step": 5510 + }, + { + "epoch": 2.734268338091101, + "grad_norm": 0.13168818270683702, + "learning_rate": 1.138620999693099e-06, + "loss": 0.733, + "step": 5511 + }, + { + "epoch": 2.7347648007943404, + "grad_norm": 0.1302130132221126, + "learning_rate": 1.1378013738879787e-06, + "loss": 0.7118, + "step": 5512 + }, + { + "epoch": 2.7352612634975797, + "grad_norm": 0.12664057418388594, + "learning_rate": 1.1369819562730763e-06, + "loss": 0.692, + "step": 5513 + }, + { + "epoch": 2.735757726200819, + "grad_norm": 0.1787042284319172, + "learning_rate": 1.1361627469736286e-06, + "loss": 0.6932, + "step": 5514 + }, + { + "epoch": 2.7362541889040584, + "grad_norm": 0.13276835251453653, + "learning_rate": 1.1353437461148378e-06, + "loss": 0.685, + "step": 5515 + }, + { + "epoch": 2.736750651607298, + "grad_norm": 0.12840095500860307, + "learning_rate": 1.1345249538218736e-06, + "loss": 0.6855, + "step": 5516 + }, + { + "epoch": 2.7372471143105375, + "grad_norm": 0.13118594667050903, + "learning_rate": 1.1337063702198775e-06, + "loss": 0.7603, + "step": 5517 + }, + { + "epoch": 2.737743577013777, + "grad_norm": 0.12576209372358757, + "learning_rate": 1.1328879954339546e-06, + "loss": 0.6698, + "step": 5518 + }, + { + "epoch": 2.738240039717016, + "grad_norm": 0.133093950748946, + "learning_rate": 1.132069829589183e-06, + "loss": 0.6886, + "step": 5519 + }, + { + "epoch": 2.738736502420256, + "grad_norm": 0.12864037736931044, + "learning_rate": 1.1312518728106048e-06, + "loss": 0.6766, + "step": 5520 + }, + { + "epoch": 2.7392329651234952, + "grad_norm": 0.12945382840659705, + "learning_rate": 1.1304341252232307e-06, + "loss": 0.6828, + "step": 5521 + }, + { + "epoch": 2.7397294278267346, + "grad_norm": 0.1308218826995222, + "learning_rate": 1.129616586952042e-06, + "loss": 0.6656, + "step": 5522 + }, + { + "epoch": 2.740225890529974, + "grad_norm": 0.1314567316421182, + "learning_rate": 1.1287992581219846e-06, + "loss": 0.7036, + "step": 5523 + }, + { + "epoch": 2.7407223532332132, + "grad_norm": 0.1366700099747632, + "learning_rate": 1.1279821388579762e-06, + "loss": 0.7349, + "step": 5524 + }, + { + "epoch": 2.7412188159364526, + "grad_norm": 0.1309681880536747, + "learning_rate": 1.1271652292848988e-06, + "loss": 0.6795, + "step": 5525 + }, + { + "epoch": 2.7417152786396923, + "grad_norm": 0.12837208221450544, + "learning_rate": 1.126348529527605e-06, + "loss": 0.6809, + "step": 5526 + }, + { + "epoch": 2.7422117413429317, + "grad_norm": 0.131214461157075, + "learning_rate": 1.1255320397109132e-06, + "loss": 0.7304, + "step": 5527 + }, + { + "epoch": 2.742708204046171, + "grad_norm": 0.12712062614786912, + "learning_rate": 1.1247157599596103e-06, + "loss": 0.6985, + "step": 5528 + }, + { + "epoch": 2.7432046667494103, + "grad_norm": 0.13592701398261714, + "learning_rate": 1.1238996903984537e-06, + "loss": 0.763, + "step": 5529 + }, + { + "epoch": 2.74370112945265, + "grad_norm": 0.12718223517036215, + "learning_rate": 1.1230838311521642e-06, + "loss": 0.6984, + "step": 5530 + }, + { + "epoch": 2.7441975921558894, + "grad_norm": 0.13534565978463103, + "learning_rate": 1.1222681823454349e-06, + "loss": 0.7765, + "step": 5531 + }, + { + "epoch": 2.7446940548591288, + "grad_norm": 0.13626701847329473, + "learning_rate": 1.121452744102924e-06, + "loss": 0.7439, + "step": 5532 + }, + { + "epoch": 2.745190517562368, + "grad_norm": 0.13284297253885471, + "learning_rate": 1.1206375165492564e-06, + "loss": 0.7347, + "step": 5533 + }, + { + "epoch": 2.7456869802656074, + "grad_norm": 0.13079426764125932, + "learning_rate": 1.119822499809029e-06, + "loss": 0.6999, + "step": 5534 + }, + { + "epoch": 2.7461834429688468, + "grad_norm": 0.12570608638802389, + "learning_rate": 1.1190076940068031e-06, + "loss": 0.6831, + "step": 5535 + }, + { + "epoch": 2.7466799056720865, + "grad_norm": 0.12564077502899773, + "learning_rate": 1.1181930992671078e-06, + "loss": 0.6883, + "step": 5536 + }, + { + "epoch": 2.747176368375326, + "grad_norm": 0.12497737842928656, + "learning_rate": 1.1173787157144425e-06, + "loss": 0.6551, + "step": 5537 + }, + { + "epoch": 2.747672831078565, + "grad_norm": 0.13189163059908987, + "learning_rate": 1.116564543473271e-06, + "loss": 0.7169, + "step": 5538 + }, + { + "epoch": 2.7481692937818045, + "grad_norm": 0.13114812942001947, + "learning_rate": 1.1157505826680285e-06, + "loss": 0.6959, + "step": 5539 + }, + { + "epoch": 2.7486657564850443, + "grad_norm": 0.13908344915755239, + "learning_rate": 1.1149368334231146e-06, + "loss": 0.7409, + "step": 5540 + }, + { + "epoch": 2.7491622191882836, + "grad_norm": 0.13448810312163784, + "learning_rate": 1.1141232958628976e-06, + "loss": 0.7048, + "step": 5541 + }, + { + "epoch": 2.749658681891523, + "grad_norm": 0.13514802207833446, + "learning_rate": 1.1133099701117143e-06, + "loss": 0.7112, + "step": 5542 + }, + { + "epoch": 2.7501551445947623, + "grad_norm": 0.12623353087443057, + "learning_rate": 1.112496856293867e-06, + "loss": 0.7261, + "step": 5543 + }, + { + "epoch": 2.7506516072980016, + "grad_norm": 0.12458653015367169, + "learning_rate": 1.1116839545336292e-06, + "loss": 0.6775, + "step": 5544 + }, + { + "epoch": 2.7506516072980016, + "eval_loss": 0.7236723899841309, + "eval_runtime": 135.7629, + "eval_samples_per_second": 223.574, + "eval_steps_per_second": 27.953, + "step": 5544 + }, + { + "epoch": 2.751148070001241, + "grad_norm": 0.1306204466783865, + "learning_rate": 1.1108712649552384e-06, + "loss": 0.6945, + "step": 5545 + }, + { + "epoch": 2.7516445327044807, + "grad_norm": 0.12369576532652772, + "learning_rate": 1.1100587876829024e-06, + "loss": 0.6289, + "step": 5546 + }, + { + "epoch": 2.75214099540772, + "grad_norm": 0.12789165154828827, + "learning_rate": 1.1092465228407949e-06, + "loss": 0.6195, + "step": 5547 + }, + { + "epoch": 2.7526374581109594, + "grad_norm": 0.1266994829459876, + "learning_rate": 1.1084344705530561e-06, + "loss": 0.7413, + "step": 5548 + }, + { + "epoch": 2.7531339208141987, + "grad_norm": 0.1268540066373866, + "learning_rate": 1.1076226309437977e-06, + "loss": 0.6826, + "step": 5549 + }, + { + "epoch": 2.7536303835174385, + "grad_norm": 0.12618417321246705, + "learning_rate": 1.1068110041370938e-06, + "loss": 0.6762, + "step": 5550 + }, + { + "epoch": 2.754126846220678, + "grad_norm": 0.1325579883265857, + "learning_rate": 1.1059995902569911e-06, + "loss": 0.7399, + "step": 5551 + }, + { + "epoch": 2.754623308923917, + "grad_norm": 0.12889480405547812, + "learning_rate": 1.1051883894274998e-06, + "loss": 0.7098, + "step": 5552 + }, + { + "epoch": 2.7551197716271565, + "grad_norm": 0.135934126639656, + "learning_rate": 1.104377401772598e-06, + "loss": 0.7098, + "step": 5553 + }, + { + "epoch": 2.755616234330396, + "grad_norm": 0.1289564414190799, + "learning_rate": 1.1035666274162344e-06, + "loss": 0.7435, + "step": 5554 + }, + { + "epoch": 2.756112697033635, + "grad_norm": 0.12736214807761737, + "learning_rate": 1.1027560664823208e-06, + "loss": 0.6656, + "step": 5555 + }, + { + "epoch": 2.756609159736875, + "grad_norm": 0.12848892854207852, + "learning_rate": 1.10194571909474e-06, + "loss": 0.7014, + "step": 5556 + }, + { + "epoch": 2.7571056224401143, + "grad_norm": 0.1321006717534794, + "learning_rate": 1.10113558537734e-06, + "loss": 0.7015, + "step": 5557 + }, + { + "epoch": 2.7576020851433536, + "grad_norm": 0.12854999523007427, + "learning_rate": 1.100325665453937e-06, + "loss": 0.6767, + "step": 5558 + }, + { + "epoch": 2.758098547846593, + "grad_norm": 0.1279414734388252, + "learning_rate": 1.0995159594483138e-06, + "loss": 0.696, + "step": 5559 + }, + { + "epoch": 2.7585950105498327, + "grad_norm": 0.13005801919260618, + "learning_rate": 1.09870646748422e-06, + "loss": 0.73, + "step": 5560 + }, + { + "epoch": 2.759091473253072, + "grad_norm": 0.1360761488502302, + "learning_rate": 1.0978971896853758e-06, + "loss": 0.7399, + "step": 5561 + }, + { + "epoch": 2.7595879359563114, + "grad_norm": 0.13408584165708903, + "learning_rate": 1.0970881261754641e-06, + "loss": 0.7369, + "step": 5562 + }, + { + "epoch": 2.7600843986595507, + "grad_norm": 0.12871987297525, + "learning_rate": 1.096279277078139e-06, + "loss": 0.7009, + "step": 5563 + }, + { + "epoch": 2.76058086136279, + "grad_norm": 0.12990364606695212, + "learning_rate": 1.0954706425170198e-06, + "loss": 0.6468, + "step": 5564 + }, + { + "epoch": 2.7610773240660293, + "grad_norm": 0.1252919783584692, + "learning_rate": 1.094662222615692e-06, + "loss": 0.7253, + "step": 5565 + }, + { + "epoch": 2.761573786769269, + "grad_norm": 0.1326764590455894, + "learning_rate": 1.0938540174977115e-06, + "loss": 0.7167, + "step": 5566 + }, + { + "epoch": 2.7620702494725085, + "grad_norm": 0.13247387675260933, + "learning_rate": 1.0930460272865976e-06, + "loss": 0.7151, + "step": 5567 + }, + { + "epoch": 2.762566712175748, + "grad_norm": 0.13190057787514423, + "learning_rate": 1.0922382521058405e-06, + "loss": 0.7643, + "step": 5568 + }, + { + "epoch": 2.763063174878987, + "grad_norm": 0.12934627161113424, + "learning_rate": 1.091430692078895e-06, + "loss": 0.7093, + "step": 5569 + }, + { + "epoch": 2.763559637582227, + "grad_norm": 0.1265845815519934, + "learning_rate": 1.0906233473291827e-06, + "loss": 0.6742, + "step": 5570 + }, + { + "epoch": 2.764056100285466, + "grad_norm": 0.1344552360678481, + "learning_rate": 1.0898162179800948e-06, + "loss": 0.7388, + "step": 5571 + }, + { + "epoch": 2.7645525629887056, + "grad_norm": 0.12498891326605503, + "learning_rate": 1.0890093041549873e-06, + "loss": 0.6614, + "step": 5572 + }, + { + "epoch": 2.765049025691945, + "grad_norm": 0.1289570046975174, + "learning_rate": 1.0882026059771845e-06, + "loss": 0.7288, + "step": 5573 + }, + { + "epoch": 2.765545488395184, + "grad_norm": 0.12524071202980924, + "learning_rate": 1.0873961235699759e-06, + "loss": 0.6666, + "step": 5574 + }, + { + "epoch": 2.7660419510984235, + "grad_norm": 0.13322599761967555, + "learning_rate": 1.0865898570566212e-06, + "loss": 0.7645, + "step": 5575 + }, + { + "epoch": 2.766538413801663, + "grad_norm": 0.13519871420996185, + "learning_rate": 1.0857838065603447e-06, + "loss": 0.7605, + "step": 5576 + }, + { + "epoch": 2.7670348765049027, + "grad_norm": 0.13070973410690823, + "learning_rate": 1.084977972204337e-06, + "loss": 0.6832, + "step": 5577 + }, + { + "epoch": 2.767531339208142, + "grad_norm": 0.13143077641170847, + "learning_rate": 1.0841723541117594e-06, + "loss": 0.7178, + "step": 5578 + }, + { + "epoch": 2.7680278019113813, + "grad_norm": 0.13523115464259342, + "learning_rate": 1.083366952405736e-06, + "loss": 0.7738, + "step": 5579 + }, + { + "epoch": 2.768524264614621, + "grad_norm": 0.13073847691026474, + "learning_rate": 1.0825617672093592e-06, + "loss": 0.7124, + "step": 5580 + }, + { + "epoch": 2.7690207273178604, + "grad_norm": 0.12972826652950495, + "learning_rate": 1.0817567986456904e-06, + "loss": 0.6924, + "step": 5581 + }, + { + "epoch": 2.7695171900210998, + "grad_norm": 0.13008197120598594, + "learning_rate": 1.0809520468377541e-06, + "loss": 0.7238, + "step": 5582 + }, + { + "epoch": 2.770013652724339, + "grad_norm": 0.1297808815362704, + "learning_rate": 1.0801475119085455e-06, + "loss": 0.7345, + "step": 5583 + }, + { + "epoch": 2.7705101154275784, + "grad_norm": 0.13507574439437295, + "learning_rate": 1.0793431939810243e-06, + "loss": 0.7121, + "step": 5584 + }, + { + "epoch": 2.7710065781308177, + "grad_norm": 0.1273641822305587, + "learning_rate": 1.0785390931781164e-06, + "loss": 0.6972, + "step": 5585 + }, + { + "epoch": 2.771503040834057, + "grad_norm": 0.13271462621019983, + "learning_rate": 1.0777352096227174e-06, + "loss": 0.7522, + "step": 5586 + }, + { + "epoch": 2.771999503537297, + "grad_norm": 0.13302101809288855, + "learning_rate": 1.076931543437687e-06, + "loss": 0.7542, + "step": 5587 + }, + { + "epoch": 2.772495966240536, + "grad_norm": 0.12899005010841755, + "learning_rate": 1.0761280947458536e-06, + "loss": 0.6468, + "step": 5588 + }, + { + "epoch": 2.7729924289437755, + "grad_norm": 0.12421829548371563, + "learning_rate": 1.0753248636700109e-06, + "loss": 0.6886, + "step": 5589 + }, + { + "epoch": 2.7734888916470153, + "grad_norm": 0.12349016996371331, + "learning_rate": 1.0745218503329196e-06, + "loss": 0.7158, + "step": 5590 + }, + { + "epoch": 2.7739853543502546, + "grad_norm": 0.12644575429599572, + "learning_rate": 1.0737190548573082e-06, + "loss": 0.6762, + "step": 5591 + }, + { + "epoch": 2.774481817053494, + "grad_norm": 0.134593415267307, + "learning_rate": 1.0729164773658692e-06, + "loss": 0.7065, + "step": 5592 + }, + { + "epoch": 2.7749782797567333, + "grad_norm": 0.1260483274577327, + "learning_rate": 1.0721141179812664e-06, + "loss": 0.6827, + "step": 5593 + }, + { + "epoch": 2.7754747424599726, + "grad_norm": 0.12812302335606984, + "learning_rate": 1.0713119768261248e-06, + "loss": 0.6981, + "step": 5594 + }, + { + "epoch": 2.775971205163212, + "grad_norm": 0.12876922838504537, + "learning_rate": 1.0705100540230418e-06, + "loss": 0.7289, + "step": 5595 + }, + { + "epoch": 2.7764676678664513, + "grad_norm": 0.12571235400921374, + "learning_rate": 1.0697083496945766e-06, + "loss": 0.6929, + "step": 5596 + }, + { + "epoch": 2.776964130569691, + "grad_norm": 0.13199476837674784, + "learning_rate": 1.0689068639632563e-06, + "loss": 0.7835, + "step": 5597 + }, + { + "epoch": 2.7774605932729304, + "grad_norm": 0.1273711543359699, + "learning_rate": 1.0681055969515769e-06, + "loss": 0.7014, + "step": 5598 + }, + { + "epoch": 2.7779570559761697, + "grad_norm": 0.1269043028906525, + "learning_rate": 1.0673045487819975e-06, + "loss": 0.69, + "step": 5599 + }, + { + "epoch": 2.7784535186794095, + "grad_norm": 0.1319110742532422, + "learning_rate": 1.066503719576947e-06, + "loss": 0.7461, + "step": 5600 + }, + { + "epoch": 2.778949981382649, + "grad_norm": 0.12579895622801868, + "learning_rate": 1.0657031094588184e-06, + "loss": 0.6868, + "step": 5601 + }, + { + "epoch": 2.779446444085888, + "grad_norm": 0.12882226269354222, + "learning_rate": 1.064902718549972e-06, + "loss": 0.7045, + "step": 5602 + }, + { + "epoch": 2.7799429067891275, + "grad_norm": 0.1270352468548072, + "learning_rate": 1.0641025469727356e-06, + "loss": 0.7087, + "step": 5603 + }, + { + "epoch": 2.780439369492367, + "grad_norm": 0.12900061673731678, + "learning_rate": 1.0633025948494014e-06, + "loss": 0.7028, + "step": 5604 + }, + { + "epoch": 2.780935832195606, + "grad_norm": 0.13016595296281186, + "learning_rate": 1.0625028623022305e-06, + "loss": 0.692, + "step": 5605 + }, + { + "epoch": 2.7814322948988455, + "grad_norm": 0.12885825274574203, + "learning_rate": 1.0617033494534486e-06, + "loss": 0.6744, + "step": 5606 + }, + { + "epoch": 2.7819287576020852, + "grad_norm": 0.12870191436048342, + "learning_rate": 1.0609040564252484e-06, + "loss": 0.6885, + "step": 5607 + }, + { + "epoch": 2.7824252203053246, + "grad_norm": 0.12629534065315853, + "learning_rate": 1.0601049833397892e-06, + "loss": 0.6859, + "step": 5608 + }, + { + "epoch": 2.782921683008564, + "grad_norm": 0.13520278743342204, + "learning_rate": 1.0593061303191954e-06, + "loss": 0.7816, + "step": 5609 + }, + { + "epoch": 2.7834181457118032, + "grad_norm": 0.12636004627171285, + "learning_rate": 1.0585074974855605e-06, + "loss": 0.7254, + "step": 5610 + }, + { + "epoch": 2.783914608415043, + "grad_norm": 0.1258511623485038, + "learning_rate": 1.0577090849609415e-06, + "loss": 0.668, + "step": 5611 + }, + { + "epoch": 2.7844110711182823, + "grad_norm": 0.1268446329679461, + "learning_rate": 1.0569108928673642e-06, + "loss": 0.6671, + "step": 5612 + }, + { + "epoch": 2.7849075338215217, + "grad_norm": 0.13239241247612957, + "learning_rate": 1.0561129213268187e-06, + "loss": 0.7689, + "step": 5613 + }, + { + "epoch": 2.785403996524761, + "grad_norm": 0.1255651588944757, + "learning_rate": 1.0553151704612614e-06, + "loss": 0.6884, + "step": 5614 + }, + { + "epoch": 2.7859004592280003, + "grad_norm": 0.12247730286780059, + "learning_rate": 1.0545176403926172e-06, + "loss": 0.6955, + "step": 5615 + }, + { + "epoch": 2.7863969219312397, + "grad_norm": 0.12958608013612596, + "learning_rate": 1.0537203312427752e-06, + "loss": 0.6482, + "step": 5616 + }, + { + "epoch": 2.7868933846344794, + "grad_norm": 0.1228441492665452, + "learning_rate": 1.0529232431335903e-06, + "loss": 0.6562, + "step": 5617 + }, + { + "epoch": 2.7873898473377188, + "grad_norm": 0.13327477747426705, + "learning_rate": 1.0521263761868866e-06, + "loss": 0.6961, + "step": 5618 + }, + { + "epoch": 2.787886310040958, + "grad_norm": 0.1350215485186969, + "learning_rate": 1.0513297305244507e-06, + "loss": 0.718, + "step": 5619 + }, + { + "epoch": 2.7883827727441974, + "grad_norm": 0.1308450394330969, + "learning_rate": 1.0505333062680383e-06, + "loss": 0.7255, + "step": 5620 + }, + { + "epoch": 2.788879235447437, + "grad_norm": 0.1280745953181878, + "learning_rate": 1.04973710353937e-06, + "loss": 0.7049, + "step": 5621 + }, + { + "epoch": 2.7893756981506765, + "grad_norm": 0.1311328777593363, + "learning_rate": 1.048941122460132e-06, + "loss": 0.7186, + "step": 5622 + }, + { + "epoch": 2.789872160853916, + "grad_norm": 0.12796332817853948, + "learning_rate": 1.0481453631519775e-06, + "loss": 0.6986, + "step": 5623 + }, + { + "epoch": 2.790368623557155, + "grad_norm": 0.13077728648409898, + "learning_rate": 1.0473498257365247e-06, + "loss": 0.7143, + "step": 5624 + }, + { + "epoch": 2.7908650862603945, + "grad_norm": 0.12948794453473453, + "learning_rate": 1.0465545103353605e-06, + "loss": 0.7077, + "step": 5625 + }, + { + "epoch": 2.791361548963634, + "grad_norm": 0.12937599313169448, + "learning_rate": 1.0457594170700342e-06, + "loss": 0.7617, + "step": 5626 + }, + { + "epoch": 2.7918580116668736, + "grad_norm": 0.12552421196790298, + "learning_rate": 1.044964546062065e-06, + "loss": 0.6971, + "step": 5627 + }, + { + "epoch": 2.792354474370113, + "grad_norm": 0.128355301555478, + "learning_rate": 1.0441698974329351e-06, + "loss": 0.6843, + "step": 5628 + }, + { + "epoch": 2.7928509370733523, + "grad_norm": 0.13577852067425666, + "learning_rate": 1.043375471304093e-06, + "loss": 0.6869, + "step": 5629 + }, + { + "epoch": 2.7933473997765916, + "grad_norm": 0.12864716094515163, + "learning_rate": 1.0425812677969558e-06, + "loss": 0.7449, + "step": 5630 + }, + { + "epoch": 2.7938438624798314, + "grad_norm": 0.1293266492985022, + "learning_rate": 1.0417872870329029e-06, + "loss": 0.6797, + "step": 5631 + }, + { + "epoch": 2.7943403251830707, + "grad_norm": 0.1225513439668755, + "learning_rate": 1.0409935291332838e-06, + "loss": 0.6412, + "step": 5632 + }, + { + "epoch": 2.79483678788631, + "grad_norm": 0.12592530737282465, + "learning_rate": 1.04019999421941e-06, + "loss": 0.6889, + "step": 5633 + }, + { + "epoch": 2.7953332505895494, + "grad_norm": 0.12572434567997356, + "learning_rate": 1.0394066824125604e-06, + "loss": 0.6637, + "step": 5634 + }, + { + "epoch": 2.7958297132927887, + "grad_norm": 0.12830761221708809, + "learning_rate": 1.0386135938339812e-06, + "loss": 0.7142, + "step": 5635 + }, + { + "epoch": 2.796326175996028, + "grad_norm": 0.12804406659034892, + "learning_rate": 1.0378207286048817e-06, + "loss": 0.6682, + "step": 5636 + }, + { + "epoch": 2.796822638699268, + "grad_norm": 0.1306245576613395, + "learning_rate": 1.0370280868464405e-06, + "loss": 0.6909, + "step": 5637 + }, + { + "epoch": 2.797319101402507, + "grad_norm": 0.13224450154587467, + "learning_rate": 1.0362356686797996e-06, + "loss": 0.7285, + "step": 5638 + }, + { + "epoch": 2.7978155641057465, + "grad_norm": 0.1265750971979006, + "learning_rate": 1.0354434742260665e-06, + "loss": 0.6563, + "step": 5639 + }, + { + "epoch": 2.798312026808986, + "grad_norm": 0.1255205324161622, + "learning_rate": 1.0346515036063165e-06, + "loss": 0.6817, + "step": 5640 + }, + { + "epoch": 2.7988084895122256, + "grad_norm": 0.12545489772850293, + "learning_rate": 1.033859756941588e-06, + "loss": 0.6674, + "step": 5641 + }, + { + "epoch": 2.799304952215465, + "grad_norm": 0.12709075941299228, + "learning_rate": 1.0330682343528886e-06, + "loss": 0.6704, + "step": 5642 + }, + { + "epoch": 2.7998014149187043, + "grad_norm": 0.1267872987559885, + "learning_rate": 1.0322769359611883e-06, + "loss": 0.6957, + "step": 5643 + }, + { + "epoch": 2.8002978776219436, + "grad_norm": 0.13612904687065247, + "learning_rate": 1.0314858618874263e-06, + "loss": 0.7679, + "step": 5644 + }, + { + "epoch": 2.800794340325183, + "grad_norm": 0.12963249148574502, + "learning_rate": 1.0306950122525042e-06, + "loss": 0.66, + "step": 5645 + }, + { + "epoch": 2.8012908030284223, + "grad_norm": 0.12775287910822225, + "learning_rate": 1.0299043871772904e-06, + "loss": 0.6721, + "step": 5646 + }, + { + "epoch": 2.801787265731662, + "grad_norm": 0.13139858927008313, + "learning_rate": 1.0291139867826205e-06, + "loss": 0.6754, + "step": 5647 + }, + { + "epoch": 2.8022837284349014, + "grad_norm": 0.13065899559643898, + "learning_rate": 1.0283238111892929e-06, + "loss": 0.6853, + "step": 5648 + }, + { + "epoch": 2.8027801911381407, + "grad_norm": 0.13055758162862297, + "learning_rate": 1.0275338605180751e-06, + "loss": 0.7194, + "step": 5649 + }, + { + "epoch": 2.80327665384138, + "grad_norm": 0.13385534399710408, + "learning_rate": 1.0267441348896978e-06, + "loss": 0.703, + "step": 5650 + }, + { + "epoch": 2.80377311654462, + "grad_norm": 0.12413925075970825, + "learning_rate": 1.0259546344248567e-06, + "loss": 0.6408, + "step": 5651 + }, + { + "epoch": 2.804269579247859, + "grad_norm": 0.13054886800472554, + "learning_rate": 1.0251653592442157e-06, + "loss": 0.7052, + "step": 5652 + }, + { + "epoch": 2.8047660419510985, + "grad_norm": 0.13047999060457022, + "learning_rate": 1.0243763094684029e-06, + "loss": 0.6872, + "step": 5653 + }, + { + "epoch": 2.805262504654338, + "grad_norm": 0.1273801577355027, + "learning_rate": 1.0235874852180109e-06, + "loss": 0.6741, + "step": 5654 + }, + { + "epoch": 2.805758967357577, + "grad_norm": 0.1324495467661752, + "learning_rate": 1.0227988866135995e-06, + "loss": 0.7374, + "step": 5655 + }, + { + "epoch": 2.8062554300608165, + "grad_norm": 0.12792054360102523, + "learning_rate": 1.0220105137756923e-06, + "loss": 0.7069, + "step": 5656 + }, + { + "epoch": 2.8067518927640562, + "grad_norm": 0.1351583667186328, + "learning_rate": 1.021222366824781e-06, + "loss": 0.7174, + "step": 5657 + }, + { + "epoch": 2.8072483554672956, + "grad_norm": 0.13071189164436514, + "learning_rate": 1.0204344458813201e-06, + "loss": 0.7505, + "step": 5658 + }, + { + "epoch": 2.807744818170535, + "grad_norm": 0.12840479043760866, + "learning_rate": 1.0196467510657315e-06, + "loss": 0.7161, + "step": 5659 + }, + { + "epoch": 2.8082412808737742, + "grad_norm": 0.12700151293719608, + "learning_rate": 1.0188592824984018e-06, + "loss": 0.713, + "step": 5660 + }, + { + "epoch": 2.808737743577014, + "grad_norm": 0.1246492088887146, + "learning_rate": 1.0180720402996814e-06, + "loss": 0.698, + "step": 5661 + }, + { + "epoch": 2.8092342062802533, + "grad_norm": 0.13557496235898936, + "learning_rate": 1.0172850245898893e-06, + "loss": 0.7027, + "step": 5662 + }, + { + "epoch": 2.8097306689834927, + "grad_norm": 0.13048869432963878, + "learning_rate": 1.0164982354893072e-06, + "loss": 0.7034, + "step": 5663 + }, + { + "epoch": 2.810227131686732, + "grad_norm": 0.1309842813142355, + "learning_rate": 1.0157116731181844e-06, + "loss": 0.7185, + "step": 5664 + }, + { + "epoch": 2.8107235943899713, + "grad_norm": 0.12466311030204848, + "learning_rate": 1.0149253375967336e-06, + "loss": 0.6622, + "step": 5665 + }, + { + "epoch": 2.8112200570932107, + "grad_norm": 0.12962837245694803, + "learning_rate": 1.0141392290451327e-06, + "loss": 0.6915, + "step": 5666 + }, + { + "epoch": 2.8117165197964504, + "grad_norm": 0.12989363238833468, + "learning_rate": 1.0133533475835275e-06, + "loss": 0.7217, + "step": 5667 + }, + { + "epoch": 2.8122129824996898, + "grad_norm": 0.12706569640249818, + "learning_rate": 1.0125676933320258e-06, + "loss": 0.6702, + "step": 5668 + }, + { + "epoch": 2.812709445202929, + "grad_norm": 0.1304053742764276, + "learning_rate": 1.0117822664107039e-06, + "loss": 0.6888, + "step": 5669 + }, + { + "epoch": 2.8132059079061684, + "grad_norm": 0.12731183078943037, + "learning_rate": 1.0109970669396008e-06, + "loss": 0.7023, + "step": 5670 + }, + { + "epoch": 2.813702370609408, + "grad_norm": 0.12732905106726986, + "learning_rate": 1.0102120950387217e-06, + "loss": 0.7226, + "step": 5671 + }, + { + "epoch": 2.8141988333126475, + "grad_norm": 0.1306176241196579, + "learning_rate": 1.0094273508280369e-06, + "loss": 0.7277, + "step": 5672 + }, + { + "epoch": 2.814695296015887, + "grad_norm": 0.12671882523359002, + "learning_rate": 1.008642834427481e-06, + "loss": 0.7285, + "step": 5673 + }, + { + "epoch": 2.815191758719126, + "grad_norm": 0.13283779420386785, + "learning_rate": 1.0078585459569568e-06, + "loss": 0.7146, + "step": 5674 + }, + { + "epoch": 2.8156882214223655, + "grad_norm": 0.12814418090121601, + "learning_rate": 1.0070744855363283e-06, + "loss": 0.7038, + "step": 5675 + }, + { + "epoch": 2.816184684125605, + "grad_norm": 0.12617325806218707, + "learning_rate": 1.0062906532854284e-06, + "loss": 0.6782, + "step": 5676 + }, + { + "epoch": 2.8166811468288446, + "grad_norm": 0.1304282198590571, + "learning_rate": 1.0055070493240521e-06, + "loss": 0.7038, + "step": 5677 + }, + { + "epoch": 2.817177609532084, + "grad_norm": 0.12890829179767882, + "learning_rate": 1.00472367377196e-06, + "loss": 0.6802, + "step": 5678 + }, + { + "epoch": 2.8176740722353233, + "grad_norm": 0.13108770095262734, + "learning_rate": 1.0039405267488805e-06, + "loss": 0.7088, + "step": 5679 + }, + { + "epoch": 2.8181705349385626, + "grad_norm": 0.12760496474052235, + "learning_rate": 1.003157608374503e-06, + "loss": 0.6967, + "step": 5680 + }, + { + "epoch": 2.8186669976418024, + "grad_norm": 0.12724173087136592, + "learning_rate": 1.002374918768486e-06, + "loss": 0.6888, + "step": 5681 + }, + { + "epoch": 2.8191634603450417, + "grad_norm": 0.1281546403873397, + "learning_rate": 1.0015924580504502e-06, + "loss": 0.6831, + "step": 5682 + }, + { + "epoch": 2.819659923048281, + "grad_norm": 0.1306599446069587, + "learning_rate": 1.000810226339981e-06, + "loss": 0.6949, + "step": 5683 + }, + { + "epoch": 2.8201563857515204, + "grad_norm": 0.13275998082707438, + "learning_rate": 1.000028223756632e-06, + "loss": 0.7333, + "step": 5684 + }, + { + "epoch": 2.8206528484547597, + "grad_norm": 0.1259944276365557, + "learning_rate": 9.99246450419918e-07, + "loss": 0.7038, + "step": 5685 + }, + { + "epoch": 2.821149311157999, + "grad_norm": 0.1258776313988055, + "learning_rate": 9.984649064493228e-07, + "loss": 0.6935, + "step": 5686 + }, + { + "epoch": 2.821645773861239, + "grad_norm": 0.1277780519819066, + "learning_rate": 9.976835919642897e-07, + "loss": 0.6888, + "step": 5687 + }, + { + "epoch": 2.822142236564478, + "grad_norm": 0.1255875204317941, + "learning_rate": 9.969025070842326e-07, + "loss": 0.7692, + "step": 5688 + }, + { + "epoch": 2.8226386992677175, + "grad_norm": 0.12833215364936337, + "learning_rate": 9.961216519285272e-07, + "loss": 0.6744, + "step": 5689 + }, + { + "epoch": 2.823135161970957, + "grad_norm": 0.1316740562970518, + "learning_rate": 9.953410266165131e-07, + "loss": 0.7523, + "step": 5690 + }, + { + "epoch": 2.8236316246741966, + "grad_norm": 0.12733567410451435, + "learning_rate": 9.945606312674991e-07, + "loss": 0.6814, + "step": 5691 + }, + { + "epoch": 2.824128087377436, + "grad_norm": 0.13152963950218258, + "learning_rate": 9.937804660007545e-07, + "loss": 0.7384, + "step": 5692 + }, + { + "epoch": 2.8246245500806753, + "grad_norm": 0.12905971140335962, + "learning_rate": 9.930005309355143e-07, + "loss": 0.6819, + "step": 5693 + }, + { + "epoch": 2.8251210127839146, + "grad_norm": 0.1293289412286106, + "learning_rate": 9.922208261909811e-07, + "loss": 0.6689, + "step": 5694 + }, + { + "epoch": 2.825617475487154, + "grad_norm": 0.12656866487417895, + "learning_rate": 9.91441351886318e-07, + "loss": 0.671, + "step": 5695 + }, + { + "epoch": 2.8261139381903932, + "grad_norm": 0.12682407823609218, + "learning_rate": 9.906621081406575e-07, + "loss": 0.7023, + "step": 5696 + }, + { + "epoch": 2.826610400893633, + "grad_norm": 0.12481510953803818, + "learning_rate": 9.898830950730935e-07, + "loss": 0.7291, + "step": 5697 + }, + { + "epoch": 2.8271068635968724, + "grad_norm": 0.13429126569598215, + "learning_rate": 9.891043128026846e-07, + "loss": 0.75, + "step": 5698 + }, + { + "epoch": 2.8276033263001117, + "grad_norm": 0.12713270388437029, + "learning_rate": 9.883257614484568e-07, + "loss": 0.6696, + "step": 5699 + }, + { + "epoch": 2.828099789003351, + "grad_norm": 0.13204195693146775, + "learning_rate": 9.875474411293977e-07, + "loss": 0.7444, + "step": 5700 + }, + { + "epoch": 2.828596251706591, + "grad_norm": 0.12538390670171082, + "learning_rate": 9.86769351964463e-07, + "loss": 0.6943, + "step": 5701 + }, + { + "epoch": 2.82909271440983, + "grad_norm": 0.12942426088477443, + "learning_rate": 9.859914940725698e-07, + "loss": 0.6697, + "step": 5702 + }, + { + "epoch": 2.8295891771130695, + "grad_norm": 0.13073956206544834, + "learning_rate": 9.852138675726014e-07, + "loss": 0.7027, + "step": 5703 + }, + { + "epoch": 2.830085639816309, + "grad_norm": 0.13192378873945426, + "learning_rate": 9.844364725834058e-07, + "loss": 0.6702, + "step": 5704 + }, + { + "epoch": 2.830582102519548, + "grad_norm": 0.13045843127382814, + "learning_rate": 9.836593092237942e-07, + "loss": 0.7186, + "step": 5705 + }, + { + "epoch": 2.8310785652227874, + "grad_norm": 0.13209103372271133, + "learning_rate": 9.828823776125455e-07, + "loss": 0.6726, + "step": 5706 + }, + { + "epoch": 2.831575027926027, + "grad_norm": 0.12836854761747843, + "learning_rate": 9.82105677868399e-07, + "loss": 0.6604, + "step": 5707 + }, + { + "epoch": 2.8320714906292666, + "grad_norm": 0.12440008092199902, + "learning_rate": 9.813292101100634e-07, + "loss": 0.6939, + "step": 5708 + }, + { + "epoch": 2.832567953332506, + "grad_norm": 0.13509346075273426, + "learning_rate": 9.805529744562076e-07, + "loss": 0.7395, + "step": 5709 + }, + { + "epoch": 2.833064416035745, + "grad_norm": 0.12453204951253816, + "learning_rate": 9.797769710254664e-07, + "loss": 0.6882, + "step": 5710 + }, + { + "epoch": 2.833560878738985, + "grad_norm": 0.12697842177012855, + "learning_rate": 9.790011999364412e-07, + "loss": 0.7063, + "step": 5711 + }, + { + "epoch": 2.8340573414422243, + "grad_norm": 0.12754534007734417, + "learning_rate": 9.782256613076945e-07, + "loss": 0.7482, + "step": 5712 + }, + { + "epoch": 2.8345538041454637, + "grad_norm": 0.13069481659575172, + "learning_rate": 9.774503552577563e-07, + "loss": 0.6936, + "step": 5713 + }, + { + "epoch": 2.835050266848703, + "grad_norm": 0.13974772554638237, + "learning_rate": 9.766752819051192e-07, + "loss": 0.717, + "step": 5714 + }, + { + "epoch": 2.8355467295519423, + "grad_norm": 0.1251939515003143, + "learning_rate": 9.759004413682396e-07, + "loss": 0.6509, + "step": 5715 + }, + { + "epoch": 2.8360431922551816, + "grad_norm": 0.1312396524534019, + "learning_rate": 9.751258337655418e-07, + "loss": 0.7324, + "step": 5716 + }, + { + "epoch": 2.836539654958421, + "grad_norm": 0.13060634236111196, + "learning_rate": 9.743514592154093e-07, + "loss": 0.732, + "step": 5717 + }, + { + "epoch": 2.8370361176616608, + "grad_norm": 0.12983746553461947, + "learning_rate": 9.735773178361965e-07, + "loss": 0.7217, + "step": 5718 + }, + { + "epoch": 2.8375325803649, + "grad_norm": 0.1313030207597098, + "learning_rate": 9.728034097462144e-07, + "loss": 0.7298, + "step": 5719 + }, + { + "epoch": 2.8380290430681394, + "grad_norm": 0.1305730115816129, + "learning_rate": 9.720297350637453e-07, + "loss": 0.73, + "step": 5720 + }, + { + "epoch": 2.838525505771379, + "grad_norm": 0.12959419549059026, + "learning_rate": 9.712562939070322e-07, + "loss": 0.7013, + "step": 5721 + }, + { + "epoch": 2.8390219684746185, + "grad_norm": 0.13353606316263741, + "learning_rate": 9.704830863942819e-07, + "loss": 0.6893, + "step": 5722 + }, + { + "epoch": 2.839518431177858, + "grad_norm": 0.12927989012251478, + "learning_rate": 9.697101126436689e-07, + "loss": 0.6941, + "step": 5723 + }, + { + "epoch": 2.840014893881097, + "grad_norm": 0.12460555850658463, + "learning_rate": 9.689373727733282e-07, + "loss": 0.6749, + "step": 5724 + }, + { + "epoch": 2.8405113565843365, + "grad_norm": 0.13017346927566242, + "learning_rate": 9.681648669013618e-07, + "loss": 0.6884, + "step": 5725 + }, + { + "epoch": 2.841007819287576, + "grad_norm": 0.12587233851865584, + "learning_rate": 9.673925951458347e-07, + "loss": 0.6782, + "step": 5726 + }, + { + "epoch": 2.841504281990815, + "grad_norm": 0.12769491177786568, + "learning_rate": 9.66620557624775e-07, + "loss": 0.7175, + "step": 5727 + }, + { + "epoch": 2.842000744694055, + "grad_norm": 0.13179110270409086, + "learning_rate": 9.658487544561778e-07, + "loss": 0.7183, + "step": 5728 + }, + { + "epoch": 2.8424972073972943, + "grad_norm": 0.13106533480871427, + "learning_rate": 9.650771857580007e-07, + "loss": 0.7449, + "step": 5729 + }, + { + "epoch": 2.8429936701005336, + "grad_norm": 0.12932936151209334, + "learning_rate": 9.64305851648164e-07, + "loss": 0.7064, + "step": 5730 + }, + { + "epoch": 2.8434901328037734, + "grad_norm": 0.13312505296398447, + "learning_rate": 9.63534752244556e-07, + "loss": 0.763, + "step": 5731 + }, + { + "epoch": 2.8439865955070127, + "grad_norm": 0.12320453586612351, + "learning_rate": 9.627638876650245e-07, + "loss": 0.6423, + "step": 5732 + }, + { + "epoch": 2.844483058210252, + "grad_norm": 0.12794077575330426, + "learning_rate": 9.61993258027386e-07, + "loss": 0.6768, + "step": 5733 + }, + { + "epoch": 2.8449795209134914, + "grad_norm": 0.1266752168624091, + "learning_rate": 9.612228634494184e-07, + "loss": 0.6743, + "step": 5734 + }, + { + "epoch": 2.8454759836167307, + "grad_norm": 0.12653827532865938, + "learning_rate": 9.604527040488631e-07, + "loss": 0.6873, + "step": 5735 + }, + { + "epoch": 2.84597244631997, + "grad_norm": 0.13105455883016892, + "learning_rate": 9.596827799434278e-07, + "loss": 0.7279, + "step": 5736 + }, + { + "epoch": 2.8464689090232094, + "grad_norm": 0.12824418068445376, + "learning_rate": 9.589130912507812e-07, + "loss": 0.6876, + "step": 5737 + }, + { + "epoch": 2.846965371726449, + "grad_norm": 0.12298579417263769, + "learning_rate": 9.581436380885604e-07, + "loss": 0.6416, + "step": 5738 + }, + { + "epoch": 2.8474618344296885, + "grad_norm": 0.12704521892342857, + "learning_rate": 9.573744205743613e-07, + "loss": 0.7144, + "step": 5739 + }, + { + "epoch": 2.847958297132928, + "grad_norm": 0.13151764091738496, + "learning_rate": 9.566054388257492e-07, + "loss": 0.6985, + "step": 5740 + }, + { + "epoch": 2.8484547598361676, + "grad_norm": 0.12281688304650965, + "learning_rate": 9.558366929602492e-07, + "loss": 0.6525, + "step": 5741 + }, + { + "epoch": 2.848951222539407, + "grad_norm": 0.12985743528766544, + "learning_rate": 9.55068183095351e-07, + "loss": 0.7118, + "step": 5742 + }, + { + "epoch": 2.8494476852426462, + "grad_norm": 0.12697246716589147, + "learning_rate": 9.542999093485108e-07, + "loss": 0.686, + "step": 5743 + }, + { + "epoch": 2.8499441479458856, + "grad_norm": 0.12891175104431518, + "learning_rate": 9.535318718371453e-07, + "loss": 0.7149, + "step": 5744 + }, + { + "epoch": 2.850440610649125, + "grad_norm": 0.12952157865563893, + "learning_rate": 9.527640706786381e-07, + "loss": 0.6881, + "step": 5745 + }, + { + "epoch": 2.8509370733523642, + "grad_norm": 0.13148893826296196, + "learning_rate": 9.519965059903349e-07, + "loss": 0.6798, + "step": 5746 + }, + { + "epoch": 2.8514335360556036, + "grad_norm": 0.13521982035020255, + "learning_rate": 9.512291778895444e-07, + "loss": 0.7152, + "step": 5747 + }, + { + "epoch": 2.8519299987588433, + "grad_norm": 0.13512198272074397, + "learning_rate": 9.504620864935421e-07, + "loss": 0.7148, + "step": 5748 + }, + { + "epoch": 2.8524264614620827, + "grad_norm": 0.1266177814142838, + "learning_rate": 9.496952319195643e-07, + "loss": 0.6658, + "step": 5749 + }, + { + "epoch": 2.852922924165322, + "grad_norm": 0.12979818368870435, + "learning_rate": 9.489286142848148e-07, + "loss": 0.7075, + "step": 5750 + }, + { + "epoch": 2.8534193868685613, + "grad_norm": 0.12554115281420927, + "learning_rate": 9.481622337064552e-07, + "loss": 0.6843, + "step": 5751 + }, + { + "epoch": 2.853915849571801, + "grad_norm": 0.1320343536961316, + "learning_rate": 9.473960903016175e-07, + "loss": 0.6923, + "step": 5752 + }, + { + "epoch": 2.8544123122750404, + "grad_norm": 0.12386913184376706, + "learning_rate": 9.466301841873929e-07, + "loss": 0.6594, + "step": 5753 + }, + { + "epoch": 2.8549087749782798, + "grad_norm": 0.13316531050788066, + "learning_rate": 9.458645154808377e-07, + "loss": 0.7124, + "step": 5754 + }, + { + "epoch": 2.855405237681519, + "grad_norm": 0.1300731124347825, + "learning_rate": 9.450990842989732e-07, + "loss": 0.709, + "step": 5755 + }, + { + "epoch": 2.8559017003847584, + "grad_norm": 0.12945665805477527, + "learning_rate": 9.443338907587821e-07, + "loss": 0.7232, + "step": 5756 + }, + { + "epoch": 2.8563981630879978, + "grad_norm": 0.13305413501745098, + "learning_rate": 9.435689349772135e-07, + "loss": 0.7081, + "step": 5757 + }, + { + "epoch": 2.8568946257912375, + "grad_norm": 0.1283285050578409, + "learning_rate": 9.428042170711776e-07, + "loss": 0.7441, + "step": 5758 + }, + { + "epoch": 2.857391088494477, + "grad_norm": 0.13256787722435986, + "learning_rate": 9.420397371575485e-07, + "loss": 0.7147, + "step": 5759 + }, + { + "epoch": 2.857887551197716, + "grad_norm": 0.13501707551704034, + "learning_rate": 9.412754953531664e-07, + "loss": 0.7473, + "step": 5760 + }, + { + "epoch": 2.8583840139009555, + "grad_norm": 0.12779455860264854, + "learning_rate": 9.405114917748318e-07, + "loss": 0.6941, + "step": 5761 + }, + { + "epoch": 2.8588804766041953, + "grad_norm": 0.13350164440946205, + "learning_rate": 9.397477265393121e-07, + "loss": 0.7079, + "step": 5762 + }, + { + "epoch": 2.8593769393074346, + "grad_norm": 0.12695906616841662, + "learning_rate": 9.389841997633356e-07, + "loss": 0.7138, + "step": 5763 + }, + { + "epoch": 2.859873402010674, + "grad_norm": 0.1339764757838581, + "learning_rate": 9.382209115635942e-07, + "loss": 0.686, + "step": 5764 + }, + { + "epoch": 2.8603698647139133, + "grad_norm": 0.13278274095995127, + "learning_rate": 9.374578620567462e-07, + "loss": 0.7039, + "step": 5765 + }, + { + "epoch": 2.8608663274171526, + "grad_norm": 0.1277777239083202, + "learning_rate": 9.366950513594106e-07, + "loss": 0.7353, + "step": 5766 + }, + { + "epoch": 2.861362790120392, + "grad_norm": 0.12832944909129318, + "learning_rate": 9.359324795881708e-07, + "loss": 0.726, + "step": 5767 + }, + { + "epoch": 2.8618592528236317, + "grad_norm": 0.12594067772100392, + "learning_rate": 9.351701468595734e-07, + "loss": 0.7546, + "step": 5768 + }, + { + "epoch": 2.862355715526871, + "grad_norm": 0.128006328679492, + "learning_rate": 9.344080532901282e-07, + "loss": 0.708, + "step": 5769 + }, + { + "epoch": 2.8628521782301104, + "grad_norm": 0.1275045024635113, + "learning_rate": 9.336461989963102e-07, + "loss": 0.6734, + "step": 5770 + }, + { + "epoch": 2.8633486409333497, + "grad_norm": 0.1414589237121449, + "learning_rate": 9.328845840945555e-07, + "loss": 0.7276, + "step": 5771 + }, + { + "epoch": 2.8638451036365895, + "grad_norm": 0.12748603017529134, + "learning_rate": 9.321232087012664e-07, + "loss": 0.6894, + "step": 5772 + }, + { + "epoch": 2.864341566339829, + "grad_norm": 0.12968727130669763, + "learning_rate": 9.313620729328055e-07, + "loss": 0.6977, + "step": 5773 + }, + { + "epoch": 2.864838029043068, + "grad_norm": 0.12740734697560116, + "learning_rate": 9.306011769054999e-07, + "loss": 0.702, + "step": 5774 + }, + { + "epoch": 2.8653344917463075, + "grad_norm": 0.13202429187909928, + "learning_rate": 9.298405207356418e-07, + "loss": 0.7235, + "step": 5775 + }, + { + "epoch": 2.865830954449547, + "grad_norm": 0.12516118343049698, + "learning_rate": 9.290801045394837e-07, + "loss": 0.6962, + "step": 5776 + }, + { + "epoch": 2.866327417152786, + "grad_norm": 0.12466908791148393, + "learning_rate": 9.283199284332448e-07, + "loss": 0.6828, + "step": 5777 + }, + { + "epoch": 2.866823879856026, + "grad_norm": 0.13168847619221924, + "learning_rate": 9.275599925331047e-07, + "loss": 0.7276, + "step": 5778 + }, + { + "epoch": 2.8673203425592653, + "grad_norm": 0.12380760924529202, + "learning_rate": 9.268002969552068e-07, + "loss": 0.7109, + "step": 5779 + }, + { + "epoch": 2.8678168052625046, + "grad_norm": 0.1273101006035769, + "learning_rate": 9.260408418156597e-07, + "loss": 0.7053, + "step": 5780 + }, + { + "epoch": 2.868313267965744, + "grad_norm": 0.12716786520734547, + "learning_rate": 9.252816272305328e-07, + "loss": 0.6563, + "step": 5781 + }, + { + "epoch": 2.8688097306689837, + "grad_norm": 0.12534292127359717, + "learning_rate": 9.245226533158624e-07, + "loss": 0.6822, + "step": 5782 + }, + { + "epoch": 2.869306193372223, + "grad_norm": 0.12864761504663866, + "learning_rate": 9.237639201876417e-07, + "loss": 0.7303, + "step": 5783 + }, + { + "epoch": 2.8698026560754624, + "grad_norm": 0.12941975850294216, + "learning_rate": 9.230054279618336e-07, + "loss": 0.6791, + "step": 5784 + }, + { + "epoch": 2.8702991187787017, + "grad_norm": 0.12715616195640328, + "learning_rate": 9.222471767543608e-07, + "loss": 0.677, + "step": 5785 + }, + { + "epoch": 2.870795581481941, + "grad_norm": 0.1354882762503265, + "learning_rate": 9.214891666811088e-07, + "loss": 0.7547, + "step": 5786 + }, + { + "epoch": 2.8712920441851804, + "grad_norm": 0.12912527608146743, + "learning_rate": 9.207313978579289e-07, + "loss": 0.6888, + "step": 5787 + }, + { + "epoch": 2.87178850688842, + "grad_norm": 0.1349499512563821, + "learning_rate": 9.199738704006322e-07, + "loss": 0.7308, + "step": 5788 + }, + { + "epoch": 2.8722849695916595, + "grad_norm": 0.1306875820845483, + "learning_rate": 9.192165844249967e-07, + "loss": 0.6544, + "step": 5789 + }, + { + "epoch": 2.872781432294899, + "grad_norm": 0.1284137449808761, + "learning_rate": 9.184595400467605e-07, + "loss": 0.6812, + "step": 5790 + }, + { + "epoch": 2.873277894998138, + "grad_norm": 0.12848694849478118, + "learning_rate": 9.177027373816244e-07, + "loss": 0.7192, + "step": 5791 + }, + { + "epoch": 2.873774357701378, + "grad_norm": 0.12937550210725762, + "learning_rate": 9.169461765452556e-07, + "loss": 0.6748, + "step": 5792 + }, + { + "epoch": 2.8742708204046172, + "grad_norm": 0.12782097735362807, + "learning_rate": 9.161898576532805e-07, + "loss": 0.6797, + "step": 5793 + }, + { + "epoch": 2.8747672831078566, + "grad_norm": 0.1305376367487405, + "learning_rate": 9.154337808212921e-07, + "loss": 0.6827, + "step": 5794 + }, + { + "epoch": 2.875263745811096, + "grad_norm": 0.12880050176675548, + "learning_rate": 9.146779461648437e-07, + "loss": 0.6894, + "step": 5795 + }, + { + "epoch": 2.8757602085143352, + "grad_norm": 0.1259680592660518, + "learning_rate": 9.139223537994519e-07, + "loss": 0.6695, + "step": 5796 + }, + { + "epoch": 2.8762566712175746, + "grad_norm": 0.12642296565251043, + "learning_rate": 9.131670038405979e-07, + "loss": 0.674, + "step": 5797 + }, + { + "epoch": 2.8767531339208143, + "grad_norm": 0.12515161938813915, + "learning_rate": 9.124118964037246e-07, + "loss": 0.6419, + "step": 5798 + }, + { + "epoch": 2.8772495966240537, + "grad_norm": 0.13621267258149164, + "learning_rate": 9.11657031604238e-07, + "loss": 0.764, + "step": 5799 + }, + { + "epoch": 2.877746059327293, + "grad_norm": 0.1267504655200429, + "learning_rate": 9.109024095575062e-07, + "loss": 0.6827, + "step": 5800 + }, + { + "epoch": 2.8782425220305323, + "grad_norm": 0.12834380780687052, + "learning_rate": 9.101480303788623e-07, + "loss": 0.7056, + "step": 5801 + }, + { + "epoch": 2.878738984733772, + "grad_norm": 0.14054019031479334, + "learning_rate": 9.093938941836012e-07, + "loss": 0.7741, + "step": 5802 + }, + { + "epoch": 2.8792354474370114, + "grad_norm": 0.1292406739225414, + "learning_rate": 9.086400010869787e-07, + "loss": 0.6723, + "step": 5803 + }, + { + "epoch": 2.8797319101402508, + "grad_norm": 0.13155167047598984, + "learning_rate": 9.078863512042174e-07, + "loss": 0.6905, + "step": 5804 + }, + { + "epoch": 2.88022837284349, + "grad_norm": 0.128899843641201, + "learning_rate": 9.071329446504997e-07, + "loss": 0.6903, + "step": 5805 + }, + { + "epoch": 2.8807248355467294, + "grad_norm": 0.12938294267895303, + "learning_rate": 9.063797815409711e-07, + "loss": 0.7468, + "step": 5806 + }, + { + "epoch": 2.8812212982499688, + "grad_norm": 0.1302289199669989, + "learning_rate": 9.056268619907418e-07, + "loss": 0.704, + "step": 5807 + }, + { + "epoch": 2.8817177609532085, + "grad_norm": 0.12864445639975267, + "learning_rate": 9.048741861148822e-07, + "loss": 0.6994, + "step": 5808 + }, + { + "epoch": 2.882214223656448, + "grad_norm": 0.12590829953054358, + "learning_rate": 9.041217540284278e-07, + "loss": 0.6584, + "step": 5809 + }, + { + "epoch": 2.882710686359687, + "grad_norm": 0.13423088059524024, + "learning_rate": 9.033695658463757e-07, + "loss": 0.705, + "step": 5810 + }, + { + "epoch": 2.8832071490629265, + "grad_norm": 0.12841940675440747, + "learning_rate": 9.026176216836844e-07, + "loss": 0.6997, + "step": 5811 + }, + { + "epoch": 2.8837036117661663, + "grad_norm": 0.1301253908505875, + "learning_rate": 9.018659216552783e-07, + "loss": 0.7129, + "step": 5812 + }, + { + "epoch": 2.8842000744694056, + "grad_norm": 0.1280984017417436, + "learning_rate": 9.011144658760413e-07, + "loss": 0.7191, + "step": 5813 + }, + { + "epoch": 2.884696537172645, + "grad_norm": 0.12267985245047328, + "learning_rate": 9.003632544608235e-07, + "loss": 0.6914, + "step": 5814 + }, + { + "epoch": 2.8851929998758843, + "grad_norm": 0.1287583871760695, + "learning_rate": 8.996122875244325e-07, + "loss": 0.728, + "step": 5815 + }, + { + "epoch": 2.8856894625791236, + "grad_norm": 0.12382625301443378, + "learning_rate": 8.98861565181644e-07, + "loss": 0.6676, + "step": 5816 + }, + { + "epoch": 2.886185925282363, + "grad_norm": 0.13281271821429838, + "learning_rate": 8.981110875471929e-07, + "loss": 0.7116, + "step": 5817 + }, + { + "epoch": 2.8866823879856027, + "grad_norm": 0.13692553089465404, + "learning_rate": 8.973608547357768e-07, + "loss": 0.7455, + "step": 5818 + }, + { + "epoch": 2.887178850688842, + "grad_norm": 0.1289222222449465, + "learning_rate": 8.966108668620585e-07, + "loss": 0.7099, + "step": 5819 + }, + { + "epoch": 2.8876753133920814, + "grad_norm": 0.1312399620437457, + "learning_rate": 8.958611240406601e-07, + "loss": 0.7066, + "step": 5820 + }, + { + "epoch": 2.8881717760953207, + "grad_norm": 0.13075456926805842, + "learning_rate": 8.951116263861693e-07, + "loss": 0.6928, + "step": 5821 + }, + { + "epoch": 2.8886682387985605, + "grad_norm": 0.128303038865835, + "learning_rate": 8.943623740131338e-07, + "loss": 0.6977, + "step": 5822 + }, + { + "epoch": 2.8891647015018, + "grad_norm": 0.13374921271432447, + "learning_rate": 8.936133670360644e-07, + "loss": 0.6792, + "step": 5823 + }, + { + "epoch": 2.889661164205039, + "grad_norm": 0.13148186697294473, + "learning_rate": 8.928646055694362e-07, + "loss": 0.7183, + "step": 5824 + }, + { + "epoch": 2.8901576269082785, + "grad_norm": 0.1288745562512283, + "learning_rate": 8.921160897276837e-07, + "loss": 0.707, + "step": 5825 + }, + { + "epoch": 2.890654089611518, + "grad_norm": 0.13355621762460396, + "learning_rate": 8.913678196252071e-07, + "loss": 0.7678, + "step": 5826 + }, + { + "epoch": 2.891150552314757, + "grad_norm": 0.1241939860039103, + "learning_rate": 8.90619795376367e-07, + "loss": 0.6613, + "step": 5827 + }, + { + "epoch": 2.891647015017997, + "grad_norm": 0.13480780396182257, + "learning_rate": 8.898720170954858e-07, + "loss": 0.7461, + "step": 5828 + }, + { + "epoch": 2.8921434777212363, + "grad_norm": 0.13949524462587928, + "learning_rate": 8.891244848968514e-07, + "loss": 0.8129, + "step": 5829 + }, + { + "epoch": 2.8926399404244756, + "grad_norm": 0.13174343677934816, + "learning_rate": 8.883771988947099e-07, + "loss": 0.7369, + "step": 5830 + }, + { + "epoch": 2.893136403127715, + "grad_norm": 0.1329364719930937, + "learning_rate": 8.876301592032749e-07, + "loss": 0.7148, + "step": 5831 + }, + { + "epoch": 2.8936328658309547, + "grad_norm": 0.1272203938665379, + "learning_rate": 8.868833659367163e-07, + "loss": 0.6465, + "step": 5832 + }, + { + "epoch": 2.894129328534194, + "grad_norm": 0.1293924264978683, + "learning_rate": 8.861368192091713e-07, + "loss": 0.6836, + "step": 5833 + }, + { + "epoch": 2.8946257912374334, + "grad_norm": 0.13220400859603937, + "learning_rate": 8.853905191347373e-07, + "loss": 0.7098, + "step": 5834 + }, + { + "epoch": 2.8951222539406727, + "grad_norm": 0.13015024295581795, + "learning_rate": 8.846444658274733e-07, + "loss": 0.6838, + "step": 5835 + }, + { + "epoch": 2.895618716643912, + "grad_norm": 0.13216797055404755, + "learning_rate": 8.838986594014034e-07, + "loss": 0.7106, + "step": 5836 + }, + { + "epoch": 2.8961151793471513, + "grad_norm": 0.1278107756500494, + "learning_rate": 8.831530999705104e-07, + "loss": 0.7313, + "step": 5837 + }, + { + "epoch": 2.896611642050391, + "grad_norm": 0.12908014728449954, + "learning_rate": 8.824077876487424e-07, + "loss": 0.7177, + "step": 5838 + }, + { + "epoch": 2.8971081047536305, + "grad_norm": 0.15225280744669853, + "learning_rate": 8.816627225500082e-07, + "loss": 0.6728, + "step": 5839 + }, + { + "epoch": 2.89760456745687, + "grad_norm": 0.12843463953521542, + "learning_rate": 8.80917904788178e-07, + "loss": 0.6796, + "step": 5840 + }, + { + "epoch": 2.898101030160109, + "grad_norm": 0.125053238809787, + "learning_rate": 8.801733344770869e-07, + "loss": 0.6684, + "step": 5841 + }, + { + "epoch": 2.898597492863349, + "grad_norm": 0.12924011340437813, + "learning_rate": 8.794290117305296e-07, + "loss": 0.6866, + "step": 5842 + }, + { + "epoch": 2.899093955566588, + "grad_norm": 0.12939827655850278, + "learning_rate": 8.786849366622629e-07, + "loss": 0.6713, + "step": 5843 + }, + { + "epoch": 2.8995904182698276, + "grad_norm": 0.1369447430816981, + "learning_rate": 8.77941109386009e-07, + "loss": 0.7056, + "step": 5844 + }, + { + "epoch": 2.900086880973067, + "grad_norm": 0.13371285950295925, + "learning_rate": 8.771975300154478e-07, + "loss": 0.703, + "step": 5845 + }, + { + "epoch": 2.900583343676306, + "grad_norm": 0.13319591109450518, + "learning_rate": 8.764541986642253e-07, + "loss": 0.6762, + "step": 5846 + }, + { + "epoch": 2.9010798063795455, + "grad_norm": 0.12748929289799496, + "learning_rate": 8.757111154459472e-07, + "loss": 0.6883, + "step": 5847 + }, + { + "epoch": 2.9015762690827853, + "grad_norm": 0.13341860100028502, + "learning_rate": 8.749682804741813e-07, + "loss": 0.7439, + "step": 5848 + }, + { + "epoch": 2.9020727317860247, + "grad_norm": 0.13140731624898971, + "learning_rate": 8.742256938624585e-07, + "loss": 0.6919, + "step": 5849 + }, + { + "epoch": 2.902569194489264, + "grad_norm": 0.126423779251477, + "learning_rate": 8.7348335572427e-07, + "loss": 0.6907, + "step": 5850 + }, + { + "epoch": 2.9030656571925033, + "grad_norm": 0.12587303898494598, + "learning_rate": 8.727412661730724e-07, + "loss": 0.6969, + "step": 5851 + }, + { + "epoch": 2.903562119895743, + "grad_norm": 0.12579584782238049, + "learning_rate": 8.719994253222805e-07, + "loss": 0.7108, + "step": 5852 + }, + { + "epoch": 2.9040585825989824, + "grad_norm": 0.13036032819321902, + "learning_rate": 8.712578332852739e-07, + "loss": 0.6665, + "step": 5853 + }, + { + "epoch": 2.9045550453022217, + "grad_norm": 0.12737767537759337, + "learning_rate": 8.70516490175393e-07, + "loss": 0.714, + "step": 5854 + }, + { + "epoch": 2.905051508005461, + "grad_norm": 0.12518922341713093, + "learning_rate": 8.697753961059385e-07, + "loss": 0.6743, + "step": 5855 + }, + { + "epoch": 2.9055479707087004, + "grad_norm": 0.13072186624967902, + "learning_rate": 8.690345511901771e-07, + "loss": 0.7178, + "step": 5856 + }, + { + "epoch": 2.9060444334119397, + "grad_norm": 0.12797230065596052, + "learning_rate": 8.682939555413334e-07, + "loss": 0.6979, + "step": 5857 + }, + { + "epoch": 2.906540896115179, + "grad_norm": 0.13125850276158907, + "learning_rate": 8.675536092725967e-07, + "loss": 0.6864, + "step": 5858 + }, + { + "epoch": 2.907037358818419, + "grad_norm": 0.12641599740685236, + "learning_rate": 8.668135124971166e-07, + "loss": 0.6801, + "step": 5859 + }, + { + "epoch": 2.907533821521658, + "grad_norm": 0.127803193061895, + "learning_rate": 8.660736653280041e-07, + "loss": 0.6713, + "step": 5860 + }, + { + "epoch": 2.9080302842248975, + "grad_norm": 0.13027630370244397, + "learning_rate": 8.653340678783347e-07, + "loss": 0.6756, + "step": 5861 + }, + { + "epoch": 2.9085267469281373, + "grad_norm": 0.12973586850281724, + "learning_rate": 8.645947202611423e-07, + "loss": 0.6992, + "step": 5862 + }, + { + "epoch": 2.9090232096313766, + "grad_norm": 0.1325303303913486, + "learning_rate": 8.638556225894271e-07, + "loss": 0.7145, + "step": 5863 + }, + { + "epoch": 2.909519672334616, + "grad_norm": 0.12990059009224683, + "learning_rate": 8.631167749761443e-07, + "loss": 0.6904, + "step": 5864 + }, + { + "epoch": 2.9100161350378553, + "grad_norm": 0.131277660839979, + "learning_rate": 8.623781775342183e-07, + "loss": 0.7115, + "step": 5865 + }, + { + "epoch": 2.9105125977410946, + "grad_norm": 0.1353236422896994, + "learning_rate": 8.616398303765303e-07, + "loss": 0.7022, + "step": 5866 + }, + { + "epoch": 2.911009060444334, + "grad_norm": 0.13235776611917602, + "learning_rate": 8.609017336159243e-07, + "loss": 0.6841, + "step": 5867 + }, + { + "epoch": 2.9115055231475733, + "grad_norm": 0.1352005618433371, + "learning_rate": 8.601638873652082e-07, + "loss": 0.7363, + "step": 5868 + }, + { + "epoch": 2.912001985850813, + "grad_norm": 0.1259476367266435, + "learning_rate": 8.594262917371485e-07, + "loss": 0.6729, + "step": 5869 + }, + { + "epoch": 2.9124984485540524, + "grad_norm": 0.1260089860773777, + "learning_rate": 8.586889468444761e-07, + "loss": 0.6744, + "step": 5870 + }, + { + "epoch": 2.9129949112572917, + "grad_norm": 0.13405412341583484, + "learning_rate": 8.57951852799882e-07, + "loss": 0.7332, + "step": 5871 + }, + { + "epoch": 2.9134913739605315, + "grad_norm": 0.12637262916741504, + "learning_rate": 8.572150097160179e-07, + "loss": 0.7096, + "step": 5872 + }, + { + "epoch": 2.913987836663771, + "grad_norm": 0.12478537511556445, + "learning_rate": 8.564784177055005e-07, + "loss": 0.6937, + "step": 5873 + }, + { + "epoch": 2.91448429936701, + "grad_norm": 0.13209726484500378, + "learning_rate": 8.557420768809041e-07, + "loss": 0.7309, + "step": 5874 + }, + { + "epoch": 2.9149807620702495, + "grad_norm": 0.13508309020119716, + "learning_rate": 8.550059873547686e-07, + "loss": 0.6698, + "step": 5875 + }, + { + "epoch": 2.915477224773489, + "grad_norm": 0.12122947126692105, + "learning_rate": 8.542701492395924e-07, + "loss": 0.6677, + "step": 5876 + }, + { + "epoch": 2.915973687476728, + "grad_norm": 0.12335372065030592, + "learning_rate": 8.535345626478355e-07, + "loss": 0.6776, + "step": 5877 + }, + { + "epoch": 2.9164701501799675, + "grad_norm": 0.12779589751503279, + "learning_rate": 8.527992276919228e-07, + "loss": 0.6466, + "step": 5878 + }, + { + "epoch": 2.9169666128832072, + "grad_norm": 0.12792448170766138, + "learning_rate": 8.520641444842373e-07, + "loss": 0.6864, + "step": 5879 + }, + { + "epoch": 2.9174630755864466, + "grad_norm": 0.12567621559611838, + "learning_rate": 8.513293131371245e-07, + "loss": 0.7195, + "step": 5880 + }, + { + "epoch": 2.917959538289686, + "grad_norm": 0.12846917163481486, + "learning_rate": 8.50594733762892e-07, + "loss": 0.7167, + "step": 5881 + }, + { + "epoch": 2.9184560009929257, + "grad_norm": 0.13195032168838147, + "learning_rate": 8.498604064738072e-07, + "loss": 0.7381, + "step": 5882 + }, + { + "epoch": 2.918952463696165, + "grad_norm": 0.13038989198058165, + "learning_rate": 8.491263313821021e-07, + "loss": 0.6773, + "step": 5883 + }, + { + "epoch": 2.9194489263994043, + "grad_norm": 0.12624725579988283, + "learning_rate": 8.483925085999667e-07, + "loss": 0.6843, + "step": 5884 + }, + { + "epoch": 2.9199453891026437, + "grad_norm": 0.1289543904324568, + "learning_rate": 8.476589382395558e-07, + "loss": 0.7618, + "step": 5885 + }, + { + "epoch": 2.920441851805883, + "grad_norm": 0.12430274931985501, + "learning_rate": 8.469256204129827e-07, + "loss": 0.6734, + "step": 5886 + }, + { + "epoch": 2.9209383145091223, + "grad_norm": 0.13183276150434192, + "learning_rate": 8.461925552323231e-07, + "loss": 0.6935, + "step": 5887 + }, + { + "epoch": 2.9214347772123617, + "grad_norm": 0.12471273296702598, + "learning_rate": 8.45459742809615e-07, + "loss": 0.6639, + "step": 5888 + }, + { + "epoch": 2.9219312399156014, + "grad_norm": 0.13312244162746337, + "learning_rate": 8.44727183256856e-07, + "loss": 0.6958, + "step": 5889 + }, + { + "epoch": 2.9224277026188408, + "grad_norm": 0.12804074982988345, + "learning_rate": 8.439948766860076e-07, + "loss": 0.7105, + "step": 5890 + }, + { + "epoch": 2.92292416532208, + "grad_norm": 0.13266023301103108, + "learning_rate": 8.432628232089904e-07, + "loss": 0.7281, + "step": 5891 + }, + { + "epoch": 2.9234206280253194, + "grad_norm": 0.12978177617631043, + "learning_rate": 8.42531022937686e-07, + "loss": 0.6977, + "step": 5892 + }, + { + "epoch": 2.923917090728559, + "grad_norm": 0.1309078003351543, + "learning_rate": 8.417994759839401e-07, + "loss": 0.6945, + "step": 5893 + }, + { + "epoch": 2.9244135534317985, + "grad_norm": 0.12245231712292412, + "learning_rate": 8.41068182459556e-07, + "loss": 0.6611, + "step": 5894 + }, + { + "epoch": 2.924910016135038, + "grad_norm": 0.1281010559347382, + "learning_rate": 8.403371424763032e-07, + "loss": 0.6796, + "step": 5895 + }, + { + "epoch": 2.925406478838277, + "grad_norm": 0.13285716334194994, + "learning_rate": 8.396063561459058e-07, + "loss": 0.7713, + "step": 5896 + }, + { + "epoch": 2.9259029415415165, + "grad_norm": 0.12525104544597734, + "learning_rate": 8.388758235800551e-07, + "loss": 0.669, + "step": 5897 + }, + { + "epoch": 2.926399404244756, + "grad_norm": 0.12692092156468257, + "learning_rate": 8.381455448904008e-07, + "loss": 0.716, + "step": 5898 + }, + { + "epoch": 2.9268958669479956, + "grad_norm": 0.12577897045335742, + "learning_rate": 8.374155201885533e-07, + "loss": 0.7094, + "step": 5899 + }, + { + "epoch": 2.927392329651235, + "grad_norm": 0.1270829254888127, + "learning_rate": 8.366857495860869e-07, + "loss": 0.7035, + "step": 5900 + }, + { + "epoch": 2.9278887923544743, + "grad_norm": 0.134187041399689, + "learning_rate": 8.359562331945337e-07, + "loss": 0.7231, + "step": 5901 + }, + { + "epoch": 2.9283852550577136, + "grad_norm": 0.13133787410693823, + "learning_rate": 8.352269711253902e-07, + "loss": 0.7019, + "step": 5902 + }, + { + "epoch": 2.9288817177609534, + "grad_norm": 0.13216722658500193, + "learning_rate": 8.344979634901115e-07, + "loss": 0.7305, + "step": 5903 + }, + { + "epoch": 2.9293781804641927, + "grad_norm": 0.12991923696083893, + "learning_rate": 8.337692104001138e-07, + "loss": 0.72, + "step": 5904 + }, + { + "epoch": 2.929874643167432, + "grad_norm": 0.13021761831749384, + "learning_rate": 8.330407119667775e-07, + "loss": 0.7079, + "step": 5905 + }, + { + "epoch": 2.9303711058706714, + "grad_norm": 0.12848310312169373, + "learning_rate": 8.323124683014394e-07, + "loss": 0.7022, + "step": 5906 + }, + { + "epoch": 2.9308675685739107, + "grad_norm": 0.12155773897471595, + "learning_rate": 8.315844795154024e-07, + "loss": 0.7003, + "step": 5907 + }, + { + "epoch": 2.93136403127715, + "grad_norm": 0.12783767747738128, + "learning_rate": 8.308567457199265e-07, + "loss": 0.6956, + "step": 5908 + }, + { + "epoch": 2.93186049398039, + "grad_norm": 0.12517915661300882, + "learning_rate": 8.301292670262334e-07, + "loss": 0.6805, + "step": 5909 + }, + { + "epoch": 2.932356956683629, + "grad_norm": 0.12818781769460139, + "learning_rate": 8.294020435455085e-07, + "loss": 0.6837, + "step": 5910 + }, + { + "epoch": 2.9328534193868685, + "grad_norm": 0.12740869036254676, + "learning_rate": 8.286750753888953e-07, + "loss": 0.7211, + "step": 5911 + }, + { + "epoch": 2.933349882090108, + "grad_norm": 0.12327113825475457, + "learning_rate": 8.279483626674992e-07, + "loss": 0.6925, + "step": 5912 + }, + { + "epoch": 2.9338463447933476, + "grad_norm": 0.12387658865943076, + "learning_rate": 8.272219054923855e-07, + "loss": 0.6407, + "step": 5913 + }, + { + "epoch": 2.934342807496587, + "grad_norm": 0.12861942462803927, + "learning_rate": 8.264957039745835e-07, + "loss": 0.6996, + "step": 5914 + }, + { + "epoch": 2.9348392701998263, + "grad_norm": 0.12836506417919924, + "learning_rate": 8.257697582250807e-07, + "loss": 0.7062, + "step": 5915 + }, + { + "epoch": 2.9353357329030656, + "grad_norm": 0.13217483481431683, + "learning_rate": 8.250440683548253e-07, + "loss": 0.7346, + "step": 5916 + }, + { + "epoch": 2.935832195606305, + "grad_norm": 0.12771709009606644, + "learning_rate": 8.243186344747286e-07, + "loss": 0.7331, + "step": 5917 + }, + { + "epoch": 2.9363286583095443, + "grad_norm": 0.12511348202532716, + "learning_rate": 8.235934566956616e-07, + "loss": 0.6774, + "step": 5918 + }, + { + "epoch": 2.936825121012784, + "grad_norm": 0.1304719690078209, + "learning_rate": 8.228685351284547e-07, + "loss": 0.7227, + "step": 5919 + }, + { + "epoch": 2.9373215837160234, + "grad_norm": 0.12839066350051087, + "learning_rate": 8.221438698839021e-07, + "loss": 0.6943, + "step": 5920 + }, + { + "epoch": 2.9378180464192627, + "grad_norm": 0.12944145552846098, + "learning_rate": 8.214194610727557e-07, + "loss": 0.7047, + "step": 5921 + }, + { + "epoch": 2.938314509122502, + "grad_norm": 0.12851146335009603, + "learning_rate": 8.206953088057318e-07, + "loss": 0.7166, + "step": 5922 + }, + { + "epoch": 2.938810971825742, + "grad_norm": 0.13209306286470085, + "learning_rate": 8.199714131935041e-07, + "loss": 0.676, + "step": 5923 + }, + { + "epoch": 2.939307434528981, + "grad_norm": 0.13328728796070516, + "learning_rate": 8.192477743467078e-07, + "loss": 0.6941, + "step": 5924 + }, + { + "epoch": 2.9398038972322205, + "grad_norm": 0.12676782708503537, + "learning_rate": 8.185243923759414e-07, + "loss": 0.716, + "step": 5925 + }, + { + "epoch": 2.94030035993546, + "grad_norm": 0.12845269590237002, + "learning_rate": 8.178012673917604e-07, + "loss": 0.6856, + "step": 5926 + }, + { + "epoch": 2.940796822638699, + "grad_norm": 0.12791983707358917, + "learning_rate": 8.170783995046852e-07, + "loss": 0.7007, + "step": 5927 + }, + { + "epoch": 2.9412932853419385, + "grad_norm": 0.12653344264597674, + "learning_rate": 8.163557888251916e-07, + "loss": 0.6654, + "step": 5928 + }, + { + "epoch": 2.9417897480451782, + "grad_norm": 0.1311894343517124, + "learning_rate": 8.15633435463721e-07, + "loss": 0.6941, + "step": 5929 + }, + { + "epoch": 2.9422862107484176, + "grad_norm": 0.13045379771212226, + "learning_rate": 8.149113395306732e-07, + "loss": 0.7029, + "step": 5930 + }, + { + "epoch": 2.942782673451657, + "grad_norm": 0.1310448907057462, + "learning_rate": 8.141895011364082e-07, + "loss": 0.6971, + "step": 5931 + }, + { + "epoch": 2.9432791361548962, + "grad_norm": 0.12684756193510152, + "learning_rate": 8.134679203912485e-07, + "loss": 0.6932, + "step": 5932 + }, + { + "epoch": 2.943775598858136, + "grad_norm": 0.13624799129435985, + "learning_rate": 8.127465974054749e-07, + "loss": 0.7143, + "step": 5933 + }, + { + "epoch": 2.9442720615613753, + "grad_norm": 0.13275208970745384, + "learning_rate": 8.120255322893319e-07, + "loss": 0.7652, + "step": 5934 + }, + { + "epoch": 2.9447685242646147, + "grad_norm": 0.13231133707193993, + "learning_rate": 8.113047251530215e-07, + "loss": 0.7276, + "step": 5935 + }, + { + "epoch": 2.945264986967854, + "grad_norm": 0.12564669641658913, + "learning_rate": 8.105841761067068e-07, + "loss": 0.6761, + "step": 5936 + }, + { + "epoch": 2.9457614496710933, + "grad_norm": 0.12956694166664687, + "learning_rate": 8.098638852605139e-07, + "loss": 0.6975, + "step": 5937 + }, + { + "epoch": 2.9462579123743327, + "grad_norm": 0.12673479579130228, + "learning_rate": 8.091438527245263e-07, + "loss": 0.659, + "step": 5938 + }, + { + "epoch": 2.9467543750775724, + "grad_norm": 0.12543122612508772, + "learning_rate": 8.084240786087907e-07, + "loss": 0.7008, + "step": 5939 + }, + { + "epoch": 2.9472508377808118, + "grad_norm": 0.13367896037829807, + "learning_rate": 8.077045630233121e-07, + "loss": 0.7256, + "step": 5940 + }, + { + "epoch": 2.947747300484051, + "grad_norm": 0.13359993578888146, + "learning_rate": 8.069853060780566e-07, + "loss": 0.6878, + "step": 5941 + }, + { + "epoch": 2.9482437631872904, + "grad_norm": 0.12701732001687444, + "learning_rate": 8.062663078829524e-07, + "loss": 0.7399, + "step": 5942 + }, + { + "epoch": 2.94874022589053, + "grad_norm": 0.12747058578699347, + "learning_rate": 8.055475685478861e-07, + "loss": 0.6999, + "step": 5943 + }, + { + "epoch": 2.9492366885937695, + "grad_norm": 0.12657957164210343, + "learning_rate": 8.048290881827053e-07, + "loss": 0.7354, + "step": 5944 + }, + { + "epoch": 2.949733151297009, + "grad_norm": 0.12534187082541948, + "learning_rate": 8.041108668972178e-07, + "loss": 0.6951, + "step": 5945 + }, + { + "epoch": 2.950229614000248, + "grad_norm": 0.12398448643069547, + "learning_rate": 8.033929048011934e-07, + "loss": 0.6824, + "step": 5946 + }, + { + "epoch": 2.9507260767034875, + "grad_norm": 0.13140028368522533, + "learning_rate": 8.026752020043604e-07, + "loss": 0.6993, + "step": 5947 + }, + { + "epoch": 2.951222539406727, + "grad_norm": 0.1339567215216009, + "learning_rate": 8.019577586164077e-07, + "loss": 0.6883, + "step": 5948 + }, + { + "epoch": 2.9517190021099666, + "grad_norm": 0.13254826072565667, + "learning_rate": 8.012405747469861e-07, + "loss": 0.7031, + "step": 5949 + }, + { + "epoch": 2.952215464813206, + "grad_norm": 0.13203437830444636, + "learning_rate": 8.005236505057045e-07, + "loss": 0.6924, + "step": 5950 + }, + { + "epoch": 2.9527119275164453, + "grad_norm": 0.13328318102042208, + "learning_rate": 7.998069860021346e-07, + "loss": 0.735, + "step": 5951 + }, + { + "epoch": 2.9532083902196846, + "grad_norm": 0.12989959931327805, + "learning_rate": 7.990905813458063e-07, + "loss": 0.7524, + "step": 5952 + }, + { + "epoch": 2.9537048529229244, + "grad_norm": 0.12932924465141693, + "learning_rate": 7.983744366462101e-07, + "loss": 0.7075, + "step": 5953 + }, + { + "epoch": 2.9542013156261637, + "grad_norm": 0.1270536630172262, + "learning_rate": 7.976585520127982e-07, + "loss": 0.6864, + "step": 5954 + }, + { + "epoch": 2.954697778329403, + "grad_norm": 0.13045118616961768, + "learning_rate": 7.969429275549819e-07, + "loss": 0.6788, + "step": 5955 + }, + { + "epoch": 2.9551942410326424, + "grad_norm": 0.1273750155772813, + "learning_rate": 7.962275633821321e-07, + "loss": 0.7083, + "step": 5956 + }, + { + "epoch": 2.9556907037358817, + "grad_norm": 0.129306537947579, + "learning_rate": 7.955124596035818e-07, + "loss": 0.7025, + "step": 5957 + }, + { + "epoch": 2.956187166439121, + "grad_norm": 0.13380324003505628, + "learning_rate": 7.947976163286219e-07, + "loss": 0.7551, + "step": 5958 + }, + { + "epoch": 2.956683629142361, + "grad_norm": 0.1316194370579435, + "learning_rate": 7.940830336665071e-07, + "loss": 0.7246, + "step": 5959 + }, + { + "epoch": 2.9571800918456, + "grad_norm": 0.12465311080275596, + "learning_rate": 7.933687117264469e-07, + "loss": 0.6732, + "step": 5960 + }, + { + "epoch": 2.9576765545488395, + "grad_norm": 0.1308457845292269, + "learning_rate": 7.926546506176158e-07, + "loss": 0.7373, + "step": 5961 + }, + { + "epoch": 2.958173017252079, + "grad_norm": 0.13252382874716395, + "learning_rate": 7.919408504491464e-07, + "loss": 0.7194, + "step": 5962 + }, + { + "epoch": 2.9586694799553186, + "grad_norm": 0.12466359487318711, + "learning_rate": 7.912273113301306e-07, + "loss": 0.6609, + "step": 5963 + }, + { + "epoch": 2.959165942658558, + "grad_norm": 0.13237910505477793, + "learning_rate": 7.905140333696229e-07, + "loss": 0.7391, + "step": 5964 + }, + { + "epoch": 2.9596624053617973, + "grad_norm": 0.13205570481341808, + "learning_rate": 7.898010166766348e-07, + "loss": 0.7033, + "step": 5965 + }, + { + "epoch": 2.9601588680650366, + "grad_norm": 0.12754936984318843, + "learning_rate": 7.890882613601411e-07, + "loss": 0.7007, + "step": 5966 + }, + { + "epoch": 2.960655330768276, + "grad_norm": 0.12850522933784753, + "learning_rate": 7.883757675290746e-07, + "loss": 0.7058, + "step": 5967 + }, + { + "epoch": 2.9611517934715152, + "grad_norm": 0.12454681696499198, + "learning_rate": 7.87663535292327e-07, + "loss": 0.6232, + "step": 5968 + }, + { + "epoch": 2.961648256174755, + "grad_norm": 0.12513438738198798, + "learning_rate": 7.869515647587539e-07, + "loss": 0.6939, + "step": 5969 + }, + { + "epoch": 2.9621447188779944, + "grad_norm": 0.12503092799347673, + "learning_rate": 7.862398560371665e-07, + "loss": 0.6937, + "step": 5970 + }, + { + "epoch": 2.9626411815812337, + "grad_norm": 0.1313520263634869, + "learning_rate": 7.855284092363399e-07, + "loss": 0.7132, + "step": 5971 + }, + { + "epoch": 2.963137644284473, + "grad_norm": 0.1324830049247509, + "learning_rate": 7.848172244650065e-07, + "loss": 0.708, + "step": 5972 + }, + { + "epoch": 2.963634106987713, + "grad_norm": 0.13238004154599406, + "learning_rate": 7.841063018318587e-07, + "loss": 0.7363, + "step": 5973 + }, + { + "epoch": 2.964130569690952, + "grad_norm": 0.12851537121562095, + "learning_rate": 7.833956414455512e-07, + "loss": 0.7088, + "step": 5974 + }, + { + "epoch": 2.9646270323941915, + "grad_norm": 0.12750870312778442, + "learning_rate": 7.826852434146964e-07, + "loss": 0.6739, + "step": 5975 + }, + { + "epoch": 2.965123495097431, + "grad_norm": 0.13032488783779714, + "learning_rate": 7.819751078478669e-07, + "loss": 0.6985, + "step": 5976 + }, + { + "epoch": 2.96561995780067, + "grad_norm": 0.1280481984781406, + "learning_rate": 7.812652348535948e-07, + "loss": 0.7029, + "step": 5977 + }, + { + "epoch": 2.9661164205039094, + "grad_norm": 0.12481852362878608, + "learning_rate": 7.805556245403748e-07, + "loss": 0.6592, + "step": 5978 + }, + { + "epoch": 2.966612883207149, + "grad_norm": 0.12694865638365466, + "learning_rate": 7.798462770166585e-07, + "loss": 0.6451, + "step": 5979 + }, + { + "epoch": 2.9671093459103886, + "grad_norm": 0.12968124976820739, + "learning_rate": 7.791371923908569e-07, + "loss": 0.7499, + "step": 5980 + }, + { + "epoch": 2.967605808613628, + "grad_norm": 0.12879024482404638, + "learning_rate": 7.784283707713447e-07, + "loss": 0.7268, + "step": 5981 + }, + { + "epoch": 2.968102271316867, + "grad_norm": 0.13142308258429045, + "learning_rate": 7.77719812266452e-07, + "loss": 0.7254, + "step": 5982 + }, + { + "epoch": 2.968598734020107, + "grad_norm": 0.12715116614343377, + "learning_rate": 7.770115169844719e-07, + "loss": 0.6715, + "step": 5983 + }, + { + "epoch": 2.9690951967233463, + "grad_norm": 0.12867668360826426, + "learning_rate": 7.763034850336554e-07, + "loss": 0.7037, + "step": 5984 + }, + { + "epoch": 2.9695916594265857, + "grad_norm": 0.13174571937124394, + "learning_rate": 7.75595716522213e-07, + "loss": 0.7087, + "step": 5985 + }, + { + "epoch": 2.970088122129825, + "grad_norm": 0.12868392659804886, + "learning_rate": 7.748882115583175e-07, + "loss": 0.7026, + "step": 5986 + }, + { + "epoch": 2.9705845848330643, + "grad_norm": 0.1304279274224821, + "learning_rate": 7.741809702500983e-07, + "loss": 0.68, + "step": 5987 + }, + { + "epoch": 2.9710810475363036, + "grad_norm": 0.13024362648385276, + "learning_rate": 7.734739927056467e-07, + "loss": 0.7152, + "step": 5988 + }, + { + "epoch": 2.9715775102395434, + "grad_norm": 0.13003977622755503, + "learning_rate": 7.727672790330129e-07, + "loss": 0.6805, + "step": 5989 + }, + { + "epoch": 2.9720739729427827, + "grad_norm": 0.1307381982553933, + "learning_rate": 7.720608293402055e-07, + "loss": 0.7367, + "step": 5990 + }, + { + "epoch": 2.972570435646022, + "grad_norm": 0.13211408639458272, + "learning_rate": 7.713546437351965e-07, + "loss": 0.7141, + "step": 5991 + }, + { + "epoch": 2.9730668983492614, + "grad_norm": 0.1247955923407189, + "learning_rate": 7.706487223259121e-07, + "loss": 0.6826, + "step": 5992 + }, + { + "epoch": 2.973563361052501, + "grad_norm": 0.13116304767044074, + "learning_rate": 7.69943065220243e-07, + "loss": 0.7277, + "step": 5993 + }, + { + "epoch": 2.9740598237557405, + "grad_norm": 0.13479680364255306, + "learning_rate": 7.692376725260369e-07, + "loss": 0.6846, + "step": 5994 + }, + { + "epoch": 2.97455628645898, + "grad_norm": 0.12782009680839385, + "learning_rate": 7.685325443511015e-07, + "loss": 0.6846, + "step": 5995 + }, + { + "epoch": 2.975052749162219, + "grad_norm": 0.1336362278759992, + "learning_rate": 7.678276808032054e-07, + "loss": 0.7207, + "step": 5996 + }, + { + "epoch": 2.9755492118654585, + "grad_norm": 0.12954084216492223, + "learning_rate": 7.671230819900741e-07, + "loss": 0.722, + "step": 5997 + }, + { + "epoch": 2.976045674568698, + "grad_norm": 0.12870683674641, + "learning_rate": 7.66418748019396e-07, + "loss": 0.6928, + "step": 5998 + }, + { + "epoch": 2.976542137271937, + "grad_norm": 0.1283984345940394, + "learning_rate": 7.657146789988165e-07, + "loss": 0.6856, + "step": 5999 + }, + { + "epoch": 2.977038599975177, + "grad_norm": 0.12793776246424288, + "learning_rate": 7.650108750359403e-07, + "loss": 0.6919, + "step": 6000 + }, + { + "epoch": 2.9775350626784163, + "grad_norm": 0.12577650631266427, + "learning_rate": 7.643073362383341e-07, + "loss": 0.6853, + "step": 6001 + }, + { + "epoch": 2.9780315253816556, + "grad_norm": 0.12392465457564755, + "learning_rate": 7.636040627135211e-07, + "loss": 0.722, + "step": 6002 + }, + { + "epoch": 2.9785279880848954, + "grad_norm": 0.12418251160738356, + "learning_rate": 7.629010545689869e-07, + "loss": 0.7056, + "step": 6003 + }, + { + "epoch": 2.9790244507881347, + "grad_norm": 0.1271367787777443, + "learning_rate": 7.621983119121742e-07, + "loss": 0.6494, + "step": 6004 + }, + { + "epoch": 2.979520913491374, + "grad_norm": 0.12870235470062621, + "learning_rate": 7.614958348504853e-07, + "loss": 0.7274, + "step": 6005 + }, + { + "epoch": 2.9800173761946134, + "grad_norm": 0.12786870456842656, + "learning_rate": 7.607936234912841e-07, + "loss": 0.6715, + "step": 6006 + }, + { + "epoch": 2.9805138388978527, + "grad_norm": 0.12959870966522002, + "learning_rate": 7.600916779418916e-07, + "loss": 0.7442, + "step": 6007 + }, + { + "epoch": 2.981010301601092, + "grad_norm": 0.12795538478138033, + "learning_rate": 7.593899983095884e-07, + "loss": 0.6919, + "step": 6008 + }, + { + "epoch": 2.9815067643043314, + "grad_norm": 0.14008517233459958, + "learning_rate": 7.586885847016148e-07, + "loss": 0.7013, + "step": 6009 + }, + { + "epoch": 2.982003227007571, + "grad_norm": 0.1273352199473159, + "learning_rate": 7.579874372251722e-07, + "loss": 0.7015, + "step": 6010 + }, + { + "epoch": 2.9824996897108105, + "grad_norm": 0.12663695622291235, + "learning_rate": 7.572865559874188e-07, + "loss": 0.7045, + "step": 6011 + }, + { + "epoch": 2.98299615241405, + "grad_norm": 0.1309777399910531, + "learning_rate": 7.565859410954718e-07, + "loss": 0.7366, + "step": 6012 + }, + { + "epoch": 2.9834926151172896, + "grad_norm": 0.1281630321138167, + "learning_rate": 7.558855926564112e-07, + "loss": 0.7116, + "step": 6013 + }, + { + "epoch": 2.983989077820529, + "grad_norm": 0.1280931136947841, + "learning_rate": 7.551855107772724e-07, + "loss": 0.7073, + "step": 6014 + }, + { + "epoch": 2.9844855405237682, + "grad_norm": 0.12459877040568573, + "learning_rate": 7.544856955650532e-07, + "loss": 0.6964, + "step": 6015 + }, + { + "epoch": 2.9849820032270076, + "grad_norm": 0.12667593238205743, + "learning_rate": 7.537861471267077e-07, + "loss": 0.6639, + "step": 6016 + }, + { + "epoch": 2.985478465930247, + "grad_norm": 0.12902244841475374, + "learning_rate": 7.530868655691509e-07, + "loss": 0.7196, + "step": 6017 + }, + { + "epoch": 2.9859749286334862, + "grad_norm": 0.12779617048573458, + "learning_rate": 7.523878509992578e-07, + "loss": 0.6979, + "step": 6018 + }, + { + "epoch": 2.9864713913367256, + "grad_norm": 0.12907365853546054, + "learning_rate": 7.516891035238596e-07, + "loss": 0.682, + "step": 6019 + }, + { + "epoch": 2.9869678540399653, + "grad_norm": 0.12929753241306882, + "learning_rate": 7.509906232497513e-07, + "loss": 0.7007, + "step": 6020 + }, + { + "epoch": 2.9874643167432047, + "grad_norm": 0.12701020359185758, + "learning_rate": 7.502924102836826e-07, + "loss": 0.6893, + "step": 6021 + }, + { + "epoch": 2.987960779446444, + "grad_norm": 0.1237167641591724, + "learning_rate": 7.495944647323639e-07, + "loss": 0.709, + "step": 6022 + }, + { + "epoch": 2.988457242149684, + "grad_norm": 0.13193601872797872, + "learning_rate": 7.488967867024671e-07, + "loss": 0.7199, + "step": 6023 + }, + { + "epoch": 2.988953704852923, + "grad_norm": 0.12998564461199474, + "learning_rate": 7.481993763006184e-07, + "loss": 0.6595, + "step": 6024 + }, + { + "epoch": 2.9894501675561624, + "grad_norm": 0.13111171647856992, + "learning_rate": 7.475022336334075e-07, + "loss": 0.6868, + "step": 6025 + }, + { + "epoch": 2.9899466302594018, + "grad_norm": 0.12835689645376358, + "learning_rate": 7.468053588073803e-07, + "loss": 0.7071, + "step": 6026 + }, + { + "epoch": 2.990443092962641, + "grad_norm": 0.1259901601242702, + "learning_rate": 7.461087519290447e-07, + "loss": 0.7059, + "step": 6027 + }, + { + "epoch": 2.9909395556658804, + "grad_norm": 0.12268405384296684, + "learning_rate": 7.454124131048646e-07, + "loss": 0.6555, + "step": 6028 + }, + { + "epoch": 2.9914360183691198, + "grad_norm": 0.13427530622463077, + "learning_rate": 7.447163424412638e-07, + "loss": 0.7329, + "step": 6029 + }, + { + "epoch": 2.9919324810723595, + "grad_norm": 0.12630933697427535, + "learning_rate": 7.440205400446271e-07, + "loss": 0.7285, + "step": 6030 + }, + { + "epoch": 2.992428943775599, + "grad_norm": 0.1282172265353785, + "learning_rate": 7.433250060212957e-07, + "loss": 0.6894, + "step": 6031 + }, + { + "epoch": 2.992925406478838, + "grad_norm": 0.1242625795254969, + "learning_rate": 7.426297404775701e-07, + "loss": 0.6954, + "step": 6032 + }, + { + "epoch": 2.9934218691820775, + "grad_norm": 0.12573869185290845, + "learning_rate": 7.419347435197125e-07, + "loss": 0.6515, + "step": 6033 + }, + { + "epoch": 2.9939183318853173, + "grad_norm": 0.12632264327088447, + "learning_rate": 7.412400152539398e-07, + "loss": 0.6976, + "step": 6034 + }, + { + "epoch": 2.9944147945885566, + "grad_norm": 0.12492304063826659, + "learning_rate": 7.405455557864322e-07, + "loss": 0.6691, + "step": 6035 + }, + { + "epoch": 2.994911257291796, + "grad_norm": 0.12637656925192833, + "learning_rate": 7.398513652233255e-07, + "loss": 0.7189, + "step": 6036 + }, + { + "epoch": 2.9954077199950353, + "grad_norm": 0.12460961740826068, + "learning_rate": 7.391574436707153e-07, + "loss": 0.6838, + "step": 6037 + }, + { + "epoch": 2.9959041826982746, + "grad_norm": 0.13406612342755167, + "learning_rate": 7.384637912346573e-07, + "loss": 0.7276, + "step": 6038 + }, + { + "epoch": 2.996400645401514, + "grad_norm": 0.12791763802640216, + "learning_rate": 7.377704080211651e-07, + "loss": 0.7028, + "step": 6039 + }, + { + "epoch": 2.9968971081047537, + "grad_norm": 0.12705616081597926, + "learning_rate": 7.370772941362106e-07, + "loss": 0.7045, + "step": 6040 + }, + { + "epoch": 2.997393570807993, + "grad_norm": 0.13088558569032857, + "learning_rate": 7.363844496857247e-07, + "loss": 0.6877, + "step": 6041 + }, + { + "epoch": 2.9978900335112324, + "grad_norm": 0.12444783921112863, + "learning_rate": 7.356918747755989e-07, + "loss": 0.688, + "step": 6042 + } + ], + "logging_steps": 1, + "max_steps": 8056, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 2014, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1358095334768640.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}