{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9986348122866895, "eval_steps": 500, "global_step": 366, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005460750853242321, "grad_norm": 3.971062381322324, "learning_rate": 2.702702702702703e-07, "loss": 0.5016, "step": 1 }, { "epoch": 0.010921501706484642, "grad_norm": 4.14826323264813, "learning_rate": 5.405405405405406e-07, "loss": 0.5208, "step": 2 }, { "epoch": 0.016382252559726963, "grad_norm": 3.745507693250097, "learning_rate": 8.108108108108109e-07, "loss": 0.4379, "step": 3 }, { "epoch": 0.021843003412969283, "grad_norm": 3.8553920899968372, "learning_rate": 1.0810810810810812e-06, "loss": 0.4602, "step": 4 }, { "epoch": 0.027303754266211604, "grad_norm": 4.006658163193665, "learning_rate": 1.3513513513513515e-06, "loss": 0.4578, "step": 5 }, { "epoch": 0.032764505119453925, "grad_norm": 3.0693234037079433, "learning_rate": 1.6216216216216219e-06, "loss": 0.4416, "step": 6 }, { "epoch": 0.03822525597269624, "grad_norm": 2.3221220798320386, "learning_rate": 1.8918918918918922e-06, "loss": 0.4635, "step": 7 }, { "epoch": 0.04368600682593857, "grad_norm": 1.9622941289180669, "learning_rate": 2.1621621621621623e-06, "loss": 0.4471, "step": 8 }, { "epoch": 0.049146757679180884, "grad_norm": 1.5785046223376498, "learning_rate": 2.432432432432433e-06, "loss": 0.3669, "step": 9 }, { "epoch": 0.05460750853242321, "grad_norm": 1.592167130140165, "learning_rate": 2.702702702702703e-06, "loss": 0.4232, "step": 10 }, { "epoch": 0.060068259385665526, "grad_norm": 1.4756853657479083, "learning_rate": 2.9729729729729736e-06, "loss": 0.4105, "step": 11 }, { "epoch": 0.06552901023890785, "grad_norm": 1.327708040902143, "learning_rate": 3.2432432432432437e-06, "loss": 0.386, "step": 12 }, { "epoch": 0.07098976109215017, "grad_norm": 1.5458703208762807, "learning_rate": 3.513513513513514e-06, "loss": 0.405, "step": 13 }, { "epoch": 0.07645051194539249, "grad_norm": 1.6200361209703527, "learning_rate": 3.7837837837837844e-06, "loss": 0.3908, "step": 14 }, { "epoch": 0.08191126279863481, "grad_norm": 1.5715327605819764, "learning_rate": 4.0540540540540545e-06, "loss": 0.3972, "step": 15 }, { "epoch": 0.08737201365187713, "grad_norm": 1.2301811554389595, "learning_rate": 4.324324324324325e-06, "loss": 0.3739, "step": 16 }, { "epoch": 0.09283276450511946, "grad_norm": 1.0413606131616007, "learning_rate": 4.594594594594596e-06, "loss": 0.3521, "step": 17 }, { "epoch": 0.09829351535836177, "grad_norm": 1.1239884207253636, "learning_rate": 4.864864864864866e-06, "loss": 0.4348, "step": 18 }, { "epoch": 0.1037542662116041, "grad_norm": 1.1123432515923368, "learning_rate": 5.135135135135135e-06, "loss": 0.3949, "step": 19 }, { "epoch": 0.10921501706484642, "grad_norm": 1.0996938196641266, "learning_rate": 5.405405405405406e-06, "loss": 0.3775, "step": 20 }, { "epoch": 0.11467576791808874, "grad_norm": 1.0868866085505373, "learning_rate": 5.675675675675676e-06, "loss": 0.3721, "step": 21 }, { "epoch": 0.12013651877133105, "grad_norm": 1.10559810934531, "learning_rate": 5.945945945945947e-06, "loss": 0.3884, "step": 22 }, { "epoch": 0.12559726962457338, "grad_norm": 1.0187684787484814, "learning_rate": 6.2162162162162164e-06, "loss": 0.394, "step": 23 }, { "epoch": 0.1310580204778157, "grad_norm": 0.9515401547070604, "learning_rate": 6.486486486486487e-06, "loss": 0.3664, "step": 24 }, { "epoch": 0.13651877133105803, "grad_norm": 0.9873275768348283, "learning_rate": 6.7567567567567575e-06, "loss": 0.3872, "step": 25 }, { "epoch": 0.14197952218430035, "grad_norm": 0.9420302821261468, "learning_rate": 7.027027027027028e-06, "loss": 0.3926, "step": 26 }, { "epoch": 0.14744027303754267, "grad_norm": 0.8628951646680264, "learning_rate": 7.297297297297298e-06, "loss": 0.3395, "step": 27 }, { "epoch": 0.15290102389078497, "grad_norm": 0.8883050014456254, "learning_rate": 7.567567567567569e-06, "loss": 0.3692, "step": 28 }, { "epoch": 0.1583617747440273, "grad_norm": 0.9314104245334247, "learning_rate": 7.837837837837838e-06, "loss": 0.3562, "step": 29 }, { "epoch": 0.16382252559726962, "grad_norm": 0.8388999546883599, "learning_rate": 8.108108108108109e-06, "loss": 0.3291, "step": 30 }, { "epoch": 0.16928327645051194, "grad_norm": 0.9110394289660935, "learning_rate": 8.378378378378378e-06, "loss": 0.3761, "step": 31 }, { "epoch": 0.17474402730375427, "grad_norm": 0.8529619793059433, "learning_rate": 8.64864864864865e-06, "loss": 0.3634, "step": 32 }, { "epoch": 0.1802047781569966, "grad_norm": 1.034615680095172, "learning_rate": 8.91891891891892e-06, "loss": 0.4073, "step": 33 }, { "epoch": 0.18566552901023892, "grad_norm": 0.9654399340446536, "learning_rate": 9.189189189189191e-06, "loss": 0.3832, "step": 34 }, { "epoch": 0.19112627986348124, "grad_norm": 0.8266008406999349, "learning_rate": 9.45945945945946e-06, "loss": 0.366, "step": 35 }, { "epoch": 0.19658703071672354, "grad_norm": 1.0298041047732736, "learning_rate": 9.729729729729732e-06, "loss": 0.3828, "step": 36 }, { "epoch": 0.20204778156996586, "grad_norm": 1.0253984164765952, "learning_rate": 1e-05, "loss": 0.4253, "step": 37 }, { "epoch": 0.2075085324232082, "grad_norm": 0.845565254392157, "learning_rate": 9.999772047343259e-06, "loss": 0.3426, "step": 38 }, { "epoch": 0.2129692832764505, "grad_norm": 0.9194984294487474, "learning_rate": 9.999088210158001e-06, "loss": 0.343, "step": 39 }, { "epoch": 0.21843003412969283, "grad_norm": 0.8148322661922577, "learning_rate": 9.997948550797227e-06, "loss": 0.325, "step": 40 }, { "epoch": 0.22389078498293516, "grad_norm": 0.9623977105015672, "learning_rate": 9.99635317317629e-06, "loss": 0.385, "step": 41 }, { "epoch": 0.22935153583617748, "grad_norm": 0.814300333357098, "learning_rate": 9.994302222763415e-06, "loss": 0.3462, "step": 42 }, { "epoch": 0.2348122866894198, "grad_norm": 0.8725030955526336, "learning_rate": 9.991795886566443e-06, "loss": 0.3401, "step": 43 }, { "epoch": 0.2402730375426621, "grad_norm": 0.9974480242764955, "learning_rate": 9.988834393115768e-06, "loss": 0.3424, "step": 44 }, { "epoch": 0.24573378839590443, "grad_norm": 0.8805730275929089, "learning_rate": 9.98541801244351e-06, "loss": 0.3742, "step": 45 }, { "epoch": 0.25119453924914675, "grad_norm": 0.8332420506302001, "learning_rate": 9.981547056058893e-06, "loss": 0.3435, "step": 46 }, { "epoch": 0.2566552901023891, "grad_norm": 0.9445729244701234, "learning_rate": 9.977221876919833e-06, "loss": 0.3442, "step": 47 }, { "epoch": 0.2621160409556314, "grad_norm": 0.859922027597315, "learning_rate": 9.97244286940076e-06, "loss": 0.358, "step": 48 }, { "epoch": 0.2675767918088737, "grad_norm": 0.8022442917536148, "learning_rate": 9.967210469256657e-06, "loss": 0.3329, "step": 49 }, { "epoch": 0.27303754266211605, "grad_norm": 0.8369993999252197, "learning_rate": 9.961525153583327e-06, "loss": 0.3474, "step": 50 }, { "epoch": 0.2784982935153584, "grad_norm": 0.8719055464818419, "learning_rate": 9.955387440773902e-06, "loss": 0.3364, "step": 51 }, { "epoch": 0.2839590443686007, "grad_norm": 0.9269845480680101, "learning_rate": 9.948797890471552e-06, "loss": 0.3684, "step": 52 }, { "epoch": 0.289419795221843, "grad_norm": 0.8246571303849338, "learning_rate": 9.94175710351848e-06, "loss": 0.3564, "step": 53 }, { "epoch": 0.29488054607508535, "grad_norm": 0.9162125135698432, "learning_rate": 9.93426572190112e-06, "loss": 0.3526, "step": 54 }, { "epoch": 0.3003412969283277, "grad_norm": 0.9689985766932336, "learning_rate": 9.926324428691612e-06, "loss": 0.3825, "step": 55 }, { "epoch": 0.30580204778156994, "grad_norm": 0.9203465649703365, "learning_rate": 9.917933947985508e-06, "loss": 0.3492, "step": 56 }, { "epoch": 0.31126279863481227, "grad_norm": 0.810691112658576, "learning_rate": 9.909095044835755e-06, "loss": 0.3147, "step": 57 }, { "epoch": 0.3167235494880546, "grad_norm": 0.8980168883992854, "learning_rate": 9.899808525182935e-06, "loss": 0.3351, "step": 58 }, { "epoch": 0.3221843003412969, "grad_norm": 0.8843165617295874, "learning_rate": 9.89007523578178e-06, "loss": 0.3452, "step": 59 }, { "epoch": 0.32764505119453924, "grad_norm": 0.8660715276442186, "learning_rate": 9.879896064123961e-06, "loss": 0.3601, "step": 60 }, { "epoch": 0.33310580204778156, "grad_norm": 0.8638898824914902, "learning_rate": 9.869271938357168e-06, "loss": 0.3565, "step": 61 }, { "epoch": 0.3385665529010239, "grad_norm": 0.8349466672789928, "learning_rate": 9.858203827200477e-06, "loss": 0.3592, "step": 62 }, { "epoch": 0.3440273037542662, "grad_norm": 0.9346433422616252, "learning_rate": 9.846692739856023e-06, "loss": 0.3935, "step": 63 }, { "epoch": 0.34948805460750854, "grad_norm": 0.8287634034991234, "learning_rate": 9.834739725916988e-06, "loss": 0.3089, "step": 64 }, { "epoch": 0.35494880546075086, "grad_norm": 0.8040217244181859, "learning_rate": 9.822345875271884e-06, "loss": 0.313, "step": 65 }, { "epoch": 0.3604095563139932, "grad_norm": 0.8053513263090958, "learning_rate": 9.80951231800518e-06, "loss": 0.3355, "step": 66 }, { "epoch": 0.3658703071672355, "grad_norm": 0.7533298814162714, "learning_rate": 9.79624022429427e-06, "loss": 0.3067, "step": 67 }, { "epoch": 0.37133105802047783, "grad_norm": 0.9386782501271983, "learning_rate": 9.782530804302763e-06, "loss": 0.3593, "step": 68 }, { "epoch": 0.37679180887372016, "grad_norm": 0.8507056335702303, "learning_rate": 9.768385308070139e-06, "loss": 0.3629, "step": 69 }, { "epoch": 0.3822525597269625, "grad_norm": 0.782049564136347, "learning_rate": 9.75380502539778e-06, "loss": 0.3458, "step": 70 }, { "epoch": 0.38771331058020475, "grad_norm": 0.9109652113851044, "learning_rate": 9.738791285731353e-06, "loss": 0.348, "step": 71 }, { "epoch": 0.3931740614334471, "grad_norm": 0.8457379953081087, "learning_rate": 9.723345458039595e-06, "loss": 0.3701, "step": 72 }, { "epoch": 0.3986348122866894, "grad_norm": 0.8354411100444213, "learning_rate": 9.70746895068949e-06, "loss": 0.3453, "step": 73 }, { "epoch": 0.4040955631399317, "grad_norm": 0.7824544054631952, "learning_rate": 9.691163211317853e-06, "loss": 0.3393, "step": 74 }, { "epoch": 0.40955631399317405, "grad_norm": 0.7752036290890001, "learning_rate": 9.674429726699324e-06, "loss": 0.3121, "step": 75 }, { "epoch": 0.4150170648464164, "grad_norm": 0.9047037383020493, "learning_rate": 9.657270022610814e-06, "loss": 0.3507, "step": 76 }, { "epoch": 0.4204778156996587, "grad_norm": 0.8453635648693023, "learning_rate": 9.63968566369238e-06, "loss": 0.3641, "step": 77 }, { "epoch": 0.425938566552901, "grad_norm": 0.8290743120901927, "learning_rate": 9.62167825330455e-06, "loss": 0.3739, "step": 78 }, { "epoch": 0.43139931740614335, "grad_norm": 0.8977215293449932, "learning_rate": 9.603249433382145e-06, "loss": 0.3185, "step": 79 }, { "epoch": 0.43686006825938567, "grad_norm": 0.9078617748361664, "learning_rate": 9.584400884284546e-06, "loss": 0.3415, "step": 80 }, { "epoch": 0.442320819112628, "grad_norm": 0.8589830385419883, "learning_rate": 9.565134324642491e-06, "loss": 0.3331, "step": 81 }, { "epoch": 0.4477815699658703, "grad_norm": 0.804380018393787, "learning_rate": 9.545451511201365e-06, "loss": 0.322, "step": 82 }, { "epoch": 0.45324232081911264, "grad_norm": 0.8685230840996425, "learning_rate": 9.52535423866101e-06, "loss": 0.3476, "step": 83 }, { "epoch": 0.45870307167235497, "grad_norm": 0.9643956240091752, "learning_rate": 9.504844339512096e-06, "loss": 0.3671, "step": 84 }, { "epoch": 0.4641638225255973, "grad_norm": 0.8997894029115073, "learning_rate": 9.483923683869025e-06, "loss": 0.352, "step": 85 }, { "epoch": 0.4696245733788396, "grad_norm": 0.9409163478885427, "learning_rate": 9.462594179299408e-06, "loss": 0.3533, "step": 86 }, { "epoch": 0.4750853242320819, "grad_norm": 1.0349789755076704, "learning_rate": 9.440857770650139e-06, "loss": 0.3501, "step": 87 }, { "epoch": 0.4805460750853242, "grad_norm": 0.7719492270463393, "learning_rate": 9.418716439870056e-06, "loss": 0.3092, "step": 88 }, { "epoch": 0.48600682593856653, "grad_norm": 0.9082886643398166, "learning_rate": 9.396172205829235e-06, "loss": 0.3514, "step": 89 }, { "epoch": 0.49146757679180886, "grad_norm": 0.8358389654564478, "learning_rate": 9.373227124134888e-06, "loss": 0.3489, "step": 90 }, { "epoch": 0.4969283276450512, "grad_norm": 0.8974753686960236, "learning_rate": 9.349883286943951e-06, "loss": 0.3632, "step": 91 }, { "epoch": 0.5023890784982935, "grad_norm": 0.8827201059716774, "learning_rate": 9.326142822772301e-06, "loss": 0.3584, "step": 92 }, { "epoch": 0.5078498293515359, "grad_norm": 0.813182991570662, "learning_rate": 9.302007896300697e-06, "loss": 0.3591, "step": 93 }, { "epoch": 0.5133105802047782, "grad_norm": 0.7442842781039997, "learning_rate": 9.27748070817738e-06, "loss": 0.3143, "step": 94 }, { "epoch": 0.5187713310580204, "grad_norm": 0.906866423901588, "learning_rate": 9.252563494817426e-06, "loss": 0.3772, "step": 95 }, { "epoch": 0.5242320819112628, "grad_norm": 0.7894206448318375, "learning_rate": 9.227258528198832e-06, "loss": 0.3131, "step": 96 }, { "epoch": 0.5296928327645051, "grad_norm": 0.8009536933279702, "learning_rate": 9.201568115655343e-06, "loss": 0.329, "step": 97 }, { "epoch": 0.5351535836177475, "grad_norm": 0.8048929927509286, "learning_rate": 9.175494599666078e-06, "loss": 0.3278, "step": 98 }, { "epoch": 0.5406143344709897, "grad_norm": 0.814222453793431, "learning_rate": 9.14904035764193e-06, "loss": 0.3225, "step": 99 }, { "epoch": 0.5460750853242321, "grad_norm": 0.8732367458543802, "learning_rate": 9.122207801708802e-06, "loss": 0.3524, "step": 100 }, { "epoch": 0.5515358361774744, "grad_norm": 0.8134905761183274, "learning_rate": 9.094999378487659e-06, "loss": 0.3546, "step": 101 }, { "epoch": 0.5569965870307167, "grad_norm": 0.8567463415353727, "learning_rate": 9.067417568871444e-06, "loss": 0.3548, "step": 102 }, { "epoch": 0.562457337883959, "grad_norm": 0.8151562254810784, "learning_rate": 9.03946488779887e-06, "loss": 0.3439, "step": 103 }, { "epoch": 0.5679180887372014, "grad_norm": 0.8746438505757359, "learning_rate": 9.0111438840251e-06, "loss": 0.3242, "step": 104 }, { "epoch": 0.5733788395904437, "grad_norm": 0.8085121266810896, "learning_rate": 8.982457139889358e-06, "loss": 0.3598, "step": 105 }, { "epoch": 0.578839590443686, "grad_norm": 0.8191769217039168, "learning_rate": 8.953407271079456e-06, "loss": 0.3425, "step": 106 }, { "epoch": 0.5843003412969283, "grad_norm": 0.874872463262842, "learning_rate": 8.923996926393306e-06, "loss": 0.3795, "step": 107 }, { "epoch": 0.5897610921501707, "grad_norm": 0.8469769243731713, "learning_rate": 8.894228787497389e-06, "loss": 0.3555, "step": 108 }, { "epoch": 0.595221843003413, "grad_norm": 0.7907533312057188, "learning_rate": 8.864105568682245e-06, "loss": 0.3425, "step": 109 }, { "epoch": 0.6006825938566553, "grad_norm": 0.9105392675920642, "learning_rate": 8.833630016614976e-06, "loss": 0.3214, "step": 110 }, { "epoch": 0.6061433447098976, "grad_norm": 0.7743632593675985, "learning_rate": 8.80280491008881e-06, "loss": 0.3477, "step": 111 }, { "epoch": 0.6116040955631399, "grad_norm": 0.9007334854740756, "learning_rate": 8.771633059769712e-06, "loss": 0.3836, "step": 112 }, { "epoch": 0.6170648464163823, "grad_norm": 0.810760704066922, "learning_rate": 8.740117307940123e-06, "loss": 0.3397, "step": 113 }, { "epoch": 0.6225255972696245, "grad_norm": 0.9072711750424595, "learning_rate": 8.708260528239788e-06, "loss": 0.3389, "step": 114 }, { "epoch": 0.6279863481228669, "grad_norm": 0.8621760047744049, "learning_rate": 8.676065625403733e-06, "loss": 0.3788, "step": 115 }, { "epoch": 0.6334470989761092, "grad_norm": 0.8388020553913726, "learning_rate": 8.64353553499741e-06, "loss": 0.3274, "step": 116 }, { "epoch": 0.6389078498293516, "grad_norm": 0.8591445453801212, "learning_rate": 8.610673223149036e-06, "loss": 0.3598, "step": 117 }, { "epoch": 0.6443686006825938, "grad_norm": 0.8057297251815362, "learning_rate": 8.577481686279123e-06, "loss": 0.3522, "step": 118 }, { "epoch": 0.6498293515358362, "grad_norm": 0.779914515334107, "learning_rate": 8.543963950827279e-06, "loss": 0.3416, "step": 119 }, { "epoch": 0.6552901023890785, "grad_norm": 0.8241347234199242, "learning_rate": 8.51012307297624e-06, "loss": 0.341, "step": 120 }, { "epoch": 0.6607508532423209, "grad_norm": 0.7674873201219691, "learning_rate": 8.475962138373212e-06, "loss": 0.3268, "step": 121 }, { "epoch": 0.6662116040955631, "grad_norm": 0.7983877124268901, "learning_rate": 8.441484261848514e-06, "loss": 0.3744, "step": 122 }, { "epoch": 0.6716723549488055, "grad_norm": 0.924444516956386, "learning_rate": 8.406692587131569e-06, "loss": 0.341, "step": 123 }, { "epoch": 0.6771331058020478, "grad_norm": 0.7648013324474998, "learning_rate": 8.371590286564247e-06, "loss": 0.3239, "step": 124 }, { "epoch": 0.6825938566552902, "grad_norm": 0.8275858574184091, "learning_rate": 8.336180560811619e-06, "loss": 0.3588, "step": 125 }, { "epoch": 0.6880546075085324, "grad_norm": 0.7874924257335151, "learning_rate": 8.30046663857011e-06, "loss": 0.3431, "step": 126 }, { "epoch": 0.6935153583617747, "grad_norm": 0.8745641415217126, "learning_rate": 8.264451776273104e-06, "loss": 0.3489, "step": 127 }, { "epoch": 0.6989761092150171, "grad_norm": 0.8858812955767805, "learning_rate": 8.228139257794012e-06, "loss": 0.3595, "step": 128 }, { "epoch": 0.7044368600682593, "grad_norm": 0.812177330348684, "learning_rate": 8.191532394146865e-06, "loss": 0.328, "step": 129 }, { "epoch": 0.7098976109215017, "grad_norm": 0.7755088132854933, "learning_rate": 8.154634523184389e-06, "loss": 0.3392, "step": 130 }, { "epoch": 0.715358361774744, "grad_norm": 0.8715295143660003, "learning_rate": 8.117449009293668e-06, "loss": 0.3482, "step": 131 }, { "epoch": 0.7208191126279864, "grad_norm": 0.7737148258150855, "learning_rate": 8.07997924308938e-06, "loss": 0.3258, "step": 132 }, { "epoch": 0.7262798634812286, "grad_norm": 0.7616464397737633, "learning_rate": 8.042228641104622e-06, "loss": 0.3164, "step": 133 }, { "epoch": 0.731740614334471, "grad_norm": 0.7706843428925245, "learning_rate": 8.004200645479403e-06, "loss": 0.3267, "step": 134 }, { "epoch": 0.7372013651877133, "grad_norm": 0.807087784184599, "learning_rate": 7.965898723646777e-06, "loss": 0.3556, "step": 135 }, { "epoch": 0.7426621160409557, "grad_norm": 0.8571787444948499, "learning_rate": 7.927326368016677e-06, "loss": 0.349, "step": 136 }, { "epoch": 0.7481228668941979, "grad_norm": 0.758611440956407, "learning_rate": 7.888487095657484e-06, "loss": 0.3301, "step": 137 }, { "epoch": 0.7535836177474403, "grad_norm": 0.7924167608304931, "learning_rate": 7.849384447975322e-06, "loss": 0.3534, "step": 138 }, { "epoch": 0.7590443686006826, "grad_norm": 0.8750460208633537, "learning_rate": 7.810021990391163e-06, "loss": 0.3405, "step": 139 }, { "epoch": 0.764505119453925, "grad_norm": 0.7895037781717571, "learning_rate": 7.77040331201572e-06, "loss": 0.3678, "step": 140 }, { "epoch": 0.7699658703071672, "grad_norm": 0.8223142510608592, "learning_rate": 7.73053202532219e-06, "loss": 0.3469, "step": 141 }, { "epoch": 0.7754266211604095, "grad_norm": 0.8172036706667312, "learning_rate": 7.690411765816864e-06, "loss": 0.3395, "step": 142 }, { "epoch": 0.7808873720136519, "grad_norm": 0.7717521340469524, "learning_rate": 7.650046191707641e-06, "loss": 0.3352, "step": 143 }, { "epoch": 0.7863481228668942, "grad_norm": 0.8745843729938327, "learning_rate": 7.609438983570461e-06, "loss": 0.34, "step": 144 }, { "epoch": 0.7918088737201365, "grad_norm": 0.8462879664073518, "learning_rate": 7.5685938440137185e-06, "loss": 0.3434, "step": 145 }, { "epoch": 0.7972696245733788, "grad_norm": 0.8887194246240154, "learning_rate": 7.527514497340642e-06, "loss": 0.3536, "step": 146 }, { "epoch": 0.8027303754266212, "grad_norm": 0.720734982965855, "learning_rate": 7.486204689209719e-06, "loss": 0.3071, "step": 147 }, { "epoch": 0.8081911262798634, "grad_norm": 0.7852180915891143, "learning_rate": 7.444668186293153e-06, "loss": 0.3318, "step": 148 }, { "epoch": 0.8136518771331058, "grad_norm": 0.8169236397844766, "learning_rate": 7.402908775933419e-06, "loss": 0.3282, "step": 149 }, { "epoch": 0.8191126279863481, "grad_norm": 0.8409146266167947, "learning_rate": 7.360930265797934e-06, "loss": 0.3592, "step": 150 }, { "epoch": 0.8245733788395905, "grad_norm": 0.7893736430445095, "learning_rate": 7.318736483531861e-06, "loss": 0.3455, "step": 151 }, { "epoch": 0.8300341296928327, "grad_norm": 0.7092487578490618, "learning_rate": 7.2763312764091055e-06, "loss": 0.307, "step": 152 }, { "epoch": 0.8354948805460751, "grad_norm": 0.7643841671055314, "learning_rate": 7.23371851098152e-06, "loss": 0.3104, "step": 153 }, { "epoch": 0.8409556313993174, "grad_norm": 0.8743703462981528, "learning_rate": 7.190902072726336e-06, "loss": 0.3601, "step": 154 }, { "epoch": 0.8464163822525598, "grad_norm": 0.8748161240027253, "learning_rate": 7.147885865691899e-06, "loss": 0.3592, "step": 155 }, { "epoch": 0.851877133105802, "grad_norm": 0.6528952892311825, "learning_rate": 7.104673812141676e-06, "loss": 0.2919, "step": 156 }, { "epoch": 0.8573378839590444, "grad_norm": 0.8161745547126792, "learning_rate": 7.061269852196633e-06, "loss": 0.345, "step": 157 }, { "epoch": 0.8627986348122867, "grad_norm": 0.8321903783865391, "learning_rate": 7.017677943475962e-06, "loss": 0.321, "step": 158 }, { "epoch": 0.868259385665529, "grad_norm": 0.83313681444351, "learning_rate": 6.973902060736226e-06, "loss": 0.3435, "step": 159 }, { "epoch": 0.8737201365187713, "grad_norm": 0.7505151585539925, "learning_rate": 6.929946195508933e-06, "loss": 0.3163, "step": 160 }, { "epoch": 0.8791808873720136, "grad_norm": 0.7304802524364322, "learning_rate": 6.8858143557365865e-06, "loss": 0.328, "step": 161 }, { "epoch": 0.884641638225256, "grad_norm": 0.8143420713928606, "learning_rate": 6.841510565407235e-06, "loss": 0.3341, "step": 162 }, { "epoch": 0.8901023890784983, "grad_norm": 0.7567204344075086, "learning_rate": 6.797038864187564e-06, "loss": 0.3059, "step": 163 }, { "epoch": 0.8955631399317406, "grad_norm": 0.7826567782778101, "learning_rate": 6.752403307054549e-06, "loss": 0.3283, "step": 164 }, { "epoch": 0.9010238907849829, "grad_norm": 0.7886900433942758, "learning_rate": 6.707607963925725e-06, "loss": 0.3592, "step": 165 }, { "epoch": 0.9064846416382253, "grad_norm": 0.820709571232716, "learning_rate": 6.66265691928808e-06, "loss": 0.3605, "step": 166 }, { "epoch": 0.9119453924914676, "grad_norm": 0.7681789982648866, "learning_rate": 6.617554271825636e-06, "loss": 0.3051, "step": 167 }, { "epoch": 0.9174061433447099, "grad_norm": 0.8006459558215293, "learning_rate": 6.5723041340457175e-06, "loss": 0.3542, "step": 168 }, { "epoch": 0.9228668941979522, "grad_norm": 0.7333102829214887, "learning_rate": 6.526910631903973e-06, "loss": 0.3254, "step": 169 }, { "epoch": 0.9283276450511946, "grad_norm": 0.7766899671870917, "learning_rate": 6.481377904428171e-06, "loss": 0.3297, "step": 170 }, { "epoch": 0.9337883959044369, "grad_norm": 0.8887532080533157, "learning_rate": 6.435710103340787e-06, "loss": 0.3531, "step": 171 }, { "epoch": 0.9392491467576792, "grad_norm": 0.7606421967689092, "learning_rate": 6.3899113926804565e-06, "loss": 0.3279, "step": 172 }, { "epoch": 0.9447098976109215, "grad_norm": 0.7894203946388427, "learning_rate": 6.3439859484222874e-06, "loss": 0.3206, "step": 173 }, { "epoch": 0.9501706484641638, "grad_norm": 0.8106143896629081, "learning_rate": 6.297937958097094e-06, "loss": 0.3185, "step": 174 }, { "epoch": 0.9556313993174061, "grad_norm": 0.7673331407317434, "learning_rate": 6.251771620409563e-06, "loss": 0.3408, "step": 175 }, { "epoch": 0.9610921501706484, "grad_norm": 0.7678720102410665, "learning_rate": 6.205491144855432e-06, "loss": 0.3388, "step": 176 }, { "epoch": 0.9665529010238908, "grad_norm": 0.8058357314804626, "learning_rate": 6.1591007513376425e-06, "loss": 0.348, "step": 177 }, { "epoch": 0.9720136518771331, "grad_norm": 0.7150290944167804, "learning_rate": 6.112604669781572e-06, "loss": 0.3187, "step": 178 }, { "epoch": 0.9774744027303754, "grad_norm": 0.7724885943742522, "learning_rate": 6.066007139749351e-06, "loss": 0.3112, "step": 179 }, { "epoch": 0.9829351535836177, "grad_norm": 0.7079636144073458, "learning_rate": 6.019312410053286e-06, "loss": 0.3115, "step": 180 }, { "epoch": 0.9883959044368601, "grad_norm": 0.7145124027416198, "learning_rate": 5.972524738368452e-06, "loss": 0.3015, "step": 181 }, { "epoch": 0.9938566552901024, "grad_norm": 0.7747190577463166, "learning_rate": 5.925648390844476e-06, "loss": 0.3405, "step": 182 }, { "epoch": 0.9993174061433447, "grad_norm": 0.7411495697672651, "learning_rate": 5.878687641716539e-06, "loss": 0.3241, "step": 183 }, { "epoch": 1.004778156996587, "grad_norm": 2.2845026949908367, "learning_rate": 5.831646772915651e-06, "loss": 0.5887, "step": 184 }, { "epoch": 1.0102389078498293, "grad_norm": 0.767067987662548, "learning_rate": 5.7845300736782205e-06, "loss": 0.2696, "step": 185 }, { "epoch": 1.0156996587030718, "grad_norm": 0.6444558204539116, "learning_rate": 5.7373418401549565e-06, "loss": 0.2179, "step": 186 }, { "epoch": 1.021160409556314, "grad_norm": 0.6326875020427418, "learning_rate": 5.690086375019135e-06, "loss": 0.2063, "step": 187 }, { "epoch": 1.0266211604095563, "grad_norm": 0.6386737207560813, "learning_rate": 5.642767987074288e-06, "loss": 0.2395, "step": 188 }, { "epoch": 1.0320819112627986, "grad_norm": 0.7191360204333792, "learning_rate": 5.595390990861311e-06, "loss": 0.2593, "step": 189 }, { "epoch": 1.0375426621160408, "grad_norm": 0.7139153218951271, "learning_rate": 5.547959706265068e-06, "loss": 0.25, "step": 190 }, { "epoch": 1.0430034129692833, "grad_norm": 0.7132174285440807, "learning_rate": 5.500478458120493e-06, "loss": 0.2656, "step": 191 }, { "epoch": 1.0484641638225256, "grad_norm": 0.7373698973683902, "learning_rate": 5.45295157581825e-06, "loss": 0.2643, "step": 192 }, { "epoch": 1.0539249146757679, "grad_norm": 0.6806301830473086, "learning_rate": 5.405383392909973e-06, "loss": 0.2521, "step": 193 }, { "epoch": 1.0593856655290101, "grad_norm": 0.6785558579468979, "learning_rate": 5.357778246713131e-06, "loss": 0.254, "step": 194 }, { "epoch": 1.0648464163822526, "grad_norm": 0.6701400051635917, "learning_rate": 5.310140477915544e-06, "loss": 0.2303, "step": 195 }, { "epoch": 1.070307167235495, "grad_norm": 0.7408218245910705, "learning_rate": 5.262474430179597e-06, "loss": 0.2587, "step": 196 }, { "epoch": 1.0757679180887372, "grad_norm": 0.652029496994987, "learning_rate": 5.2147844497461745e-06, "loss": 0.2201, "step": 197 }, { "epoch": 1.0812286689419794, "grad_norm": 0.6145756306559864, "learning_rate": 5.1670748850383734e-06, "loss": 0.2131, "step": 198 }, { "epoch": 1.086689419795222, "grad_norm": 0.6166424191446999, "learning_rate": 5.1193500862650045e-06, "loss": 0.2272, "step": 199 }, { "epoch": 1.0921501706484642, "grad_norm": 0.6183167944122974, "learning_rate": 5.071614405023938e-06, "loss": 0.2239, "step": 200 }, { "epoch": 1.0976109215017065, "grad_norm": 0.7073017446193981, "learning_rate": 5.023872193905316e-06, "loss": 0.2564, "step": 201 }, { "epoch": 1.1030716723549487, "grad_norm": 0.6785368103283103, "learning_rate": 4.976127806094685e-06, "loss": 0.2598, "step": 202 }, { "epoch": 1.108532423208191, "grad_norm": 0.6686099383276679, "learning_rate": 4.928385594976063e-06, "loss": 0.2391, "step": 203 }, { "epoch": 1.1139931740614335, "grad_norm": 0.6046536649329635, "learning_rate": 4.880649913734996e-06, "loss": 0.2111, "step": 204 }, { "epoch": 1.1194539249146758, "grad_norm": 0.6455972829075776, "learning_rate": 4.832925114961629e-06, "loss": 0.2291, "step": 205 }, { "epoch": 1.124914675767918, "grad_norm": 0.6294601922178525, "learning_rate": 4.785215550253826e-06, "loss": 0.2237, "step": 206 }, { "epoch": 1.1303754266211605, "grad_norm": 0.6539972986726327, "learning_rate": 4.737525569820405e-06, "loss": 0.2415, "step": 207 }, { "epoch": 1.1358361774744028, "grad_norm": 0.6841776523547041, "learning_rate": 4.689859522084457e-06, "loss": 0.2573, "step": 208 }, { "epoch": 1.141296928327645, "grad_norm": 0.708275852733329, "learning_rate": 4.64222175328687e-06, "loss": 0.2535, "step": 209 }, { "epoch": 1.1467576791808873, "grad_norm": 0.6327858698732379, "learning_rate": 4.594616607090028e-06, "loss": 0.2284, "step": 210 }, { "epoch": 1.1522184300341296, "grad_norm": 0.6408257648532151, "learning_rate": 4.547048424181751e-06, "loss": 0.2294, "step": 211 }, { "epoch": 1.157679180887372, "grad_norm": 0.6159123552870842, "learning_rate": 4.499521541879508e-06, "loss": 0.2226, "step": 212 }, { "epoch": 1.1631399317406144, "grad_norm": 0.5823300781202381, "learning_rate": 4.452040293734934e-06, "loss": 0.2108, "step": 213 }, { "epoch": 1.1686006825938566, "grad_norm": 0.6041391928150867, "learning_rate": 4.40460900913869e-06, "loss": 0.2224, "step": 214 }, { "epoch": 1.174061433447099, "grad_norm": 0.6641306892375002, "learning_rate": 4.357232012925714e-06, "loss": 0.2384, "step": 215 }, { "epoch": 1.1795221843003414, "grad_norm": 0.6503207204016519, "learning_rate": 4.309913624980866e-06, "loss": 0.2347, "step": 216 }, { "epoch": 1.1849829351535837, "grad_norm": 0.62805580635999, "learning_rate": 4.262658159845046e-06, "loss": 0.229, "step": 217 }, { "epoch": 1.190443686006826, "grad_norm": 0.6275617918722145, "learning_rate": 4.2154699263217794e-06, "loss": 0.2286, "step": 218 }, { "epoch": 1.1959044368600682, "grad_norm": 0.7617460871871701, "learning_rate": 4.1683532270843505e-06, "loss": 0.2574, "step": 219 }, { "epoch": 1.2013651877133107, "grad_norm": 0.6140110702778818, "learning_rate": 4.121312358283464e-06, "loss": 0.2149, "step": 220 }, { "epoch": 1.206825938566553, "grad_norm": 0.6247259244040378, "learning_rate": 4.074351609155527e-06, "loss": 0.2381, "step": 221 }, { "epoch": 1.2122866894197952, "grad_norm": 0.6591171660703798, "learning_rate": 4.0274752616315485e-06, "loss": 0.2344, "step": 222 }, { "epoch": 1.2177474402730375, "grad_norm": 0.6436293629709356, "learning_rate": 3.980687589946715e-06, "loss": 0.2319, "step": 223 }, { "epoch": 1.2232081911262798, "grad_norm": 0.7670288305294722, "learning_rate": 3.9339928602506505e-06, "loss": 0.2497, "step": 224 }, { "epoch": 1.2286689419795223, "grad_norm": 0.6962942427446093, "learning_rate": 3.887395330218429e-06, "loss": 0.2336, "step": 225 }, { "epoch": 1.2341296928327645, "grad_norm": 0.6965676226191562, "learning_rate": 3.840899248662358e-06, "loss": 0.2552, "step": 226 }, { "epoch": 1.2395904436860068, "grad_norm": 0.6641042493963545, "learning_rate": 3.7945088551445698e-06, "loss": 0.2563, "step": 227 }, { "epoch": 1.245051194539249, "grad_norm": 0.6196955536234605, "learning_rate": 3.748228379590438e-06, "loss": 0.2291, "step": 228 }, { "epoch": 1.2505119453924913, "grad_norm": 0.6278753799486634, "learning_rate": 3.7020620419029095e-06, "loss": 0.2141, "step": 229 }, { "epoch": 1.2559726962457338, "grad_norm": 0.6082644009007588, "learning_rate": 3.656014051577713e-06, "loss": 0.2122, "step": 230 }, { "epoch": 1.261433447098976, "grad_norm": 0.6736359984841271, "learning_rate": 3.610088607319544e-06, "loss": 0.2367, "step": 231 }, { "epoch": 1.2668941979522184, "grad_norm": 0.6329301932656438, "learning_rate": 3.5642898966592145e-06, "loss": 0.235, "step": 232 }, { "epoch": 1.2723549488054609, "grad_norm": 0.619963960271499, "learning_rate": 3.518622095571831e-06, "loss": 0.2208, "step": 233 }, { "epoch": 1.2778156996587031, "grad_norm": 0.6693990598652739, "learning_rate": 3.4730893680960267e-06, "loss": 0.2406, "step": 234 }, { "epoch": 1.2832764505119454, "grad_norm": 0.6864592317171182, "learning_rate": 3.4276958659542838e-06, "loss": 0.243, "step": 235 }, { "epoch": 1.2887372013651877, "grad_norm": 0.7236681291816511, "learning_rate": 3.382445728174365e-06, "loss": 0.2586, "step": 236 }, { "epoch": 1.29419795221843, "grad_norm": 0.6176752667693888, "learning_rate": 3.3373430807119212e-06, "loss": 0.2251, "step": 237 }, { "epoch": 1.2996587030716724, "grad_norm": 0.7022262485772638, "learning_rate": 3.292392036074277e-06, "loss": 0.2316, "step": 238 }, { "epoch": 1.3051194539249147, "grad_norm": 0.6404240889042992, "learning_rate": 3.2475966929454505e-06, "loss": 0.2384, "step": 239 }, { "epoch": 1.310580204778157, "grad_norm": 0.7080127357360425, "learning_rate": 3.202961135812437e-06, "loss": 0.248, "step": 240 }, { "epoch": 1.3160409556313994, "grad_norm": 0.668638034601711, "learning_rate": 3.1584894345927663e-06, "loss": 0.2212, "step": 241 }, { "epoch": 1.3215017064846417, "grad_norm": 0.6729621818012642, "learning_rate": 3.114185644263415e-06, "loss": 0.222, "step": 242 }, { "epoch": 1.326962457337884, "grad_norm": 0.6723316181938683, "learning_rate": 3.0700538044910684e-06, "loss": 0.2246, "step": 243 }, { "epoch": 1.3324232081911263, "grad_norm": 0.6671061425745013, "learning_rate": 3.0260979392637753e-06, "loss": 0.2518, "step": 244 }, { "epoch": 1.3378839590443685, "grad_norm": 0.5962051816320753, "learning_rate": 2.9823220565240396e-06, "loss": 0.2224, "step": 245 }, { "epoch": 1.343344709897611, "grad_norm": 0.6511243444073086, "learning_rate": 2.9387301478033694e-06, "loss": 0.2521, "step": 246 }, { "epoch": 1.3488054607508533, "grad_norm": 0.6112346949791394, "learning_rate": 2.8953261878583263e-06, "loss": 0.2164, "step": 247 }, { "epoch": 1.3542662116040955, "grad_norm": 0.6844064518509092, "learning_rate": 2.852114134308104e-06, "loss": 0.2532, "step": 248 }, { "epoch": 1.3597269624573378, "grad_norm": 0.6383549765315465, "learning_rate": 2.8090979272736663e-06, "loss": 0.2401, "step": 249 }, { "epoch": 1.36518771331058, "grad_norm": 0.5944587646093914, "learning_rate": 2.766281489018482e-06, "loss": 0.2293, "step": 250 }, { "epoch": 1.3706484641638226, "grad_norm": 0.6307687611402706, "learning_rate": 2.7236687235908953e-06, "loss": 0.2188, "step": 251 }, { "epoch": 1.3761092150170648, "grad_norm": 0.6238745929037437, "learning_rate": 2.681263516468139e-06, "loss": 0.2475, "step": 252 }, { "epoch": 1.3815699658703071, "grad_norm": 0.6373763499985442, "learning_rate": 2.6390697342020665e-06, "loss": 0.2343, "step": 253 }, { "epoch": 1.3870307167235496, "grad_norm": 0.6734367954708225, "learning_rate": 2.5970912240665815e-06, "loss": 0.2353, "step": 254 }, { "epoch": 1.3924914675767919, "grad_norm": 0.6582720475674197, "learning_rate": 2.5553318137068473e-06, "loss": 0.2474, "step": 255 }, { "epoch": 1.3979522184300341, "grad_norm": 0.6052116695135601, "learning_rate": 2.5137953107902814e-06, "loss": 0.2322, "step": 256 }, { "epoch": 1.4034129692832764, "grad_norm": 0.6552372504854818, "learning_rate": 2.472485502659358e-06, "loss": 0.2468, "step": 257 }, { "epoch": 1.4088737201365187, "grad_norm": 0.6302233591025854, "learning_rate": 2.4314061559862836e-06, "loss": 0.2398, "step": 258 }, { "epoch": 1.4143344709897612, "grad_norm": 0.654084080401935, "learning_rate": 2.3905610164295394e-06, "loss": 0.2329, "step": 259 }, { "epoch": 1.4197952218430034, "grad_norm": 0.6485524489438387, "learning_rate": 2.3499538082923607e-06, "loss": 0.2446, "step": 260 }, { "epoch": 1.4252559726962457, "grad_norm": 0.6260148485778105, "learning_rate": 2.309588234183137e-06, "loss": 0.215, "step": 261 }, { "epoch": 1.430716723549488, "grad_norm": 0.6165139801898837, "learning_rate": 2.2694679746778116e-06, "loss": 0.2235, "step": 262 }, { "epoch": 1.4361774744027302, "grad_norm": 0.6073663452431178, "learning_rate": 2.22959668798428e-06, "loss": 0.21, "step": 263 }, { "epoch": 1.4416382252559727, "grad_norm": 0.6687068934985456, "learning_rate": 2.1899780096088375e-06, "loss": 0.2609, "step": 264 }, { "epoch": 1.447098976109215, "grad_norm": 0.5999286753849784, "learning_rate": 2.1506155520246795e-06, "loss": 0.2275, "step": 265 }, { "epoch": 1.4525597269624573, "grad_norm": 0.6562378405208374, "learning_rate": 2.1115129043425188e-06, "loss": 0.2577, "step": 266 }, { "epoch": 1.4580204778156998, "grad_norm": 0.6564390690276799, "learning_rate": 2.072673631983323e-06, "loss": 0.2583, "step": 267 }, { "epoch": 1.463481228668942, "grad_norm": 0.6175405413313049, "learning_rate": 2.0341012763532243e-06, "loss": 0.2252, "step": 268 }, { "epoch": 1.4689419795221843, "grad_norm": 0.6294576454771881, "learning_rate": 1.995799354520598e-06, "loss": 0.2282, "step": 269 }, { "epoch": 1.4744027303754266, "grad_norm": 0.6491408881404807, "learning_rate": 1.9577713588953797e-06, "loss": 0.2204, "step": 270 }, { "epoch": 1.4798634812286688, "grad_norm": 0.626492440911862, "learning_rate": 1.9200207569106216e-06, "loss": 0.2363, "step": 271 }, { "epoch": 1.4853242320819113, "grad_norm": 0.6328542452711655, "learning_rate": 1.8825509907063328e-06, "loss": 0.2312, "step": 272 }, { "epoch": 1.4907849829351536, "grad_norm": 0.6502462448470019, "learning_rate": 1.8453654768156138e-06, "loss": 0.2512, "step": 273 }, { "epoch": 1.4962457337883959, "grad_norm": 0.6012861830171234, "learning_rate": 1.8084676058531376e-06, "loss": 0.2285, "step": 274 }, { "epoch": 1.5017064846416384, "grad_norm": 0.6834276368175269, "learning_rate": 1.771860742205988e-06, "loss": 0.2512, "step": 275 }, { "epoch": 1.5071672354948804, "grad_norm": 0.6528553917231606, "learning_rate": 1.7355482237268983e-06, "loss": 0.2382, "step": 276 }, { "epoch": 1.512627986348123, "grad_norm": 0.5819350883243867, "learning_rate": 1.6995333614298908e-06, "loss": 0.2097, "step": 277 }, { "epoch": 1.5180887372013652, "grad_norm": 0.6231114790212122, "learning_rate": 1.6638194391883822e-06, "loss": 0.2352, "step": 278 }, { "epoch": 1.5235494880546074, "grad_norm": 0.6082248229800555, "learning_rate": 1.6284097134357535e-06, "loss": 0.2241, "step": 279 }, { "epoch": 1.52901023890785, "grad_norm": 0.5824479788396233, "learning_rate": 1.5933074128684333e-06, "loss": 0.2149, "step": 280 }, { "epoch": 1.5344709897610922, "grad_norm": 0.6414988574509947, "learning_rate": 1.5585157381514875e-06, "loss": 0.2629, "step": 281 }, { "epoch": 1.5399317406143345, "grad_norm": 0.6281763626708758, "learning_rate": 1.5240378616267887e-06, "loss": 0.2213, "step": 282 }, { "epoch": 1.545392491467577, "grad_norm": 0.6342381052621304, "learning_rate": 1.4898769270237611e-06, "loss": 0.2469, "step": 283 }, { "epoch": 1.550853242320819, "grad_norm": 0.63348306460088, "learning_rate": 1.4560360491727233e-06, "loss": 0.2369, "step": 284 }, { "epoch": 1.5563139931740615, "grad_norm": 0.6211597914243564, "learning_rate": 1.4225183137208775e-06, "loss": 0.2464, "step": 285 }, { "epoch": 1.5617747440273038, "grad_norm": 0.6287566376737247, "learning_rate": 1.389326776850966e-06, "loss": 0.2378, "step": 286 }, { "epoch": 1.567235494880546, "grad_norm": 0.6155227503256832, "learning_rate": 1.3564644650025894e-06, "loss": 0.2501, "step": 287 }, { "epoch": 1.5726962457337885, "grad_norm": 0.6705552802557545, "learning_rate": 1.323934374596268e-06, "loss": 0.2642, "step": 288 }, { "epoch": 1.5781569965870306, "grad_norm": 0.6468155028186762, "learning_rate": 1.2917394717602123e-06, "loss": 0.2391, "step": 289 }, { "epoch": 1.583617747440273, "grad_norm": 0.6059123207863814, "learning_rate": 1.2598826920598773e-06, "loss": 0.2471, "step": 290 }, { "epoch": 1.5890784982935153, "grad_norm": 0.622906276078447, "learning_rate": 1.2283669402302878e-06, "loss": 0.2441, "step": 291 }, { "epoch": 1.5945392491467576, "grad_norm": 0.6123734431964898, "learning_rate": 1.197195089911191e-06, "loss": 0.2359, "step": 292 }, { "epoch": 1.6, "grad_norm": 0.6173696097759185, "learning_rate": 1.166369983385024e-06, "loss": 0.2302, "step": 293 }, { "epoch": 1.6054607508532424, "grad_norm": 0.6309581460125253, "learning_rate": 1.1358944313177566e-06, "loss": 0.2312, "step": 294 }, { "epoch": 1.6109215017064846, "grad_norm": 0.6390016010862638, "learning_rate": 1.1057712125026116e-06, "loss": 0.2442, "step": 295 }, { "epoch": 1.6163822525597271, "grad_norm": 0.6276135127186313, "learning_rate": 1.0760030736066952e-06, "loss": 0.2297, "step": 296 }, { "epoch": 1.6218430034129692, "grad_norm": 0.612896281499447, "learning_rate": 1.0465927289205452e-06, "loss": 0.2346, "step": 297 }, { "epoch": 1.6273037542662117, "grad_norm": 0.5859909784104106, "learning_rate": 1.0175428601106441e-06, "loss": 0.2119, "step": 298 }, { "epoch": 1.632764505119454, "grad_norm": 0.6209786013367464, "learning_rate": 9.888561159748995e-07, "loss": 0.2432, "step": 299 }, { "epoch": 1.6382252559726962, "grad_norm": 0.6352233419147338, "learning_rate": 9.605351122011308e-07, "loss": 0.2392, "step": 300 }, { "epoch": 1.6436860068259387, "grad_norm": 0.5719749997080835, "learning_rate": 9.325824311285564e-07, "loss": 0.2173, "step": 301 }, { "epoch": 1.6491467576791807, "grad_norm": 0.6657684609315078, "learning_rate": 9.050006215123419e-07, "loss": 0.2606, "step": 302 }, { "epoch": 1.6546075085324232, "grad_norm": 0.5728781873883457, "learning_rate": 8.777921982911996e-07, "loss": 0.2214, "step": 303 }, { "epoch": 1.6600682593856655, "grad_norm": 0.6162844560531199, "learning_rate": 8.509596423580712e-07, "loss": 0.2464, "step": 304 }, { "epoch": 1.6655290102389078, "grad_norm": 0.5952374213138941, "learning_rate": 8.245054003339247e-07, "loss": 0.226, "step": 305 }, { "epoch": 1.6709897610921502, "grad_norm": 0.5961891699890017, "learning_rate": 7.984318843446593e-07, "loss": 0.2221, "step": 306 }, { "epoch": 1.6764505119453925, "grad_norm": 0.5937715818201527, "learning_rate": 7.727414718011706e-07, "loss": 0.2117, "step": 307 }, { "epoch": 1.6819112627986348, "grad_norm": 0.6853681886295775, "learning_rate": 7.474365051825749e-07, "loss": 0.257, "step": 308 }, { "epoch": 1.6873720136518773, "grad_norm": 0.6359287265633253, "learning_rate": 7.225192918226215e-07, "loss": 0.2395, "step": 309 }, { "epoch": 1.6928327645051193, "grad_norm": 0.5963026735275871, "learning_rate": 6.979921036993042e-07, "loss": 0.2233, "step": 310 }, { "epoch": 1.6982935153583618, "grad_norm": 0.6319949149208983, "learning_rate": 6.738571772276997e-07, "loss": 0.2416, "step": 311 }, { "epoch": 1.703754266211604, "grad_norm": 0.6203658184125409, "learning_rate": 6.501167130560515e-07, "loss": 0.2283, "step": 312 }, { "epoch": 1.7092150170648464, "grad_norm": 0.5938364868062161, "learning_rate": 6.267728758651131e-07, "loss": 0.2302, "step": 313 }, { "epoch": 1.7146757679180888, "grad_norm": 0.6087142064405593, "learning_rate": 6.038277941707671e-07, "loss": 0.2039, "step": 314 }, { "epoch": 1.7201365187713311, "grad_norm": 0.6401443657158584, "learning_rate": 5.812835601299438e-07, "loss": 0.254, "step": 315 }, { "epoch": 1.7255972696245734, "grad_norm": 0.5597403672115617, "learning_rate": 5.591422293498633e-07, "loss": 0.2074, "step": 316 }, { "epoch": 1.7310580204778157, "grad_norm": 0.6510913944426799, "learning_rate": 5.374058207005945e-07, "loss": 0.242, "step": 317 }, { "epoch": 1.736518771331058, "grad_norm": 0.5851164556251767, "learning_rate": 5.160763161309768e-07, "loss": 0.2208, "step": 318 }, { "epoch": 1.7419795221843004, "grad_norm": 0.5863228147924701, "learning_rate": 4.951556604879049e-07, "loss": 0.2167, "step": 319 }, { "epoch": 1.7474402730375427, "grad_norm": 0.6434185214532853, "learning_rate": 4.7464576133899043e-07, "loss": 0.2208, "step": 320 }, { "epoch": 1.752901023890785, "grad_norm": 0.6536708886817881, "learning_rate": 4.545484887986368e-07, "loss": 0.2558, "step": 321 }, { "epoch": 1.7583617747440274, "grad_norm": 0.5913503349988765, "learning_rate": 4.348656753575092e-07, "loss": 0.2412, "step": 322 }, { "epoch": 1.7638225255972695, "grad_norm": 0.6056503972371652, "learning_rate": 4.1559911571545544e-07, "loss": 0.2302, "step": 323 }, { "epoch": 1.769283276450512, "grad_norm": 0.6287557474105558, "learning_rate": 3.9675056661785563e-07, "loss": 0.2184, "step": 324 }, { "epoch": 1.7747440273037542, "grad_norm": 0.6387002743522794, "learning_rate": 3.783217466954503e-07, "loss": 0.2302, "step": 325 }, { "epoch": 1.7802047781569965, "grad_norm": 0.5919282825342829, "learning_rate": 3.603143363076217e-07, "loss": 0.2155, "step": 326 }, { "epoch": 1.785665529010239, "grad_norm": 0.6473381627525607, "learning_rate": 3.427299773891868e-07, "loss": 0.2661, "step": 327 }, { "epoch": 1.7911262798634813, "grad_norm": 0.5931604711623788, "learning_rate": 3.255702733006766e-07, "loss": 0.2338, "step": 328 }, { "epoch": 1.7965870307167235, "grad_norm": 0.6129008231127687, "learning_rate": 3.088367886821481e-07, "loss": 0.2514, "step": 329 }, { "epoch": 1.802047781569966, "grad_norm": 0.6062339345391793, "learning_rate": 2.925310493105099e-07, "loss": 0.208, "step": 330 }, { "epoch": 1.807508532423208, "grad_norm": 0.640691825777882, "learning_rate": 2.7665454196040665e-07, "loss": 0.2568, "step": 331 }, { "epoch": 1.8129692832764506, "grad_norm": 0.6228877907424235, "learning_rate": 2.6120871426864866e-07, "loss": 0.2445, "step": 332 }, { "epoch": 1.8184300341296928, "grad_norm": 0.6109720444847905, "learning_rate": 2.4619497460222184e-07, "loss": 0.2408, "step": 333 }, { "epoch": 1.823890784982935, "grad_norm": 0.6345094654832795, "learning_rate": 2.316146919298623e-07, "loss": 0.221, "step": 334 }, { "epoch": 1.8293515358361776, "grad_norm": 0.5612296424074616, "learning_rate": 2.1746919569723858e-07, "loss": 0.2137, "step": 335 }, { "epoch": 1.8348122866894196, "grad_norm": 0.5831279872950338, "learning_rate": 2.037597757057297e-07, "loss": 0.2178, "step": 336 }, { "epoch": 1.8402730375426621, "grad_norm": 0.639477342500686, "learning_rate": 1.9048768199481983e-07, "loss": 0.2417, "step": 337 }, { "epoch": 1.8457337883959044, "grad_norm": 0.5861730216308635, "learning_rate": 1.776541247281177e-07, "loss": 0.229, "step": 338 }, { "epoch": 1.8511945392491467, "grad_norm": 0.6019956045442054, "learning_rate": 1.6526027408301227e-07, "loss": 0.2212, "step": 339 }, { "epoch": 1.8566552901023892, "grad_norm": 0.59132952240458, "learning_rate": 1.5330726014397668e-07, "loss": 0.2301, "step": 340 }, { "epoch": 1.8621160409556314, "grad_norm": 0.6078005334703832, "learning_rate": 1.417961727995254e-07, "loss": 0.2239, "step": 341 }, { "epoch": 1.8675767918088737, "grad_norm": 0.585630251596369, "learning_rate": 1.307280616428336e-07, "loss": 0.2093, "step": 342 }, { "epoch": 1.8730375426621162, "grad_norm": 0.6003005675220531, "learning_rate": 1.2010393587603975e-07, "loss": 0.2558, "step": 343 }, { "epoch": 1.8784982935153582, "grad_norm": 0.6007862550074166, "learning_rate": 1.0992476421822052e-07, "loss": 0.2217, "step": 344 }, { "epoch": 1.8839590443686007, "grad_norm": 0.5798264754441842, "learning_rate": 1.0019147481706626e-07, "loss": 0.2, "step": 345 }, { "epoch": 1.889419795221843, "grad_norm": 0.5801152259959127, "learning_rate": 9.090495516424713e-08, "loss": 0.2219, "step": 346 }, { "epoch": 1.8948805460750853, "grad_norm": 0.5923845690395708, "learning_rate": 8.206605201449447e-08, "loss": 0.2029, "step": 347 }, { "epoch": 1.9003412969283278, "grad_norm": 0.5987478859251119, "learning_rate": 7.367557130838921e-08, "loss": 0.2256, "step": 348 }, { "epoch": 1.9058020477815698, "grad_norm": 0.5842539453063824, "learning_rate": 6.573427809888067e-08, "loss": 0.2003, "step": 349 }, { "epoch": 1.9112627986348123, "grad_norm": 0.6351293922330473, "learning_rate": 5.824289648152126e-08, "loss": 0.2395, "step": 350 }, { "epoch": 1.9167235494880546, "grad_norm": 0.6674020151826169, "learning_rate": 5.120210952844873e-08, "loss": 0.271, "step": 351 }, { "epoch": 1.9221843003412968, "grad_norm": 0.5675294646121625, "learning_rate": 4.461255922609986e-08, "loss": 0.2172, "step": 352 }, { "epoch": 1.9276450511945393, "grad_norm": 0.6285436049388992, "learning_rate": 3.8474846416672874e-08, "loss": 0.2399, "step": 353 }, { "epoch": 1.9331058020477816, "grad_norm": 0.6175868696026672, "learning_rate": 3.278953074334512e-08, "loss": 0.2212, "step": 354 }, { "epoch": 1.9385665529010239, "grad_norm": 0.6190764065365002, "learning_rate": 2.75571305992417e-08, "loss": 0.2374, "step": 355 }, { "epoch": 1.9440273037542664, "grad_norm": 0.6366754795576353, "learning_rate": 2.2778123080167136e-08, "loss": 0.2317, "step": 356 }, { "epoch": 1.9494880546075084, "grad_norm": 0.6088811286257059, "learning_rate": 1.845294394110686e-08, "loss": 0.2161, "step": 357 }, { "epoch": 1.954948805460751, "grad_norm": 0.5983150959140915, "learning_rate": 1.4581987556490095e-08, "loss": 0.2273, "step": 358 }, { "epoch": 1.9604095563139932, "grad_norm": 0.6172951456789394, "learning_rate": 1.1165606884234182e-08, "loss": 0.2514, "step": 359 }, { "epoch": 1.9658703071672354, "grad_norm": 0.6110206348502005, "learning_rate": 8.204113433559202e-09, "loss": 0.2248, "step": 360 }, { "epoch": 1.971331058020478, "grad_norm": 0.5973553403473014, "learning_rate": 5.6977772365857105e-09, "loss": 0.2415, "step": 361 }, { "epoch": 1.9767918088737202, "grad_norm": 0.6109000206055947, "learning_rate": 3.6468268237105364e-09, "loss": 0.2319, "step": 362 }, { "epoch": 1.9822525597269625, "grad_norm": 0.6357125881065677, "learning_rate": 2.0514492027728928e-09, "loss": 0.2457, "step": 363 }, { "epoch": 1.9877133105802047, "grad_norm": 0.669166818877388, "learning_rate": 9.117898419991333e-10, "loss": 0.2574, "step": 364 }, { "epoch": 1.993174061433447, "grad_norm": 0.6418127603308712, "learning_rate": 2.2795265674113721e-10, "loss": 0.2499, "step": 365 }, { "epoch": 1.9986348122866895, "grad_norm": 0.6075592770961785, "learning_rate": 0.0, "loss": 0.2179, "step": 366 }, { "epoch": 1.9986348122866895, "step": 366, "total_flos": 86796684656640.0, "train_loss": 0.2950779539965541, "train_runtime": 2999.324, "train_samples_per_second": 15.628, "train_steps_per_second": 0.122 } ], "logging_steps": 1, "max_steps": 366, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 86796684656640.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }