Qwen2.5-7B-o1-ja-v0.1 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9986348122866895,
"eval_steps": 500,
"global_step": 366,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005460750853242321,
"grad_norm": 3.971062381322324,
"learning_rate": 2.702702702702703e-07,
"loss": 0.5016,
"step": 1
},
{
"epoch": 0.010921501706484642,
"grad_norm": 4.14826323264813,
"learning_rate": 5.405405405405406e-07,
"loss": 0.5208,
"step": 2
},
{
"epoch": 0.016382252559726963,
"grad_norm": 3.745507693250097,
"learning_rate": 8.108108108108109e-07,
"loss": 0.4379,
"step": 3
},
{
"epoch": 0.021843003412969283,
"grad_norm": 3.8553920899968372,
"learning_rate": 1.0810810810810812e-06,
"loss": 0.4602,
"step": 4
},
{
"epoch": 0.027303754266211604,
"grad_norm": 4.006658163193665,
"learning_rate": 1.3513513513513515e-06,
"loss": 0.4578,
"step": 5
},
{
"epoch": 0.032764505119453925,
"grad_norm": 3.0693234037079433,
"learning_rate": 1.6216216216216219e-06,
"loss": 0.4416,
"step": 6
},
{
"epoch": 0.03822525597269624,
"grad_norm": 2.3221220798320386,
"learning_rate": 1.8918918918918922e-06,
"loss": 0.4635,
"step": 7
},
{
"epoch": 0.04368600682593857,
"grad_norm": 1.9622941289180669,
"learning_rate": 2.1621621621621623e-06,
"loss": 0.4471,
"step": 8
},
{
"epoch": 0.049146757679180884,
"grad_norm": 1.5785046223376498,
"learning_rate": 2.432432432432433e-06,
"loss": 0.3669,
"step": 9
},
{
"epoch": 0.05460750853242321,
"grad_norm": 1.592167130140165,
"learning_rate": 2.702702702702703e-06,
"loss": 0.4232,
"step": 10
},
{
"epoch": 0.060068259385665526,
"grad_norm": 1.4756853657479083,
"learning_rate": 2.9729729729729736e-06,
"loss": 0.4105,
"step": 11
},
{
"epoch": 0.06552901023890785,
"grad_norm": 1.327708040902143,
"learning_rate": 3.2432432432432437e-06,
"loss": 0.386,
"step": 12
},
{
"epoch": 0.07098976109215017,
"grad_norm": 1.5458703208762807,
"learning_rate": 3.513513513513514e-06,
"loss": 0.405,
"step": 13
},
{
"epoch": 0.07645051194539249,
"grad_norm": 1.6200361209703527,
"learning_rate": 3.7837837837837844e-06,
"loss": 0.3908,
"step": 14
},
{
"epoch": 0.08191126279863481,
"grad_norm": 1.5715327605819764,
"learning_rate": 4.0540540540540545e-06,
"loss": 0.3972,
"step": 15
},
{
"epoch": 0.08737201365187713,
"grad_norm": 1.2301811554389595,
"learning_rate": 4.324324324324325e-06,
"loss": 0.3739,
"step": 16
},
{
"epoch": 0.09283276450511946,
"grad_norm": 1.0413606131616007,
"learning_rate": 4.594594594594596e-06,
"loss": 0.3521,
"step": 17
},
{
"epoch": 0.09829351535836177,
"grad_norm": 1.1239884207253636,
"learning_rate": 4.864864864864866e-06,
"loss": 0.4348,
"step": 18
},
{
"epoch": 0.1037542662116041,
"grad_norm": 1.1123432515923368,
"learning_rate": 5.135135135135135e-06,
"loss": 0.3949,
"step": 19
},
{
"epoch": 0.10921501706484642,
"grad_norm": 1.0996938196641266,
"learning_rate": 5.405405405405406e-06,
"loss": 0.3775,
"step": 20
},
{
"epoch": 0.11467576791808874,
"grad_norm": 1.0868866085505373,
"learning_rate": 5.675675675675676e-06,
"loss": 0.3721,
"step": 21
},
{
"epoch": 0.12013651877133105,
"grad_norm": 1.10559810934531,
"learning_rate": 5.945945945945947e-06,
"loss": 0.3884,
"step": 22
},
{
"epoch": 0.12559726962457338,
"grad_norm": 1.0187684787484814,
"learning_rate": 6.2162162162162164e-06,
"loss": 0.394,
"step": 23
},
{
"epoch": 0.1310580204778157,
"grad_norm": 0.9515401547070604,
"learning_rate": 6.486486486486487e-06,
"loss": 0.3664,
"step": 24
},
{
"epoch": 0.13651877133105803,
"grad_norm": 0.9873275768348283,
"learning_rate": 6.7567567567567575e-06,
"loss": 0.3872,
"step": 25
},
{
"epoch": 0.14197952218430035,
"grad_norm": 0.9420302821261468,
"learning_rate": 7.027027027027028e-06,
"loss": 0.3926,
"step": 26
},
{
"epoch": 0.14744027303754267,
"grad_norm": 0.8628951646680264,
"learning_rate": 7.297297297297298e-06,
"loss": 0.3395,
"step": 27
},
{
"epoch": 0.15290102389078497,
"grad_norm": 0.8883050014456254,
"learning_rate": 7.567567567567569e-06,
"loss": 0.3692,
"step": 28
},
{
"epoch": 0.1583617747440273,
"grad_norm": 0.9314104245334247,
"learning_rate": 7.837837837837838e-06,
"loss": 0.3562,
"step": 29
},
{
"epoch": 0.16382252559726962,
"grad_norm": 0.8388999546883599,
"learning_rate": 8.108108108108109e-06,
"loss": 0.3291,
"step": 30
},
{
"epoch": 0.16928327645051194,
"grad_norm": 0.9110394289660935,
"learning_rate": 8.378378378378378e-06,
"loss": 0.3761,
"step": 31
},
{
"epoch": 0.17474402730375427,
"grad_norm": 0.8529619793059433,
"learning_rate": 8.64864864864865e-06,
"loss": 0.3634,
"step": 32
},
{
"epoch": 0.1802047781569966,
"grad_norm": 1.034615680095172,
"learning_rate": 8.91891891891892e-06,
"loss": 0.4073,
"step": 33
},
{
"epoch": 0.18566552901023892,
"grad_norm": 0.9654399340446536,
"learning_rate": 9.189189189189191e-06,
"loss": 0.3832,
"step": 34
},
{
"epoch": 0.19112627986348124,
"grad_norm": 0.8266008406999349,
"learning_rate": 9.45945945945946e-06,
"loss": 0.366,
"step": 35
},
{
"epoch": 0.19658703071672354,
"grad_norm": 1.0298041047732736,
"learning_rate": 9.729729729729732e-06,
"loss": 0.3828,
"step": 36
},
{
"epoch": 0.20204778156996586,
"grad_norm": 1.0253984164765952,
"learning_rate": 1e-05,
"loss": 0.4253,
"step": 37
},
{
"epoch": 0.2075085324232082,
"grad_norm": 0.845565254392157,
"learning_rate": 9.999772047343259e-06,
"loss": 0.3426,
"step": 38
},
{
"epoch": 0.2129692832764505,
"grad_norm": 0.9194984294487474,
"learning_rate": 9.999088210158001e-06,
"loss": 0.343,
"step": 39
},
{
"epoch": 0.21843003412969283,
"grad_norm": 0.8148322661922577,
"learning_rate": 9.997948550797227e-06,
"loss": 0.325,
"step": 40
},
{
"epoch": 0.22389078498293516,
"grad_norm": 0.9623977105015672,
"learning_rate": 9.99635317317629e-06,
"loss": 0.385,
"step": 41
},
{
"epoch": 0.22935153583617748,
"grad_norm": 0.814300333357098,
"learning_rate": 9.994302222763415e-06,
"loss": 0.3462,
"step": 42
},
{
"epoch": 0.2348122866894198,
"grad_norm": 0.8725030955526336,
"learning_rate": 9.991795886566443e-06,
"loss": 0.3401,
"step": 43
},
{
"epoch": 0.2402730375426621,
"grad_norm": 0.9974480242764955,
"learning_rate": 9.988834393115768e-06,
"loss": 0.3424,
"step": 44
},
{
"epoch": 0.24573378839590443,
"grad_norm": 0.8805730275929089,
"learning_rate": 9.98541801244351e-06,
"loss": 0.3742,
"step": 45
},
{
"epoch": 0.25119453924914675,
"grad_norm": 0.8332420506302001,
"learning_rate": 9.981547056058893e-06,
"loss": 0.3435,
"step": 46
},
{
"epoch": 0.2566552901023891,
"grad_norm": 0.9445729244701234,
"learning_rate": 9.977221876919833e-06,
"loss": 0.3442,
"step": 47
},
{
"epoch": 0.2621160409556314,
"grad_norm": 0.859922027597315,
"learning_rate": 9.97244286940076e-06,
"loss": 0.358,
"step": 48
},
{
"epoch": 0.2675767918088737,
"grad_norm": 0.8022442917536148,
"learning_rate": 9.967210469256657e-06,
"loss": 0.3329,
"step": 49
},
{
"epoch": 0.27303754266211605,
"grad_norm": 0.8369993999252197,
"learning_rate": 9.961525153583327e-06,
"loss": 0.3474,
"step": 50
},
{
"epoch": 0.2784982935153584,
"grad_norm": 0.8719055464818419,
"learning_rate": 9.955387440773902e-06,
"loss": 0.3364,
"step": 51
},
{
"epoch": 0.2839590443686007,
"grad_norm": 0.9269845480680101,
"learning_rate": 9.948797890471552e-06,
"loss": 0.3684,
"step": 52
},
{
"epoch": 0.289419795221843,
"grad_norm": 0.8246571303849338,
"learning_rate": 9.94175710351848e-06,
"loss": 0.3564,
"step": 53
},
{
"epoch": 0.29488054607508535,
"grad_norm": 0.9162125135698432,
"learning_rate": 9.93426572190112e-06,
"loss": 0.3526,
"step": 54
},
{
"epoch": 0.3003412969283277,
"grad_norm": 0.9689985766932336,
"learning_rate": 9.926324428691612e-06,
"loss": 0.3825,
"step": 55
},
{
"epoch": 0.30580204778156994,
"grad_norm": 0.9203465649703365,
"learning_rate": 9.917933947985508e-06,
"loss": 0.3492,
"step": 56
},
{
"epoch": 0.31126279863481227,
"grad_norm": 0.810691112658576,
"learning_rate": 9.909095044835755e-06,
"loss": 0.3147,
"step": 57
},
{
"epoch": 0.3167235494880546,
"grad_norm": 0.8980168883992854,
"learning_rate": 9.899808525182935e-06,
"loss": 0.3351,
"step": 58
},
{
"epoch": 0.3221843003412969,
"grad_norm": 0.8843165617295874,
"learning_rate": 9.89007523578178e-06,
"loss": 0.3452,
"step": 59
},
{
"epoch": 0.32764505119453924,
"grad_norm": 0.8660715276442186,
"learning_rate": 9.879896064123961e-06,
"loss": 0.3601,
"step": 60
},
{
"epoch": 0.33310580204778156,
"grad_norm": 0.8638898824914902,
"learning_rate": 9.869271938357168e-06,
"loss": 0.3565,
"step": 61
},
{
"epoch": 0.3385665529010239,
"grad_norm": 0.8349466672789928,
"learning_rate": 9.858203827200477e-06,
"loss": 0.3592,
"step": 62
},
{
"epoch": 0.3440273037542662,
"grad_norm": 0.9346433422616252,
"learning_rate": 9.846692739856023e-06,
"loss": 0.3935,
"step": 63
},
{
"epoch": 0.34948805460750854,
"grad_norm": 0.8287634034991234,
"learning_rate": 9.834739725916988e-06,
"loss": 0.3089,
"step": 64
},
{
"epoch": 0.35494880546075086,
"grad_norm": 0.8040217244181859,
"learning_rate": 9.822345875271884e-06,
"loss": 0.313,
"step": 65
},
{
"epoch": 0.3604095563139932,
"grad_norm": 0.8053513263090958,
"learning_rate": 9.80951231800518e-06,
"loss": 0.3355,
"step": 66
},
{
"epoch": 0.3658703071672355,
"grad_norm": 0.7533298814162714,
"learning_rate": 9.79624022429427e-06,
"loss": 0.3067,
"step": 67
},
{
"epoch": 0.37133105802047783,
"grad_norm": 0.9386782501271983,
"learning_rate": 9.782530804302763e-06,
"loss": 0.3593,
"step": 68
},
{
"epoch": 0.37679180887372016,
"grad_norm": 0.8507056335702303,
"learning_rate": 9.768385308070139e-06,
"loss": 0.3629,
"step": 69
},
{
"epoch": 0.3822525597269625,
"grad_norm": 0.782049564136347,
"learning_rate": 9.75380502539778e-06,
"loss": 0.3458,
"step": 70
},
{
"epoch": 0.38771331058020475,
"grad_norm": 0.9109652113851044,
"learning_rate": 9.738791285731353e-06,
"loss": 0.348,
"step": 71
},
{
"epoch": 0.3931740614334471,
"grad_norm": 0.8457379953081087,
"learning_rate": 9.723345458039595e-06,
"loss": 0.3701,
"step": 72
},
{
"epoch": 0.3986348122866894,
"grad_norm": 0.8354411100444213,
"learning_rate": 9.70746895068949e-06,
"loss": 0.3453,
"step": 73
},
{
"epoch": 0.4040955631399317,
"grad_norm": 0.7824544054631952,
"learning_rate": 9.691163211317853e-06,
"loss": 0.3393,
"step": 74
},
{
"epoch": 0.40955631399317405,
"grad_norm": 0.7752036290890001,
"learning_rate": 9.674429726699324e-06,
"loss": 0.3121,
"step": 75
},
{
"epoch": 0.4150170648464164,
"grad_norm": 0.9047037383020493,
"learning_rate": 9.657270022610814e-06,
"loss": 0.3507,
"step": 76
},
{
"epoch": 0.4204778156996587,
"grad_norm": 0.8453635648693023,
"learning_rate": 9.63968566369238e-06,
"loss": 0.3641,
"step": 77
},
{
"epoch": 0.425938566552901,
"grad_norm": 0.8290743120901927,
"learning_rate": 9.62167825330455e-06,
"loss": 0.3739,
"step": 78
},
{
"epoch": 0.43139931740614335,
"grad_norm": 0.8977215293449932,
"learning_rate": 9.603249433382145e-06,
"loss": 0.3185,
"step": 79
},
{
"epoch": 0.43686006825938567,
"grad_norm": 0.9078617748361664,
"learning_rate": 9.584400884284546e-06,
"loss": 0.3415,
"step": 80
},
{
"epoch": 0.442320819112628,
"grad_norm": 0.8589830385419883,
"learning_rate": 9.565134324642491e-06,
"loss": 0.3331,
"step": 81
},
{
"epoch": 0.4477815699658703,
"grad_norm": 0.804380018393787,
"learning_rate": 9.545451511201365e-06,
"loss": 0.322,
"step": 82
},
{
"epoch": 0.45324232081911264,
"grad_norm": 0.8685230840996425,
"learning_rate": 9.52535423866101e-06,
"loss": 0.3476,
"step": 83
},
{
"epoch": 0.45870307167235497,
"grad_norm": 0.9643956240091752,
"learning_rate": 9.504844339512096e-06,
"loss": 0.3671,
"step": 84
},
{
"epoch": 0.4641638225255973,
"grad_norm": 0.8997894029115073,
"learning_rate": 9.483923683869025e-06,
"loss": 0.352,
"step": 85
},
{
"epoch": 0.4696245733788396,
"grad_norm": 0.9409163478885427,
"learning_rate": 9.462594179299408e-06,
"loss": 0.3533,
"step": 86
},
{
"epoch": 0.4750853242320819,
"grad_norm": 1.0349789755076704,
"learning_rate": 9.440857770650139e-06,
"loss": 0.3501,
"step": 87
},
{
"epoch": 0.4805460750853242,
"grad_norm": 0.7719492270463393,
"learning_rate": 9.418716439870056e-06,
"loss": 0.3092,
"step": 88
},
{
"epoch": 0.48600682593856653,
"grad_norm": 0.9082886643398166,
"learning_rate": 9.396172205829235e-06,
"loss": 0.3514,
"step": 89
},
{
"epoch": 0.49146757679180886,
"grad_norm": 0.8358389654564478,
"learning_rate": 9.373227124134888e-06,
"loss": 0.3489,
"step": 90
},
{
"epoch": 0.4969283276450512,
"grad_norm": 0.8974753686960236,
"learning_rate": 9.349883286943951e-06,
"loss": 0.3632,
"step": 91
},
{
"epoch": 0.5023890784982935,
"grad_norm": 0.8827201059716774,
"learning_rate": 9.326142822772301e-06,
"loss": 0.3584,
"step": 92
},
{
"epoch": 0.5078498293515359,
"grad_norm": 0.813182991570662,
"learning_rate": 9.302007896300697e-06,
"loss": 0.3591,
"step": 93
},
{
"epoch": 0.5133105802047782,
"grad_norm": 0.7442842781039997,
"learning_rate": 9.27748070817738e-06,
"loss": 0.3143,
"step": 94
},
{
"epoch": 0.5187713310580204,
"grad_norm": 0.906866423901588,
"learning_rate": 9.252563494817426e-06,
"loss": 0.3772,
"step": 95
},
{
"epoch": 0.5242320819112628,
"grad_norm": 0.7894206448318375,
"learning_rate": 9.227258528198832e-06,
"loss": 0.3131,
"step": 96
},
{
"epoch": 0.5296928327645051,
"grad_norm": 0.8009536933279702,
"learning_rate": 9.201568115655343e-06,
"loss": 0.329,
"step": 97
},
{
"epoch": 0.5351535836177475,
"grad_norm": 0.8048929927509286,
"learning_rate": 9.175494599666078e-06,
"loss": 0.3278,
"step": 98
},
{
"epoch": 0.5406143344709897,
"grad_norm": 0.814222453793431,
"learning_rate": 9.14904035764193e-06,
"loss": 0.3225,
"step": 99
},
{
"epoch": 0.5460750853242321,
"grad_norm": 0.8732367458543802,
"learning_rate": 9.122207801708802e-06,
"loss": 0.3524,
"step": 100
},
{
"epoch": 0.5515358361774744,
"grad_norm": 0.8134905761183274,
"learning_rate": 9.094999378487659e-06,
"loss": 0.3546,
"step": 101
},
{
"epoch": 0.5569965870307167,
"grad_norm": 0.8567463415353727,
"learning_rate": 9.067417568871444e-06,
"loss": 0.3548,
"step": 102
},
{
"epoch": 0.562457337883959,
"grad_norm": 0.8151562254810784,
"learning_rate": 9.03946488779887e-06,
"loss": 0.3439,
"step": 103
},
{
"epoch": 0.5679180887372014,
"grad_norm": 0.8746438505757359,
"learning_rate": 9.0111438840251e-06,
"loss": 0.3242,
"step": 104
},
{
"epoch": 0.5733788395904437,
"grad_norm": 0.8085121266810896,
"learning_rate": 8.982457139889358e-06,
"loss": 0.3598,
"step": 105
},
{
"epoch": 0.578839590443686,
"grad_norm": 0.8191769217039168,
"learning_rate": 8.953407271079456e-06,
"loss": 0.3425,
"step": 106
},
{
"epoch": 0.5843003412969283,
"grad_norm": 0.874872463262842,
"learning_rate": 8.923996926393306e-06,
"loss": 0.3795,
"step": 107
},
{
"epoch": 0.5897610921501707,
"grad_norm": 0.8469769243731713,
"learning_rate": 8.894228787497389e-06,
"loss": 0.3555,
"step": 108
},
{
"epoch": 0.595221843003413,
"grad_norm": 0.7907533312057188,
"learning_rate": 8.864105568682245e-06,
"loss": 0.3425,
"step": 109
},
{
"epoch": 0.6006825938566553,
"grad_norm": 0.9105392675920642,
"learning_rate": 8.833630016614976e-06,
"loss": 0.3214,
"step": 110
},
{
"epoch": 0.6061433447098976,
"grad_norm": 0.7743632593675985,
"learning_rate": 8.80280491008881e-06,
"loss": 0.3477,
"step": 111
},
{
"epoch": 0.6116040955631399,
"grad_norm": 0.9007334854740756,
"learning_rate": 8.771633059769712e-06,
"loss": 0.3836,
"step": 112
},
{
"epoch": 0.6170648464163823,
"grad_norm": 0.810760704066922,
"learning_rate": 8.740117307940123e-06,
"loss": 0.3397,
"step": 113
},
{
"epoch": 0.6225255972696245,
"grad_norm": 0.9072711750424595,
"learning_rate": 8.708260528239788e-06,
"loss": 0.3389,
"step": 114
},
{
"epoch": 0.6279863481228669,
"grad_norm": 0.8621760047744049,
"learning_rate": 8.676065625403733e-06,
"loss": 0.3788,
"step": 115
},
{
"epoch": 0.6334470989761092,
"grad_norm": 0.8388020553913726,
"learning_rate": 8.64353553499741e-06,
"loss": 0.3274,
"step": 116
},
{
"epoch": 0.6389078498293516,
"grad_norm": 0.8591445453801212,
"learning_rate": 8.610673223149036e-06,
"loss": 0.3598,
"step": 117
},
{
"epoch": 0.6443686006825938,
"grad_norm": 0.8057297251815362,
"learning_rate": 8.577481686279123e-06,
"loss": 0.3522,
"step": 118
},
{
"epoch": 0.6498293515358362,
"grad_norm": 0.779914515334107,
"learning_rate": 8.543963950827279e-06,
"loss": 0.3416,
"step": 119
},
{
"epoch": 0.6552901023890785,
"grad_norm": 0.8241347234199242,
"learning_rate": 8.51012307297624e-06,
"loss": 0.341,
"step": 120
},
{
"epoch": 0.6607508532423209,
"grad_norm": 0.7674873201219691,
"learning_rate": 8.475962138373212e-06,
"loss": 0.3268,
"step": 121
},
{
"epoch": 0.6662116040955631,
"grad_norm": 0.7983877124268901,
"learning_rate": 8.441484261848514e-06,
"loss": 0.3744,
"step": 122
},
{
"epoch": 0.6716723549488055,
"grad_norm": 0.924444516956386,
"learning_rate": 8.406692587131569e-06,
"loss": 0.341,
"step": 123
},
{
"epoch": 0.6771331058020478,
"grad_norm": 0.7648013324474998,
"learning_rate": 8.371590286564247e-06,
"loss": 0.3239,
"step": 124
},
{
"epoch": 0.6825938566552902,
"grad_norm": 0.8275858574184091,
"learning_rate": 8.336180560811619e-06,
"loss": 0.3588,
"step": 125
},
{
"epoch": 0.6880546075085324,
"grad_norm": 0.7874924257335151,
"learning_rate": 8.30046663857011e-06,
"loss": 0.3431,
"step": 126
},
{
"epoch": 0.6935153583617747,
"grad_norm": 0.8745641415217126,
"learning_rate": 8.264451776273104e-06,
"loss": 0.3489,
"step": 127
},
{
"epoch": 0.6989761092150171,
"grad_norm": 0.8858812955767805,
"learning_rate": 8.228139257794012e-06,
"loss": 0.3595,
"step": 128
},
{
"epoch": 0.7044368600682593,
"grad_norm": 0.812177330348684,
"learning_rate": 8.191532394146865e-06,
"loss": 0.328,
"step": 129
},
{
"epoch": 0.7098976109215017,
"grad_norm": 0.7755088132854933,
"learning_rate": 8.154634523184389e-06,
"loss": 0.3392,
"step": 130
},
{
"epoch": 0.715358361774744,
"grad_norm": 0.8715295143660003,
"learning_rate": 8.117449009293668e-06,
"loss": 0.3482,
"step": 131
},
{
"epoch": 0.7208191126279864,
"grad_norm": 0.7737148258150855,
"learning_rate": 8.07997924308938e-06,
"loss": 0.3258,
"step": 132
},
{
"epoch": 0.7262798634812286,
"grad_norm": 0.7616464397737633,
"learning_rate": 8.042228641104622e-06,
"loss": 0.3164,
"step": 133
},
{
"epoch": 0.731740614334471,
"grad_norm": 0.7706843428925245,
"learning_rate": 8.004200645479403e-06,
"loss": 0.3267,
"step": 134
},
{
"epoch": 0.7372013651877133,
"grad_norm": 0.807087784184599,
"learning_rate": 7.965898723646777e-06,
"loss": 0.3556,
"step": 135
},
{
"epoch": 0.7426621160409557,
"grad_norm": 0.8571787444948499,
"learning_rate": 7.927326368016677e-06,
"loss": 0.349,
"step": 136
},
{
"epoch": 0.7481228668941979,
"grad_norm": 0.758611440956407,
"learning_rate": 7.888487095657484e-06,
"loss": 0.3301,
"step": 137
},
{
"epoch": 0.7535836177474403,
"grad_norm": 0.7924167608304931,
"learning_rate": 7.849384447975322e-06,
"loss": 0.3534,
"step": 138
},
{
"epoch": 0.7590443686006826,
"grad_norm": 0.8750460208633537,
"learning_rate": 7.810021990391163e-06,
"loss": 0.3405,
"step": 139
},
{
"epoch": 0.764505119453925,
"grad_norm": 0.7895037781717571,
"learning_rate": 7.77040331201572e-06,
"loss": 0.3678,
"step": 140
},
{
"epoch": 0.7699658703071672,
"grad_norm": 0.8223142510608592,
"learning_rate": 7.73053202532219e-06,
"loss": 0.3469,
"step": 141
},
{
"epoch": 0.7754266211604095,
"grad_norm": 0.8172036706667312,
"learning_rate": 7.690411765816864e-06,
"loss": 0.3395,
"step": 142
},
{
"epoch": 0.7808873720136519,
"grad_norm": 0.7717521340469524,
"learning_rate": 7.650046191707641e-06,
"loss": 0.3352,
"step": 143
},
{
"epoch": 0.7863481228668942,
"grad_norm": 0.8745843729938327,
"learning_rate": 7.609438983570461e-06,
"loss": 0.34,
"step": 144
},
{
"epoch": 0.7918088737201365,
"grad_norm": 0.8462879664073518,
"learning_rate": 7.5685938440137185e-06,
"loss": 0.3434,
"step": 145
},
{
"epoch": 0.7972696245733788,
"grad_norm": 0.8887194246240154,
"learning_rate": 7.527514497340642e-06,
"loss": 0.3536,
"step": 146
},
{
"epoch": 0.8027303754266212,
"grad_norm": 0.720734982965855,
"learning_rate": 7.486204689209719e-06,
"loss": 0.3071,
"step": 147
},
{
"epoch": 0.8081911262798634,
"grad_norm": 0.7852180915891143,
"learning_rate": 7.444668186293153e-06,
"loss": 0.3318,
"step": 148
},
{
"epoch": 0.8136518771331058,
"grad_norm": 0.8169236397844766,
"learning_rate": 7.402908775933419e-06,
"loss": 0.3282,
"step": 149
},
{
"epoch": 0.8191126279863481,
"grad_norm": 0.8409146266167947,
"learning_rate": 7.360930265797934e-06,
"loss": 0.3592,
"step": 150
},
{
"epoch": 0.8245733788395905,
"grad_norm": 0.7893736430445095,
"learning_rate": 7.318736483531861e-06,
"loss": 0.3455,
"step": 151
},
{
"epoch": 0.8300341296928327,
"grad_norm": 0.7092487578490618,
"learning_rate": 7.2763312764091055e-06,
"loss": 0.307,
"step": 152
},
{
"epoch": 0.8354948805460751,
"grad_norm": 0.7643841671055314,
"learning_rate": 7.23371851098152e-06,
"loss": 0.3104,
"step": 153
},
{
"epoch": 0.8409556313993174,
"grad_norm": 0.8743703462981528,
"learning_rate": 7.190902072726336e-06,
"loss": 0.3601,
"step": 154
},
{
"epoch": 0.8464163822525598,
"grad_norm": 0.8748161240027253,
"learning_rate": 7.147885865691899e-06,
"loss": 0.3592,
"step": 155
},
{
"epoch": 0.851877133105802,
"grad_norm": 0.6528952892311825,
"learning_rate": 7.104673812141676e-06,
"loss": 0.2919,
"step": 156
},
{
"epoch": 0.8573378839590444,
"grad_norm": 0.8161745547126792,
"learning_rate": 7.061269852196633e-06,
"loss": 0.345,
"step": 157
},
{
"epoch": 0.8627986348122867,
"grad_norm": 0.8321903783865391,
"learning_rate": 7.017677943475962e-06,
"loss": 0.321,
"step": 158
},
{
"epoch": 0.868259385665529,
"grad_norm": 0.83313681444351,
"learning_rate": 6.973902060736226e-06,
"loss": 0.3435,
"step": 159
},
{
"epoch": 0.8737201365187713,
"grad_norm": 0.7505151585539925,
"learning_rate": 6.929946195508933e-06,
"loss": 0.3163,
"step": 160
},
{
"epoch": 0.8791808873720136,
"grad_norm": 0.7304802524364322,
"learning_rate": 6.8858143557365865e-06,
"loss": 0.328,
"step": 161
},
{
"epoch": 0.884641638225256,
"grad_norm": 0.8143420713928606,
"learning_rate": 6.841510565407235e-06,
"loss": 0.3341,
"step": 162
},
{
"epoch": 0.8901023890784983,
"grad_norm": 0.7567204344075086,
"learning_rate": 6.797038864187564e-06,
"loss": 0.3059,
"step": 163
},
{
"epoch": 0.8955631399317406,
"grad_norm": 0.7826567782778101,
"learning_rate": 6.752403307054549e-06,
"loss": 0.3283,
"step": 164
},
{
"epoch": 0.9010238907849829,
"grad_norm": 0.7886900433942758,
"learning_rate": 6.707607963925725e-06,
"loss": 0.3592,
"step": 165
},
{
"epoch": 0.9064846416382253,
"grad_norm": 0.820709571232716,
"learning_rate": 6.66265691928808e-06,
"loss": 0.3605,
"step": 166
},
{
"epoch": 0.9119453924914676,
"grad_norm": 0.7681789982648866,
"learning_rate": 6.617554271825636e-06,
"loss": 0.3051,
"step": 167
},
{
"epoch": 0.9174061433447099,
"grad_norm": 0.8006459558215293,
"learning_rate": 6.5723041340457175e-06,
"loss": 0.3542,
"step": 168
},
{
"epoch": 0.9228668941979522,
"grad_norm": 0.7333102829214887,
"learning_rate": 6.526910631903973e-06,
"loss": 0.3254,
"step": 169
},
{
"epoch": 0.9283276450511946,
"grad_norm": 0.7766899671870917,
"learning_rate": 6.481377904428171e-06,
"loss": 0.3297,
"step": 170
},
{
"epoch": 0.9337883959044369,
"grad_norm": 0.8887532080533157,
"learning_rate": 6.435710103340787e-06,
"loss": 0.3531,
"step": 171
},
{
"epoch": 0.9392491467576792,
"grad_norm": 0.7606421967689092,
"learning_rate": 6.3899113926804565e-06,
"loss": 0.3279,
"step": 172
},
{
"epoch": 0.9447098976109215,
"grad_norm": 0.7894203946388427,
"learning_rate": 6.3439859484222874e-06,
"loss": 0.3206,
"step": 173
},
{
"epoch": 0.9501706484641638,
"grad_norm": 0.8106143896629081,
"learning_rate": 6.297937958097094e-06,
"loss": 0.3185,
"step": 174
},
{
"epoch": 0.9556313993174061,
"grad_norm": 0.7673331407317434,
"learning_rate": 6.251771620409563e-06,
"loss": 0.3408,
"step": 175
},
{
"epoch": 0.9610921501706484,
"grad_norm": 0.7678720102410665,
"learning_rate": 6.205491144855432e-06,
"loss": 0.3388,
"step": 176
},
{
"epoch": 0.9665529010238908,
"grad_norm": 0.8058357314804626,
"learning_rate": 6.1591007513376425e-06,
"loss": 0.348,
"step": 177
},
{
"epoch": 0.9720136518771331,
"grad_norm": 0.7150290944167804,
"learning_rate": 6.112604669781572e-06,
"loss": 0.3187,
"step": 178
},
{
"epoch": 0.9774744027303754,
"grad_norm": 0.7724885943742522,
"learning_rate": 6.066007139749351e-06,
"loss": 0.3112,
"step": 179
},
{
"epoch": 0.9829351535836177,
"grad_norm": 0.7079636144073458,
"learning_rate": 6.019312410053286e-06,
"loss": 0.3115,
"step": 180
},
{
"epoch": 0.9883959044368601,
"grad_norm": 0.7145124027416198,
"learning_rate": 5.972524738368452e-06,
"loss": 0.3015,
"step": 181
},
{
"epoch": 0.9938566552901024,
"grad_norm": 0.7747190577463166,
"learning_rate": 5.925648390844476e-06,
"loss": 0.3405,
"step": 182
},
{
"epoch": 0.9993174061433447,
"grad_norm": 0.7411495697672651,
"learning_rate": 5.878687641716539e-06,
"loss": 0.3241,
"step": 183
},
{
"epoch": 1.004778156996587,
"grad_norm": 2.2845026949908367,
"learning_rate": 5.831646772915651e-06,
"loss": 0.5887,
"step": 184
},
{
"epoch": 1.0102389078498293,
"grad_norm": 0.767067987662548,
"learning_rate": 5.7845300736782205e-06,
"loss": 0.2696,
"step": 185
},
{
"epoch": 1.0156996587030718,
"grad_norm": 0.6444558204539116,
"learning_rate": 5.7373418401549565e-06,
"loss": 0.2179,
"step": 186
},
{
"epoch": 1.021160409556314,
"grad_norm": 0.6326875020427418,
"learning_rate": 5.690086375019135e-06,
"loss": 0.2063,
"step": 187
},
{
"epoch": 1.0266211604095563,
"grad_norm": 0.6386737207560813,
"learning_rate": 5.642767987074288e-06,
"loss": 0.2395,
"step": 188
},
{
"epoch": 1.0320819112627986,
"grad_norm": 0.7191360204333792,
"learning_rate": 5.595390990861311e-06,
"loss": 0.2593,
"step": 189
},
{
"epoch": 1.0375426621160408,
"grad_norm": 0.7139153218951271,
"learning_rate": 5.547959706265068e-06,
"loss": 0.25,
"step": 190
},
{
"epoch": 1.0430034129692833,
"grad_norm": 0.7132174285440807,
"learning_rate": 5.500478458120493e-06,
"loss": 0.2656,
"step": 191
},
{
"epoch": 1.0484641638225256,
"grad_norm": 0.7373698973683902,
"learning_rate": 5.45295157581825e-06,
"loss": 0.2643,
"step": 192
},
{
"epoch": 1.0539249146757679,
"grad_norm": 0.6806301830473086,
"learning_rate": 5.405383392909973e-06,
"loss": 0.2521,
"step": 193
},
{
"epoch": 1.0593856655290101,
"grad_norm": 0.6785558579468979,
"learning_rate": 5.357778246713131e-06,
"loss": 0.254,
"step": 194
},
{
"epoch": 1.0648464163822526,
"grad_norm": 0.6701400051635917,
"learning_rate": 5.310140477915544e-06,
"loss": 0.2303,
"step": 195
},
{
"epoch": 1.070307167235495,
"grad_norm": 0.7408218245910705,
"learning_rate": 5.262474430179597e-06,
"loss": 0.2587,
"step": 196
},
{
"epoch": 1.0757679180887372,
"grad_norm": 0.652029496994987,
"learning_rate": 5.2147844497461745e-06,
"loss": 0.2201,
"step": 197
},
{
"epoch": 1.0812286689419794,
"grad_norm": 0.6145756306559864,
"learning_rate": 5.1670748850383734e-06,
"loss": 0.2131,
"step": 198
},
{
"epoch": 1.086689419795222,
"grad_norm": 0.6166424191446999,
"learning_rate": 5.1193500862650045e-06,
"loss": 0.2272,
"step": 199
},
{
"epoch": 1.0921501706484642,
"grad_norm": 0.6183167944122974,
"learning_rate": 5.071614405023938e-06,
"loss": 0.2239,
"step": 200
},
{
"epoch": 1.0976109215017065,
"grad_norm": 0.7073017446193981,
"learning_rate": 5.023872193905316e-06,
"loss": 0.2564,
"step": 201
},
{
"epoch": 1.1030716723549487,
"grad_norm": 0.6785368103283103,
"learning_rate": 4.976127806094685e-06,
"loss": 0.2598,
"step": 202
},
{
"epoch": 1.108532423208191,
"grad_norm": 0.6686099383276679,
"learning_rate": 4.928385594976063e-06,
"loss": 0.2391,
"step": 203
},
{
"epoch": 1.1139931740614335,
"grad_norm": 0.6046536649329635,
"learning_rate": 4.880649913734996e-06,
"loss": 0.2111,
"step": 204
},
{
"epoch": 1.1194539249146758,
"grad_norm": 0.6455972829075776,
"learning_rate": 4.832925114961629e-06,
"loss": 0.2291,
"step": 205
},
{
"epoch": 1.124914675767918,
"grad_norm": 0.6294601922178525,
"learning_rate": 4.785215550253826e-06,
"loss": 0.2237,
"step": 206
},
{
"epoch": 1.1303754266211605,
"grad_norm": 0.6539972986726327,
"learning_rate": 4.737525569820405e-06,
"loss": 0.2415,
"step": 207
},
{
"epoch": 1.1358361774744028,
"grad_norm": 0.6841776523547041,
"learning_rate": 4.689859522084457e-06,
"loss": 0.2573,
"step": 208
},
{
"epoch": 1.141296928327645,
"grad_norm": 0.708275852733329,
"learning_rate": 4.64222175328687e-06,
"loss": 0.2535,
"step": 209
},
{
"epoch": 1.1467576791808873,
"grad_norm": 0.6327858698732379,
"learning_rate": 4.594616607090028e-06,
"loss": 0.2284,
"step": 210
},
{
"epoch": 1.1522184300341296,
"grad_norm": 0.6408257648532151,
"learning_rate": 4.547048424181751e-06,
"loss": 0.2294,
"step": 211
},
{
"epoch": 1.157679180887372,
"grad_norm": 0.6159123552870842,
"learning_rate": 4.499521541879508e-06,
"loss": 0.2226,
"step": 212
},
{
"epoch": 1.1631399317406144,
"grad_norm": 0.5823300781202381,
"learning_rate": 4.452040293734934e-06,
"loss": 0.2108,
"step": 213
},
{
"epoch": 1.1686006825938566,
"grad_norm": 0.6041391928150867,
"learning_rate": 4.40460900913869e-06,
"loss": 0.2224,
"step": 214
},
{
"epoch": 1.174061433447099,
"grad_norm": 0.6641306892375002,
"learning_rate": 4.357232012925714e-06,
"loss": 0.2384,
"step": 215
},
{
"epoch": 1.1795221843003414,
"grad_norm": 0.6503207204016519,
"learning_rate": 4.309913624980866e-06,
"loss": 0.2347,
"step": 216
},
{
"epoch": 1.1849829351535837,
"grad_norm": 0.62805580635999,
"learning_rate": 4.262658159845046e-06,
"loss": 0.229,
"step": 217
},
{
"epoch": 1.190443686006826,
"grad_norm": 0.6275617918722145,
"learning_rate": 4.2154699263217794e-06,
"loss": 0.2286,
"step": 218
},
{
"epoch": 1.1959044368600682,
"grad_norm": 0.7617460871871701,
"learning_rate": 4.1683532270843505e-06,
"loss": 0.2574,
"step": 219
},
{
"epoch": 1.2013651877133107,
"grad_norm": 0.6140110702778818,
"learning_rate": 4.121312358283464e-06,
"loss": 0.2149,
"step": 220
},
{
"epoch": 1.206825938566553,
"grad_norm": 0.6247259244040378,
"learning_rate": 4.074351609155527e-06,
"loss": 0.2381,
"step": 221
},
{
"epoch": 1.2122866894197952,
"grad_norm": 0.6591171660703798,
"learning_rate": 4.0274752616315485e-06,
"loss": 0.2344,
"step": 222
},
{
"epoch": 1.2177474402730375,
"grad_norm": 0.6436293629709356,
"learning_rate": 3.980687589946715e-06,
"loss": 0.2319,
"step": 223
},
{
"epoch": 1.2232081911262798,
"grad_norm": 0.7670288305294722,
"learning_rate": 3.9339928602506505e-06,
"loss": 0.2497,
"step": 224
},
{
"epoch": 1.2286689419795223,
"grad_norm": 0.6962942427446093,
"learning_rate": 3.887395330218429e-06,
"loss": 0.2336,
"step": 225
},
{
"epoch": 1.2341296928327645,
"grad_norm": 0.6965676226191562,
"learning_rate": 3.840899248662358e-06,
"loss": 0.2552,
"step": 226
},
{
"epoch": 1.2395904436860068,
"grad_norm": 0.6641042493963545,
"learning_rate": 3.7945088551445698e-06,
"loss": 0.2563,
"step": 227
},
{
"epoch": 1.245051194539249,
"grad_norm": 0.6196955536234605,
"learning_rate": 3.748228379590438e-06,
"loss": 0.2291,
"step": 228
},
{
"epoch": 1.2505119453924913,
"grad_norm": 0.6278753799486634,
"learning_rate": 3.7020620419029095e-06,
"loss": 0.2141,
"step": 229
},
{
"epoch": 1.2559726962457338,
"grad_norm": 0.6082644009007588,
"learning_rate": 3.656014051577713e-06,
"loss": 0.2122,
"step": 230
},
{
"epoch": 1.261433447098976,
"grad_norm": 0.6736359984841271,
"learning_rate": 3.610088607319544e-06,
"loss": 0.2367,
"step": 231
},
{
"epoch": 1.2668941979522184,
"grad_norm": 0.6329301932656438,
"learning_rate": 3.5642898966592145e-06,
"loss": 0.235,
"step": 232
},
{
"epoch": 1.2723549488054609,
"grad_norm": 0.619963960271499,
"learning_rate": 3.518622095571831e-06,
"loss": 0.2208,
"step": 233
},
{
"epoch": 1.2778156996587031,
"grad_norm": 0.6693990598652739,
"learning_rate": 3.4730893680960267e-06,
"loss": 0.2406,
"step": 234
},
{
"epoch": 1.2832764505119454,
"grad_norm": 0.6864592317171182,
"learning_rate": 3.4276958659542838e-06,
"loss": 0.243,
"step": 235
},
{
"epoch": 1.2887372013651877,
"grad_norm": 0.7236681291816511,
"learning_rate": 3.382445728174365e-06,
"loss": 0.2586,
"step": 236
},
{
"epoch": 1.29419795221843,
"grad_norm": 0.6176752667693888,
"learning_rate": 3.3373430807119212e-06,
"loss": 0.2251,
"step": 237
},
{
"epoch": 1.2996587030716724,
"grad_norm": 0.7022262485772638,
"learning_rate": 3.292392036074277e-06,
"loss": 0.2316,
"step": 238
},
{
"epoch": 1.3051194539249147,
"grad_norm": 0.6404240889042992,
"learning_rate": 3.2475966929454505e-06,
"loss": 0.2384,
"step": 239
},
{
"epoch": 1.310580204778157,
"grad_norm": 0.7080127357360425,
"learning_rate": 3.202961135812437e-06,
"loss": 0.248,
"step": 240
},
{
"epoch": 1.3160409556313994,
"grad_norm": 0.668638034601711,
"learning_rate": 3.1584894345927663e-06,
"loss": 0.2212,
"step": 241
},
{
"epoch": 1.3215017064846417,
"grad_norm": 0.6729621818012642,
"learning_rate": 3.114185644263415e-06,
"loss": 0.222,
"step": 242
},
{
"epoch": 1.326962457337884,
"grad_norm": 0.6723316181938683,
"learning_rate": 3.0700538044910684e-06,
"loss": 0.2246,
"step": 243
},
{
"epoch": 1.3324232081911263,
"grad_norm": 0.6671061425745013,
"learning_rate": 3.0260979392637753e-06,
"loss": 0.2518,
"step": 244
},
{
"epoch": 1.3378839590443685,
"grad_norm": 0.5962051816320753,
"learning_rate": 2.9823220565240396e-06,
"loss": 0.2224,
"step": 245
},
{
"epoch": 1.343344709897611,
"grad_norm": 0.6511243444073086,
"learning_rate": 2.9387301478033694e-06,
"loss": 0.2521,
"step": 246
},
{
"epoch": 1.3488054607508533,
"grad_norm": 0.6112346949791394,
"learning_rate": 2.8953261878583263e-06,
"loss": 0.2164,
"step": 247
},
{
"epoch": 1.3542662116040955,
"grad_norm": 0.6844064518509092,
"learning_rate": 2.852114134308104e-06,
"loss": 0.2532,
"step": 248
},
{
"epoch": 1.3597269624573378,
"grad_norm": 0.6383549765315465,
"learning_rate": 2.8090979272736663e-06,
"loss": 0.2401,
"step": 249
},
{
"epoch": 1.36518771331058,
"grad_norm": 0.5944587646093914,
"learning_rate": 2.766281489018482e-06,
"loss": 0.2293,
"step": 250
},
{
"epoch": 1.3706484641638226,
"grad_norm": 0.6307687611402706,
"learning_rate": 2.7236687235908953e-06,
"loss": 0.2188,
"step": 251
},
{
"epoch": 1.3761092150170648,
"grad_norm": 0.6238745929037437,
"learning_rate": 2.681263516468139e-06,
"loss": 0.2475,
"step": 252
},
{
"epoch": 1.3815699658703071,
"grad_norm": 0.6373763499985442,
"learning_rate": 2.6390697342020665e-06,
"loss": 0.2343,
"step": 253
},
{
"epoch": 1.3870307167235496,
"grad_norm": 0.6734367954708225,
"learning_rate": 2.5970912240665815e-06,
"loss": 0.2353,
"step": 254
},
{
"epoch": 1.3924914675767919,
"grad_norm": 0.6582720475674197,
"learning_rate": 2.5553318137068473e-06,
"loss": 0.2474,
"step": 255
},
{
"epoch": 1.3979522184300341,
"grad_norm": 0.6052116695135601,
"learning_rate": 2.5137953107902814e-06,
"loss": 0.2322,
"step": 256
},
{
"epoch": 1.4034129692832764,
"grad_norm": 0.6552372504854818,
"learning_rate": 2.472485502659358e-06,
"loss": 0.2468,
"step": 257
},
{
"epoch": 1.4088737201365187,
"grad_norm": 0.6302233591025854,
"learning_rate": 2.4314061559862836e-06,
"loss": 0.2398,
"step": 258
},
{
"epoch": 1.4143344709897612,
"grad_norm": 0.654084080401935,
"learning_rate": 2.3905610164295394e-06,
"loss": 0.2329,
"step": 259
},
{
"epoch": 1.4197952218430034,
"grad_norm": 0.6485524489438387,
"learning_rate": 2.3499538082923607e-06,
"loss": 0.2446,
"step": 260
},
{
"epoch": 1.4252559726962457,
"grad_norm": 0.6260148485778105,
"learning_rate": 2.309588234183137e-06,
"loss": 0.215,
"step": 261
},
{
"epoch": 1.430716723549488,
"grad_norm": 0.6165139801898837,
"learning_rate": 2.2694679746778116e-06,
"loss": 0.2235,
"step": 262
},
{
"epoch": 1.4361774744027302,
"grad_norm": 0.6073663452431178,
"learning_rate": 2.22959668798428e-06,
"loss": 0.21,
"step": 263
},
{
"epoch": 1.4416382252559727,
"grad_norm": 0.6687068934985456,
"learning_rate": 2.1899780096088375e-06,
"loss": 0.2609,
"step": 264
},
{
"epoch": 1.447098976109215,
"grad_norm": 0.5999286753849784,
"learning_rate": 2.1506155520246795e-06,
"loss": 0.2275,
"step": 265
},
{
"epoch": 1.4525597269624573,
"grad_norm": 0.6562378405208374,
"learning_rate": 2.1115129043425188e-06,
"loss": 0.2577,
"step": 266
},
{
"epoch": 1.4580204778156998,
"grad_norm": 0.6564390690276799,
"learning_rate": 2.072673631983323e-06,
"loss": 0.2583,
"step": 267
},
{
"epoch": 1.463481228668942,
"grad_norm": 0.6175405413313049,
"learning_rate": 2.0341012763532243e-06,
"loss": 0.2252,
"step": 268
},
{
"epoch": 1.4689419795221843,
"grad_norm": 0.6294576454771881,
"learning_rate": 1.995799354520598e-06,
"loss": 0.2282,
"step": 269
},
{
"epoch": 1.4744027303754266,
"grad_norm": 0.6491408881404807,
"learning_rate": 1.9577713588953797e-06,
"loss": 0.2204,
"step": 270
},
{
"epoch": 1.4798634812286688,
"grad_norm": 0.626492440911862,
"learning_rate": 1.9200207569106216e-06,
"loss": 0.2363,
"step": 271
},
{
"epoch": 1.4853242320819113,
"grad_norm": 0.6328542452711655,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.2312,
"step": 272
},
{
"epoch": 1.4907849829351536,
"grad_norm": 0.6502462448470019,
"learning_rate": 1.8453654768156138e-06,
"loss": 0.2512,
"step": 273
},
{
"epoch": 1.4962457337883959,
"grad_norm": 0.6012861830171234,
"learning_rate": 1.8084676058531376e-06,
"loss": 0.2285,
"step": 274
},
{
"epoch": 1.5017064846416384,
"grad_norm": 0.6834276368175269,
"learning_rate": 1.771860742205988e-06,
"loss": 0.2512,
"step": 275
},
{
"epoch": 1.5071672354948804,
"grad_norm": 0.6528553917231606,
"learning_rate": 1.7355482237268983e-06,
"loss": 0.2382,
"step": 276
},
{
"epoch": 1.512627986348123,
"grad_norm": 0.5819350883243867,
"learning_rate": 1.6995333614298908e-06,
"loss": 0.2097,
"step": 277
},
{
"epoch": 1.5180887372013652,
"grad_norm": 0.6231114790212122,
"learning_rate": 1.6638194391883822e-06,
"loss": 0.2352,
"step": 278
},
{
"epoch": 1.5235494880546074,
"grad_norm": 0.6082248229800555,
"learning_rate": 1.6284097134357535e-06,
"loss": 0.2241,
"step": 279
},
{
"epoch": 1.52901023890785,
"grad_norm": 0.5824479788396233,
"learning_rate": 1.5933074128684333e-06,
"loss": 0.2149,
"step": 280
},
{
"epoch": 1.5344709897610922,
"grad_norm": 0.6414988574509947,
"learning_rate": 1.5585157381514875e-06,
"loss": 0.2629,
"step": 281
},
{
"epoch": 1.5399317406143345,
"grad_norm": 0.6281763626708758,
"learning_rate": 1.5240378616267887e-06,
"loss": 0.2213,
"step": 282
},
{
"epoch": 1.545392491467577,
"grad_norm": 0.6342381052621304,
"learning_rate": 1.4898769270237611e-06,
"loss": 0.2469,
"step": 283
},
{
"epoch": 1.550853242320819,
"grad_norm": 0.63348306460088,
"learning_rate": 1.4560360491727233e-06,
"loss": 0.2369,
"step": 284
},
{
"epoch": 1.5563139931740615,
"grad_norm": 0.6211597914243564,
"learning_rate": 1.4225183137208775e-06,
"loss": 0.2464,
"step": 285
},
{
"epoch": 1.5617747440273038,
"grad_norm": 0.6287566376737247,
"learning_rate": 1.389326776850966e-06,
"loss": 0.2378,
"step": 286
},
{
"epoch": 1.567235494880546,
"grad_norm": 0.6155227503256832,
"learning_rate": 1.3564644650025894e-06,
"loss": 0.2501,
"step": 287
},
{
"epoch": 1.5726962457337885,
"grad_norm": 0.6705552802557545,
"learning_rate": 1.323934374596268e-06,
"loss": 0.2642,
"step": 288
},
{
"epoch": 1.5781569965870306,
"grad_norm": 0.6468155028186762,
"learning_rate": 1.2917394717602123e-06,
"loss": 0.2391,
"step": 289
},
{
"epoch": 1.583617747440273,
"grad_norm": 0.6059123207863814,
"learning_rate": 1.2598826920598773e-06,
"loss": 0.2471,
"step": 290
},
{
"epoch": 1.5890784982935153,
"grad_norm": 0.622906276078447,
"learning_rate": 1.2283669402302878e-06,
"loss": 0.2441,
"step": 291
},
{
"epoch": 1.5945392491467576,
"grad_norm": 0.6123734431964898,
"learning_rate": 1.197195089911191e-06,
"loss": 0.2359,
"step": 292
},
{
"epoch": 1.6,
"grad_norm": 0.6173696097759185,
"learning_rate": 1.166369983385024e-06,
"loss": 0.2302,
"step": 293
},
{
"epoch": 1.6054607508532424,
"grad_norm": 0.6309581460125253,
"learning_rate": 1.1358944313177566e-06,
"loss": 0.2312,
"step": 294
},
{
"epoch": 1.6109215017064846,
"grad_norm": 0.6390016010862638,
"learning_rate": 1.1057712125026116e-06,
"loss": 0.2442,
"step": 295
},
{
"epoch": 1.6163822525597271,
"grad_norm": 0.6276135127186313,
"learning_rate": 1.0760030736066952e-06,
"loss": 0.2297,
"step": 296
},
{
"epoch": 1.6218430034129692,
"grad_norm": 0.612896281499447,
"learning_rate": 1.0465927289205452e-06,
"loss": 0.2346,
"step": 297
},
{
"epoch": 1.6273037542662117,
"grad_norm": 0.5859909784104106,
"learning_rate": 1.0175428601106441e-06,
"loss": 0.2119,
"step": 298
},
{
"epoch": 1.632764505119454,
"grad_norm": 0.6209786013367464,
"learning_rate": 9.888561159748995e-07,
"loss": 0.2432,
"step": 299
},
{
"epoch": 1.6382252559726962,
"grad_norm": 0.6352233419147338,
"learning_rate": 9.605351122011308e-07,
"loss": 0.2392,
"step": 300
},
{
"epoch": 1.6436860068259387,
"grad_norm": 0.5719749997080835,
"learning_rate": 9.325824311285564e-07,
"loss": 0.2173,
"step": 301
},
{
"epoch": 1.6491467576791807,
"grad_norm": 0.6657684609315078,
"learning_rate": 9.050006215123419e-07,
"loss": 0.2606,
"step": 302
},
{
"epoch": 1.6546075085324232,
"grad_norm": 0.5728781873883457,
"learning_rate": 8.777921982911996e-07,
"loss": 0.2214,
"step": 303
},
{
"epoch": 1.6600682593856655,
"grad_norm": 0.6162844560531199,
"learning_rate": 8.509596423580712e-07,
"loss": 0.2464,
"step": 304
},
{
"epoch": 1.6655290102389078,
"grad_norm": 0.5952374213138941,
"learning_rate": 8.245054003339247e-07,
"loss": 0.226,
"step": 305
},
{
"epoch": 1.6709897610921502,
"grad_norm": 0.5961891699890017,
"learning_rate": 7.984318843446593e-07,
"loss": 0.2221,
"step": 306
},
{
"epoch": 1.6764505119453925,
"grad_norm": 0.5937715818201527,
"learning_rate": 7.727414718011706e-07,
"loss": 0.2117,
"step": 307
},
{
"epoch": 1.6819112627986348,
"grad_norm": 0.6853681886295775,
"learning_rate": 7.474365051825749e-07,
"loss": 0.257,
"step": 308
},
{
"epoch": 1.6873720136518773,
"grad_norm": 0.6359287265633253,
"learning_rate": 7.225192918226215e-07,
"loss": 0.2395,
"step": 309
},
{
"epoch": 1.6928327645051193,
"grad_norm": 0.5963026735275871,
"learning_rate": 6.979921036993042e-07,
"loss": 0.2233,
"step": 310
},
{
"epoch": 1.6982935153583618,
"grad_norm": 0.6319949149208983,
"learning_rate": 6.738571772276997e-07,
"loss": 0.2416,
"step": 311
},
{
"epoch": 1.703754266211604,
"grad_norm": 0.6203658184125409,
"learning_rate": 6.501167130560515e-07,
"loss": 0.2283,
"step": 312
},
{
"epoch": 1.7092150170648464,
"grad_norm": 0.5938364868062161,
"learning_rate": 6.267728758651131e-07,
"loss": 0.2302,
"step": 313
},
{
"epoch": 1.7146757679180888,
"grad_norm": 0.6087142064405593,
"learning_rate": 6.038277941707671e-07,
"loss": 0.2039,
"step": 314
},
{
"epoch": 1.7201365187713311,
"grad_norm": 0.6401443657158584,
"learning_rate": 5.812835601299438e-07,
"loss": 0.254,
"step": 315
},
{
"epoch": 1.7255972696245734,
"grad_norm": 0.5597403672115617,
"learning_rate": 5.591422293498633e-07,
"loss": 0.2074,
"step": 316
},
{
"epoch": 1.7310580204778157,
"grad_norm": 0.6510913944426799,
"learning_rate": 5.374058207005945e-07,
"loss": 0.242,
"step": 317
},
{
"epoch": 1.736518771331058,
"grad_norm": 0.5851164556251767,
"learning_rate": 5.160763161309768e-07,
"loss": 0.2208,
"step": 318
},
{
"epoch": 1.7419795221843004,
"grad_norm": 0.5863228147924701,
"learning_rate": 4.951556604879049e-07,
"loss": 0.2167,
"step": 319
},
{
"epoch": 1.7474402730375427,
"grad_norm": 0.6434185214532853,
"learning_rate": 4.7464576133899043e-07,
"loss": 0.2208,
"step": 320
},
{
"epoch": 1.752901023890785,
"grad_norm": 0.6536708886817881,
"learning_rate": 4.545484887986368e-07,
"loss": 0.2558,
"step": 321
},
{
"epoch": 1.7583617747440274,
"grad_norm": 0.5913503349988765,
"learning_rate": 4.348656753575092e-07,
"loss": 0.2412,
"step": 322
},
{
"epoch": 1.7638225255972695,
"grad_norm": 0.6056503972371652,
"learning_rate": 4.1559911571545544e-07,
"loss": 0.2302,
"step": 323
},
{
"epoch": 1.769283276450512,
"grad_norm": 0.6287557474105558,
"learning_rate": 3.9675056661785563e-07,
"loss": 0.2184,
"step": 324
},
{
"epoch": 1.7747440273037542,
"grad_norm": 0.6387002743522794,
"learning_rate": 3.783217466954503e-07,
"loss": 0.2302,
"step": 325
},
{
"epoch": 1.7802047781569965,
"grad_norm": 0.5919282825342829,
"learning_rate": 3.603143363076217e-07,
"loss": 0.2155,
"step": 326
},
{
"epoch": 1.785665529010239,
"grad_norm": 0.6473381627525607,
"learning_rate": 3.427299773891868e-07,
"loss": 0.2661,
"step": 327
},
{
"epoch": 1.7911262798634813,
"grad_norm": 0.5931604711623788,
"learning_rate": 3.255702733006766e-07,
"loss": 0.2338,
"step": 328
},
{
"epoch": 1.7965870307167235,
"grad_norm": 0.6129008231127687,
"learning_rate": 3.088367886821481e-07,
"loss": 0.2514,
"step": 329
},
{
"epoch": 1.802047781569966,
"grad_norm": 0.6062339345391793,
"learning_rate": 2.925310493105099e-07,
"loss": 0.208,
"step": 330
},
{
"epoch": 1.807508532423208,
"grad_norm": 0.640691825777882,
"learning_rate": 2.7665454196040665e-07,
"loss": 0.2568,
"step": 331
},
{
"epoch": 1.8129692832764506,
"grad_norm": 0.6228877907424235,
"learning_rate": 2.6120871426864866e-07,
"loss": 0.2445,
"step": 332
},
{
"epoch": 1.8184300341296928,
"grad_norm": 0.6109720444847905,
"learning_rate": 2.4619497460222184e-07,
"loss": 0.2408,
"step": 333
},
{
"epoch": 1.823890784982935,
"grad_norm": 0.6345094654832795,
"learning_rate": 2.316146919298623e-07,
"loss": 0.221,
"step": 334
},
{
"epoch": 1.8293515358361776,
"grad_norm": 0.5612296424074616,
"learning_rate": 2.1746919569723858e-07,
"loss": 0.2137,
"step": 335
},
{
"epoch": 1.8348122866894196,
"grad_norm": 0.5831279872950338,
"learning_rate": 2.037597757057297e-07,
"loss": 0.2178,
"step": 336
},
{
"epoch": 1.8402730375426621,
"grad_norm": 0.639477342500686,
"learning_rate": 1.9048768199481983e-07,
"loss": 0.2417,
"step": 337
},
{
"epoch": 1.8457337883959044,
"grad_norm": 0.5861730216308635,
"learning_rate": 1.776541247281177e-07,
"loss": 0.229,
"step": 338
},
{
"epoch": 1.8511945392491467,
"grad_norm": 0.6019956045442054,
"learning_rate": 1.6526027408301227e-07,
"loss": 0.2212,
"step": 339
},
{
"epoch": 1.8566552901023892,
"grad_norm": 0.59132952240458,
"learning_rate": 1.5330726014397668e-07,
"loss": 0.2301,
"step": 340
},
{
"epoch": 1.8621160409556314,
"grad_norm": 0.6078005334703832,
"learning_rate": 1.417961727995254e-07,
"loss": 0.2239,
"step": 341
},
{
"epoch": 1.8675767918088737,
"grad_norm": 0.585630251596369,
"learning_rate": 1.307280616428336e-07,
"loss": 0.2093,
"step": 342
},
{
"epoch": 1.8730375426621162,
"grad_norm": 0.6003005675220531,
"learning_rate": 1.2010393587603975e-07,
"loss": 0.2558,
"step": 343
},
{
"epoch": 1.8784982935153582,
"grad_norm": 0.6007862550074166,
"learning_rate": 1.0992476421822052e-07,
"loss": 0.2217,
"step": 344
},
{
"epoch": 1.8839590443686007,
"grad_norm": 0.5798264754441842,
"learning_rate": 1.0019147481706626e-07,
"loss": 0.2,
"step": 345
},
{
"epoch": 1.889419795221843,
"grad_norm": 0.5801152259959127,
"learning_rate": 9.090495516424713e-08,
"loss": 0.2219,
"step": 346
},
{
"epoch": 1.8948805460750853,
"grad_norm": 0.5923845690395708,
"learning_rate": 8.206605201449447e-08,
"loss": 0.2029,
"step": 347
},
{
"epoch": 1.9003412969283278,
"grad_norm": 0.5987478859251119,
"learning_rate": 7.367557130838921e-08,
"loss": 0.2256,
"step": 348
},
{
"epoch": 1.9058020477815698,
"grad_norm": 0.5842539453063824,
"learning_rate": 6.573427809888067e-08,
"loss": 0.2003,
"step": 349
},
{
"epoch": 1.9112627986348123,
"grad_norm": 0.6351293922330473,
"learning_rate": 5.824289648152126e-08,
"loss": 0.2395,
"step": 350
},
{
"epoch": 1.9167235494880546,
"grad_norm": 0.6674020151826169,
"learning_rate": 5.120210952844873e-08,
"loss": 0.271,
"step": 351
},
{
"epoch": 1.9221843003412968,
"grad_norm": 0.5675294646121625,
"learning_rate": 4.461255922609986e-08,
"loss": 0.2172,
"step": 352
},
{
"epoch": 1.9276450511945393,
"grad_norm": 0.6285436049388992,
"learning_rate": 3.8474846416672874e-08,
"loss": 0.2399,
"step": 353
},
{
"epoch": 1.9331058020477816,
"grad_norm": 0.6175868696026672,
"learning_rate": 3.278953074334512e-08,
"loss": 0.2212,
"step": 354
},
{
"epoch": 1.9385665529010239,
"grad_norm": 0.6190764065365002,
"learning_rate": 2.75571305992417e-08,
"loss": 0.2374,
"step": 355
},
{
"epoch": 1.9440273037542664,
"grad_norm": 0.6366754795576353,
"learning_rate": 2.2778123080167136e-08,
"loss": 0.2317,
"step": 356
},
{
"epoch": 1.9494880546075084,
"grad_norm": 0.6088811286257059,
"learning_rate": 1.845294394110686e-08,
"loss": 0.2161,
"step": 357
},
{
"epoch": 1.954948805460751,
"grad_norm": 0.5983150959140915,
"learning_rate": 1.4581987556490095e-08,
"loss": 0.2273,
"step": 358
},
{
"epoch": 1.9604095563139932,
"grad_norm": 0.6172951456789394,
"learning_rate": 1.1165606884234182e-08,
"loss": 0.2514,
"step": 359
},
{
"epoch": 1.9658703071672354,
"grad_norm": 0.6110206348502005,
"learning_rate": 8.204113433559202e-09,
"loss": 0.2248,
"step": 360
},
{
"epoch": 1.971331058020478,
"grad_norm": 0.5973553403473014,
"learning_rate": 5.6977772365857105e-09,
"loss": 0.2415,
"step": 361
},
{
"epoch": 1.9767918088737202,
"grad_norm": 0.6109000206055947,
"learning_rate": 3.6468268237105364e-09,
"loss": 0.2319,
"step": 362
},
{
"epoch": 1.9822525597269625,
"grad_norm": 0.6357125881065677,
"learning_rate": 2.0514492027728928e-09,
"loss": 0.2457,
"step": 363
},
{
"epoch": 1.9877133105802047,
"grad_norm": 0.669166818877388,
"learning_rate": 9.117898419991333e-10,
"loss": 0.2574,
"step": 364
},
{
"epoch": 1.993174061433447,
"grad_norm": 0.6418127603308712,
"learning_rate": 2.2795265674113721e-10,
"loss": 0.2499,
"step": 365
},
{
"epoch": 1.9986348122866895,
"grad_norm": 0.6075592770961785,
"learning_rate": 0.0,
"loss": 0.2179,
"step": 366
},
{
"epoch": 1.9986348122866895,
"step": 366,
"total_flos": 86796684656640.0,
"train_loss": 0.2950779539965541,
"train_runtime": 2999.324,
"train_samples_per_second": 15.628,
"train_steps_per_second": 0.122
}
],
"logging_steps": 1,
"max_steps": 366,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 86796684656640.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}