OpenLongCoT-Base-Gemma2-2B / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 501,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001996007984031936,
"grad_norm": 92.81291961669922,
"learning_rate": 0.0,
"loss": 5.1027,
"step": 1
},
{
"epoch": 0.003992015968063872,
"grad_norm": 187.8679656982422,
"learning_rate": 5.017166594399687e-06,
"loss": 5.1552,
"step": 2
},
{
"epoch": 0.005988023952095809,
"grad_norm": 160.3046875,
"learning_rate": 7.952020911994375e-06,
"loss": 5.1408,
"step": 3
},
{
"epoch": 0.007984031936127744,
"grad_norm": 44.23579406738281,
"learning_rate": 1.0034333188799373e-05,
"loss": 3.2825,
"step": 4
},
{
"epoch": 0.00998003992015968,
"grad_norm": 22.931053161621094,
"learning_rate": 1.164950007226698e-05,
"loss": 2.5601,
"step": 5
},
{
"epoch": 0.011976047904191617,
"grad_norm": 10.358180046081543,
"learning_rate": 1.2969187506394062e-05,
"loss": 1.994,
"step": 6
},
{
"epoch": 0.013972055888223553,
"grad_norm": 9.385107040405273,
"learning_rate": 1.4084967333570947e-05,
"loss": 1.8568,
"step": 7
},
{
"epoch": 0.015968063872255488,
"grad_norm": 7.71799898147583,
"learning_rate": 1.505149978319906e-05,
"loss": 1.59,
"step": 8
},
{
"epoch": 0.017964071856287425,
"grad_norm": 5.006285190582275,
"learning_rate": 1.590404182398875e-05,
"loss": 1.3238,
"step": 9
},
{
"epoch": 0.01996007984031936,
"grad_norm": 4.380033493041992,
"learning_rate": 1.666666666666667e-05,
"loss": 1.267,
"step": 10
},
{
"epoch": 0.021956087824351298,
"grad_norm": 4.105769634246826,
"learning_rate": 1.7356544752637084e-05,
"loss": 1.2438,
"step": 11
},
{
"epoch": 0.023952095808383235,
"grad_norm": 3.476895809173584,
"learning_rate": 1.7986354100793748e-05,
"loss": 1.1196,
"step": 12
},
{
"epoch": 0.02594810379241517,
"grad_norm": 2.9743728637695312,
"learning_rate": 1.8565722538447282e-05,
"loss": 1.0148,
"step": 13
},
{
"epoch": 0.027944111776447105,
"grad_norm": 2.6658384799957275,
"learning_rate": 1.9102133927970633e-05,
"loss": 1.0081,
"step": 14
},
{
"epoch": 0.029940119760479042,
"grad_norm": 2.6062169075012207,
"learning_rate": 1.9601520984261358e-05,
"loss": 0.9228,
"step": 15
},
{
"epoch": 0.031936127744510975,
"grad_norm": 2.147310495376587,
"learning_rate": 2.0068666377598747e-05,
"loss": 0.8351,
"step": 16
},
{
"epoch": 0.033932135728542916,
"grad_norm": 2.2878642082214355,
"learning_rate": 2.0507482022971233e-05,
"loss": 0.8303,
"step": 17
},
{
"epoch": 0.03592814371257485,
"grad_norm": 2.077786445617676,
"learning_rate": 2.0921208418388435e-05,
"loss": 0.7769,
"step": 18
},
{
"epoch": 0.03792415169660679,
"grad_norm": 2.115493059158325,
"learning_rate": 2.1312560015880482e-05,
"loss": 0.8032,
"step": 19
},
{
"epoch": 0.03992015968063872,
"grad_norm": 1.92618989944458,
"learning_rate": 2.1683833261066357e-05,
"loss": 0.7759,
"step": 20
},
{
"epoch": 0.041916167664670656,
"grad_norm": 1.9667437076568604,
"learning_rate": 2.2036988245565324e-05,
"loss": 0.7805,
"step": 21
},
{
"epoch": 0.043912175648702596,
"grad_norm": 2.3144752979278564,
"learning_rate": 2.2373711347036773e-05,
"loss": 0.735,
"step": 22
},
{
"epoch": 0.04590818363273453,
"grad_norm": 1.555964469909668,
"learning_rate": 2.269546393362655e-05,
"loss": 0.6523,
"step": 23
},
{
"epoch": 0.04790419161676647,
"grad_norm": 1.5024523735046387,
"learning_rate": 2.3003520695193437e-05,
"loss": 0.6623,
"step": 24
},
{
"epoch": 0.0499001996007984,
"grad_norm": 1.522902011871338,
"learning_rate": 2.329900014453396e-05,
"loss": 0.6503,
"step": 25
},
{
"epoch": 0.05189620758483034,
"grad_norm": 1.4194371700286865,
"learning_rate": 2.3582889132846968e-05,
"loss": 0.636,
"step": 26
},
{
"epoch": 0.05389221556886228,
"grad_norm": 1.55453360080719,
"learning_rate": 2.3856062735983123e-05,
"loss": 0.7242,
"step": 27
},
{
"epoch": 0.05588822355289421,
"grad_norm": 1.4471536874771118,
"learning_rate": 2.4119300522370322e-05,
"loss": 0.5819,
"step": 28
},
{
"epoch": 0.05788423153692615,
"grad_norm": 1.5161927938461304,
"learning_rate": 2.4373299964982603e-05,
"loss": 0.6788,
"step": 29
},
{
"epoch": 0.059880239520958084,
"grad_norm": 1.5962581634521484,
"learning_rate": 2.4618687578661044e-05,
"loss": 0.7346,
"step": 30
},
{
"epoch": 0.06187624750499002,
"grad_norm": 1.3744760751724243,
"learning_rate": 2.4856028230571212e-05,
"loss": 0.4835,
"step": 31
},
{
"epoch": 0.06387225548902195,
"grad_norm": 1.7308415174484253,
"learning_rate": 2.5085832971998436e-05,
"loss": 0.6537,
"step": 32
},
{
"epoch": 0.0658682634730539,
"grad_norm": 1.736331582069397,
"learning_rate": 2.530856566463146e-05,
"loss": 0.6633,
"step": 33
},
{
"epoch": 0.06786427145708583,
"grad_norm": 1.9755364656448364,
"learning_rate": 2.552464861737092e-05,
"loss": 0.6268,
"step": 34
},
{
"epoch": 0.06986027944111776,
"grad_norm": 1.6539369821548462,
"learning_rate": 2.5734467405837933e-05,
"loss": 0.6355,
"step": 35
},
{
"epoch": 0.0718562874251497,
"grad_norm": 2.0570621490478516,
"learning_rate": 2.5938375012788124e-05,
"loss": 0.6168,
"step": 36
},
{
"epoch": 0.07385229540918163,
"grad_norm": 1.8512474298477173,
"learning_rate": 2.6136695401116585e-05,
"loss": 0.6583,
"step": 37
},
{
"epoch": 0.07584830339321358,
"grad_norm": 1.2911862134933472,
"learning_rate": 2.6329726610280168e-05,
"loss": 0.565,
"step": 38
},
{
"epoch": 0.07784431137724551,
"grad_norm": 1.475156545639038,
"learning_rate": 2.651774345044166e-05,
"loss": 0.6409,
"step": 39
},
{
"epoch": 0.07984031936127745,
"grad_norm": 1.1098164319992065,
"learning_rate": 2.6700999855466042e-05,
"loss": 0.5335,
"step": 40
},
{
"epoch": 0.08183632734530938,
"grad_norm": 1.1890451908111572,
"learning_rate": 2.687973094532893e-05,
"loss": 0.4502,
"step": 41
},
{
"epoch": 0.08383233532934131,
"grad_norm": 1.9120031595230103,
"learning_rate": 2.7054154839965013e-05,
"loss": 0.607,
"step": 42
},
{
"epoch": 0.08582834331337326,
"grad_norm": 1.2188658714294434,
"learning_rate": 2.722447425965978e-05,
"loss": 0.5033,
"step": 43
},
{
"epoch": 0.08782435129740519,
"grad_norm": 1.3608094453811646,
"learning_rate": 2.739087794143646e-05,
"loss": 0.5956,
"step": 44
},
{
"epoch": 0.08982035928143713,
"grad_norm": 1.259487271308899,
"learning_rate": 2.755354189625573e-05,
"loss": 0.575,
"step": 45
},
{
"epoch": 0.09181636726546906,
"grad_norm": 1.2308496236801147,
"learning_rate": 2.771263052802624e-05,
"loss": 0.6473,
"step": 46
},
{
"epoch": 0.09381237524950099,
"grad_norm": 1.2072350978851318,
"learning_rate": 2.7868297632261957e-05,
"loss": 0.6273,
"step": 47
},
{
"epoch": 0.09580838323353294,
"grad_norm": 1.150260090827942,
"learning_rate": 2.8020687289593123e-05,
"loss": 0.618,
"step": 48
},
{
"epoch": 0.09780439121756487,
"grad_norm": 1.1447213888168335,
"learning_rate": 2.8169934667141895e-05,
"loss": 0.625,
"step": 49
},
{
"epoch": 0.0998003992015968,
"grad_norm": 1.1371378898620605,
"learning_rate": 2.8316166738933646e-05,
"loss": 0.6372,
"step": 50
},
{
"epoch": 0.10179640718562874,
"grad_norm": 1.1135759353637695,
"learning_rate": 2.845950293496561e-05,
"loss": 0.539,
"step": 51
},
{
"epoch": 0.10379241516966067,
"grad_norm": 0.9502639174461365,
"learning_rate": 2.8600055727246657e-05,
"loss": 0.388,
"step": 52
},
{
"epoch": 0.10578842315369262,
"grad_norm": 1.545538306236267,
"learning_rate": 2.8737931160013153e-05,
"loss": 0.5401,
"step": 53
},
{
"epoch": 0.10778443113772455,
"grad_norm": 1.223322868347168,
"learning_rate": 2.8873229330382812e-05,
"loss": 0.5695,
"step": 54
},
{
"epoch": 0.10978043912175649,
"grad_norm": 1.0864529609680176,
"learning_rate": 2.9006044824904066e-05,
"loss": 0.4986,
"step": 55
},
{
"epoch": 0.11177644710578842,
"grad_norm": 1.1569509506225586,
"learning_rate": 2.913646711677001e-05,
"loss": 0.5629,
"step": 56
},
{
"epoch": 0.11377245508982035,
"grad_norm": 1.3813297748565674,
"learning_rate": 2.926458092787486e-05,
"loss": 0.605,
"step": 57
},
{
"epoch": 0.1157684630738523,
"grad_norm": 1.034891128540039,
"learning_rate": 2.939046655938229e-05,
"loss": 0.5247,
"step": 58
},
{
"epoch": 0.11776447105788423,
"grad_norm": 1.0968964099884033,
"learning_rate": 2.951420019403574e-05,
"loss": 0.5797,
"step": 59
},
{
"epoch": 0.11976047904191617,
"grad_norm": 1.0885212421417236,
"learning_rate": 2.963585417306073e-05,
"loss": 0.5689,
"step": 60
},
{
"epoch": 0.1217564870259481,
"grad_norm": 1.2548822164535522,
"learning_rate": 2.9755497250179453e-05,
"loss": 0.5559,
"step": 61
},
{
"epoch": 0.12375249500998003,
"grad_norm": 1.009814739227295,
"learning_rate": 2.98731948249709e-05,
"loss": 0.4973,
"step": 62
},
{
"epoch": 0.12574850299401197,
"grad_norm": 1.0727399587631226,
"learning_rate": 2.9989009157559694e-05,
"loss": 0.5439,
"step": 63
},
{
"epoch": 0.1277445109780439,
"grad_norm": 1.1233041286468506,
"learning_rate": 3.010299956639812e-05,
"loss": 0.5472,
"step": 64
},
{
"epoch": 0.12974051896207583,
"grad_norm": 1.1565264463424683,
"learning_rate": 3.021522261071426e-05,
"loss": 0.6008,
"step": 65
},
{
"epoch": 0.1317365269461078,
"grad_norm": 0.9942654371261597,
"learning_rate": 3.0325732259031143e-05,
"loss": 0.4552,
"step": 66
},
{
"epoch": 0.13373253493013973,
"grad_norm": 1.100710153579712,
"learning_rate": 3.043458004501377e-05,
"loss": 0.4661,
"step": 67
},
{
"epoch": 0.13572854291417166,
"grad_norm": 1.0481464862823486,
"learning_rate": 3.054181521177061e-05,
"loss": 0.4996,
"step": 68
},
{
"epoch": 0.1377245508982036,
"grad_norm": 1.085190773010254,
"learning_rate": 3.064748484562093e-05,
"loss": 0.5589,
"step": 69
},
{
"epoch": 0.13972055888223553,
"grad_norm": 1.0909191370010376,
"learning_rate": 3.0751634000237615e-05,
"loss": 0.5948,
"step": 70
},
{
"epoch": 0.14171656686626746,
"grad_norm": 1.9369421005249023,
"learning_rate": 3.085430581198459e-05,
"loss": 0.5384,
"step": 71
},
{
"epoch": 0.1437125748502994,
"grad_norm": 1.1248409748077393,
"learning_rate": 3.095554160718781e-05,
"loss": 0.4915,
"step": 72
},
{
"epoch": 0.14570858283433133,
"grad_norm": 1.028275728225708,
"learning_rate": 3.10553810020076e-05,
"loss": 0.5405,
"step": 73
},
{
"epoch": 0.14770459081836326,
"grad_norm": 0.9245263338088989,
"learning_rate": 3.115386199551628e-05,
"loss": 0.4313,
"step": 74
},
{
"epoch": 0.1497005988023952,
"grad_norm": 1.0587871074676514,
"learning_rate": 3.1251021056528336e-05,
"loss": 0.5154,
"step": 75
},
{
"epoch": 0.15169660678642716,
"grad_norm": 1.0819029808044434,
"learning_rate": 3.134689320467986e-05,
"loss": 0.5097,
"step": 76
},
{
"epoch": 0.1536926147704591,
"grad_norm": 1.0212074518203735,
"learning_rate": 3.144151208620804e-05,
"loss": 0.4365,
"step": 77
},
{
"epoch": 0.15568862275449102,
"grad_norm": 1.140681266784668,
"learning_rate": 3.1534910044841344e-05,
"loss": 0.5734,
"step": 78
},
{
"epoch": 0.15768463073852296,
"grad_norm": 1.0276720523834229,
"learning_rate": 3.1627118188174024e-05,
"loss": 0.42,
"step": 79
},
{
"epoch": 0.1596806387225549,
"grad_norm": 0.980180025100708,
"learning_rate": 3.171816644986573e-05,
"loss": 0.4796,
"step": 80
},
{
"epoch": 0.16167664670658682,
"grad_norm": 1.198864221572876,
"learning_rate": 3.18080836479775e-05,
"loss": 0.5675,
"step": 81
},
{
"epoch": 0.16367265469061876,
"grad_norm": 0.9353108406066895,
"learning_rate": 3.1896897539728616e-05,
"loss": 0.5183,
"step": 82
},
{
"epoch": 0.1656686626746507,
"grad_norm": 0.9708541035652161,
"learning_rate": 3.198463487293457e-05,
"loss": 0.4513,
"step": 83
},
{
"epoch": 0.16766467065868262,
"grad_norm": 1.1432932615280151,
"learning_rate": 3.207132143436469e-05,
"loss": 0.589,
"step": 84
},
{
"epoch": 0.16966067864271456,
"grad_norm": 1.0964723825454712,
"learning_rate": 3.215698209523821e-05,
"loss": 0.5101,
"step": 85
},
{
"epoch": 0.17165668662674652,
"grad_norm": 1.0808310508728027,
"learning_rate": 3.224164085405946e-05,
"loss": 0.4349,
"step": 86
},
{
"epoch": 0.17365269461077845,
"grad_norm": 1.0994106531143188,
"learning_rate": 3.232532087697698e-05,
"loss": 0.4965,
"step": 87
},
{
"epoch": 0.17564870259481039,
"grad_norm": 1.2377325296401978,
"learning_rate": 3.240804453583615e-05,
"loss": 0.4444,
"step": 88
},
{
"epoch": 0.17764471057884232,
"grad_norm": 1.0575945377349854,
"learning_rate": 3.248983344408188e-05,
"loss": 0.4379,
"step": 89
},
{
"epoch": 0.17964071856287425,
"grad_norm": 0.8877758979797363,
"learning_rate": 3.2570708490655414e-05,
"loss": 0.453,
"step": 90
},
{
"epoch": 0.18163672654690619,
"grad_norm": 1.0481340885162354,
"learning_rate": 3.265068987201822e-05,
"loss": 0.519,
"step": 91
},
{
"epoch": 0.18363273453093812,
"grad_norm": 1.026150107383728,
"learning_rate": 3.2729797122425925e-05,
"loss": 0.5112,
"step": 92
},
{
"epoch": 0.18562874251497005,
"grad_norm": 0.8472252488136292,
"learning_rate": 3.280804914256559e-05,
"loss": 0.4302,
"step": 93
},
{
"epoch": 0.18762475049900199,
"grad_norm": 0.9228626489639282,
"learning_rate": 3.288546422666164e-05,
"loss": 0.4814,
"step": 94
},
{
"epoch": 0.18962075848303392,
"grad_norm": 1.0165542364120483,
"learning_rate": 3.2962060088147464e-05,
"loss": 0.5035,
"step": 95
},
{
"epoch": 0.19161676646706588,
"grad_norm": 1.091426134109497,
"learning_rate": 3.3037853883992805e-05,
"loss": 0.5718,
"step": 96
},
{
"epoch": 0.1936127744510978,
"grad_norm": 1.0953468084335327,
"learning_rate": 3.3112862237770756e-05,
"loss": 0.5522,
"step": 97
},
{
"epoch": 0.19560878243512975,
"grad_norm": 0.9461252689361572,
"learning_rate": 3.3187101261541584e-05,
"loss": 0.5257,
"step": 98
},
{
"epoch": 0.19760479041916168,
"grad_norm": 1.063242793083191,
"learning_rate": 3.326058657662584e-05,
"loss": 0.511,
"step": 99
},
{
"epoch": 0.1996007984031936,
"grad_norm": 1.0084831714630127,
"learning_rate": 3.333333333333334e-05,
"loss": 0.5182,
"step": 100
},
{
"epoch": 0.20159680638722555,
"grad_norm": 0.9839895963668823,
"learning_rate": 3.340535622971072e-05,
"loss": 0.4776,
"step": 101
},
{
"epoch": 0.20359281437125748,
"grad_norm": 0.9757642149925232,
"learning_rate": 3.3476669529365295e-05,
"loss": 0.4915,
"step": 102
},
{
"epoch": 0.2055888223552894,
"grad_norm": 0.8425347208976746,
"learning_rate": 3.3547287078419544e-05,
"loss": 0.3955,
"step": 103
},
{
"epoch": 0.20758483033932135,
"grad_norm": 0.9176936745643616,
"learning_rate": 3.361722232164634e-05,
"loss": 0.4132,
"step": 104
},
{
"epoch": 0.20958083832335328,
"grad_norm": 1.0560258626937866,
"learning_rate": 3.3686488317832306e-05,
"loss": 0.5133,
"step": 105
},
{
"epoch": 0.21157684630738524,
"grad_norm": 0.9101148247718811,
"learning_rate": 3.375509775441284e-05,
"loss": 0.3898,
"step": 106
},
{
"epoch": 0.21357285429141717,
"grad_norm": 0.8682689666748047,
"learning_rate": 3.382306296142016e-05,
"loss": 0.4353,
"step": 107
},
{
"epoch": 0.2155688622754491,
"grad_norm": 0.8694739937782288,
"learning_rate": 3.38903959247825e-05,
"loss": 0.5008,
"step": 108
},
{
"epoch": 0.21756487025948104,
"grad_norm": 0.8936677575111389,
"learning_rate": 3.395710829901039e-05,
"loss": 0.4203,
"step": 109
},
{
"epoch": 0.21956087824351297,
"grad_norm": 0.936951220035553,
"learning_rate": 3.402321141930376e-05,
"loss": 0.4798,
"step": 110
},
{
"epoch": 0.2215568862275449,
"grad_norm": 0.8947778344154358,
"learning_rate": 3.4088716313110955e-05,
"loss": 0.4855,
"step": 111
},
{
"epoch": 0.22355289421157684,
"grad_norm": 0.8714671730995178,
"learning_rate": 3.415363371116969e-05,
"loss": 0.4973,
"step": 112
},
{
"epoch": 0.22554890219560877,
"grad_norm": 0.8940010070800781,
"learning_rate": 3.4217974058057e-05,
"loss": 0.5217,
"step": 113
},
{
"epoch": 0.2275449101796407,
"grad_norm": 0.8057599663734436,
"learning_rate": 3.428174752227455e-05,
"loss": 0.3906,
"step": 114
},
{
"epoch": 0.22954091816367264,
"grad_norm": 1.0616763830184937,
"learning_rate": 3.434496400589353e-05,
"loss": 0.4958,
"step": 115
},
{
"epoch": 0.2315369261477046,
"grad_norm": 0.8679594993591309,
"learning_rate": 3.440763315378198e-05,
"loss": 0.4526,
"step": 116
},
{
"epoch": 0.23353293413173654,
"grad_norm": 0.8972085118293762,
"learning_rate": 3.446976436243603e-05,
"loss": 0.4559,
"step": 117
},
{
"epoch": 0.23552894211576847,
"grad_norm": 0.9083353877067566,
"learning_rate": 3.4531366788435425e-05,
"loss": 0.5048,
"step": 118
},
{
"epoch": 0.2375249500998004,
"grad_norm": 0.8607695698738098,
"learning_rate": 3.459244935654219e-05,
"loss": 0.4128,
"step": 119
},
{
"epoch": 0.23952095808383234,
"grad_norm": 0.8851041793823242,
"learning_rate": 3.465302076746041e-05,
"loss": 0.4602,
"step": 120
},
{
"epoch": 0.24151696606786427,
"grad_norm": 0.9059931039810181,
"learning_rate": 3.471308950527417e-05,
"loss": 0.4791,
"step": 121
},
{
"epoch": 0.2435129740518962,
"grad_norm": 0.9063411951065063,
"learning_rate": 3.477266384457914e-05,
"loss": 0.4741,
"step": 122
},
{
"epoch": 0.24550898203592814,
"grad_norm": 0.8850985765457153,
"learning_rate": 3.48317518573233e-05,
"loss": 0.4292,
"step": 123
},
{
"epoch": 0.24750499001996007,
"grad_norm": 0.9396518468856812,
"learning_rate": 3.489036141937059e-05,
"loss": 0.5069,
"step": 124
},
{
"epoch": 0.249500998003992,
"grad_norm": 0.9115111231803894,
"learning_rate": 3.494850021680094e-05,
"loss": 0.4823,
"step": 125
},
{
"epoch": 0.25149700598802394,
"grad_norm": 0.8799051642417908,
"learning_rate": 3.500617575195938e-05,
"loss": 0.3732,
"step": 126
},
{
"epoch": 0.25349301397205587,
"grad_norm": 0.9273744821548462,
"learning_rate": 3.5063395349265945e-05,
"loss": 0.4284,
"step": 127
},
{
"epoch": 0.2554890219560878,
"grad_norm": 1.0624243021011353,
"learning_rate": 3.5120166160797804e-05,
"loss": 0.4322,
"step": 128
},
{
"epoch": 0.25748502994011974,
"grad_norm": 0.8508513569831848,
"learning_rate": 3.517649517165415e-05,
"loss": 0.4465,
"step": 129
},
{
"epoch": 0.25948103792415167,
"grad_norm": 0.8986352682113647,
"learning_rate": 3.523238920511395e-05,
"loss": 0.4093,
"step": 130
},
{
"epoch": 0.26147704590818366,
"grad_norm": 0.9224410653114319,
"learning_rate": 3.528785492759607e-05,
"loss": 0.4735,
"step": 131
},
{
"epoch": 0.2634730538922156,
"grad_norm": 0.9467160105705261,
"learning_rate": 3.5342898853430836e-05,
"loss": 0.4952,
"step": 132
},
{
"epoch": 0.2654690618762475,
"grad_norm": 0.9140215516090393,
"learning_rate": 3.539752734945143e-05,
"loss": 0.4516,
"step": 133
},
{
"epoch": 0.26746506986027946,
"grad_norm": 0.9906129837036133,
"learning_rate": 3.5451746639413466e-05,
"loss": 0.3785,
"step": 134
},
{
"epoch": 0.2694610778443114,
"grad_norm": 0.8118170499801636,
"learning_rate": 3.550556280825011e-05,
"loss": 0.4324,
"step": 135
},
{
"epoch": 0.2714570858283433,
"grad_norm": 0.9162650108337402,
"learning_rate": 3.55589818061703e-05,
"loss": 0.3836,
"step": 136
},
{
"epoch": 0.27345309381237526,
"grad_norm": 0.8672250509262085,
"learning_rate": 3.561200945260678e-05,
"loss": 0.4462,
"step": 137
},
{
"epoch": 0.2754491017964072,
"grad_norm": 0.906155526638031,
"learning_rate": 3.5664651440020616e-05,
"loss": 0.4749,
"step": 138
},
{
"epoch": 0.2774451097804391,
"grad_norm": 0.9452763199806213,
"learning_rate": 3.571691333756825e-05,
"loss": 0.4782,
"step": 139
},
{
"epoch": 0.27944111776447106,
"grad_norm": 0.8917446136474609,
"learning_rate": 3.5768800594637304e-05,
"loss": 0.4401,
"step": 140
},
{
"epoch": 0.281437125748503,
"grad_norm": 0.882606029510498,
"learning_rate": 3.582031854425634e-05,
"loss": 0.4992,
"step": 141
},
{
"epoch": 0.2834331337325349,
"grad_norm": 0.870290219783783,
"learning_rate": 3.587147240638428e-05,
"loss": 0.5009,
"step": 142
},
{
"epoch": 0.28542914171656686,
"grad_norm": 0.8788816332817078,
"learning_rate": 3.5922267291084366e-05,
"loss": 0.3891,
"step": 143
},
{
"epoch": 0.2874251497005988,
"grad_norm": 0.8944652676582336,
"learning_rate": 3.5972708201587496e-05,
"loss": 0.442,
"step": 144
},
{
"epoch": 0.2894211576846307,
"grad_norm": 0.8970728516578674,
"learning_rate": 3.6022800037249585e-05,
"loss": 0.5065,
"step": 145
},
{
"epoch": 0.29141716566866266,
"grad_norm": 0.9061855673789978,
"learning_rate": 3.607254759640729e-05,
"loss": 0.4617,
"step": 146
},
{
"epoch": 0.2934131736526946,
"grad_norm": 0.851344883441925,
"learning_rate": 3.612195557913627e-05,
"loss": 0.4633,
"step": 147
},
{
"epoch": 0.2954091816367265,
"grad_norm": 0.8392930626869202,
"learning_rate": 3.6171028589915954e-05,
"loss": 0.434,
"step": 148
},
{
"epoch": 0.29740518962075846,
"grad_norm": 0.8495596051216125,
"learning_rate": 3.6219771140204575e-05,
"loss": 0.4627,
"step": 149
},
{
"epoch": 0.2994011976047904,
"grad_norm": 0.8151164650917053,
"learning_rate": 3.626818765092802e-05,
"loss": 0.4152,
"step": 150
},
{
"epoch": 0.3013972055888224,
"grad_norm": 0.9488523602485657,
"learning_rate": 3.6316282454886157e-05,
"loss": 0.4912,
"step": 151
},
{
"epoch": 0.3033932135728543,
"grad_norm": 0.6952376365661621,
"learning_rate": 3.636405979907955e-05,
"loss": 0.3153,
"step": 152
},
{
"epoch": 0.30538922155688625,
"grad_norm": 0.8647618293762207,
"learning_rate": 3.6411523846959985e-05,
"loss": 0.4619,
"step": 153
},
{
"epoch": 0.3073852295409182,
"grad_norm": 0.8178197741508484,
"learning_rate": 3.645867868060772e-05,
"loss": 0.5165,
"step": 154
},
{
"epoch": 0.3093812375249501,
"grad_norm": 0.8717004060745239,
"learning_rate": 3.6505528302838193e-05,
"loss": 0.4222,
"step": 155
},
{
"epoch": 0.31137724550898205,
"grad_norm": 0.867859423160553,
"learning_rate": 3.6552076639241027e-05,
"loss": 0.4882,
"step": 156
},
{
"epoch": 0.313373253493014,
"grad_norm": 0.8131747841835022,
"learning_rate": 3.65983275401539e-05,
"loss": 0.4171,
"step": 157
},
{
"epoch": 0.3153692614770459,
"grad_norm": 0.8518748879432678,
"learning_rate": 3.664428478257371e-05,
"loss": 0.4342,
"step": 158
},
{
"epoch": 0.31736526946107785,
"grad_norm": 0.8354836702346802,
"learning_rate": 3.668995207200753e-05,
"loss": 0.4698,
"step": 159
},
{
"epoch": 0.3193612774451098,
"grad_norm": 0.9375539422035217,
"learning_rate": 3.673533304426541e-05,
"loss": 0.4896,
"step": 160
},
{
"epoch": 0.3213572854291417,
"grad_norm": 0.8951889872550964,
"learning_rate": 3.67804312671975e-05,
"loss": 0.4997,
"step": 161
},
{
"epoch": 0.32335329341317365,
"grad_norm": 0.8014180064201355,
"learning_rate": 3.682525024237719e-05,
"loss": 0.47,
"step": 162
},
{
"epoch": 0.3253493013972056,
"grad_norm": 0.8288528323173523,
"learning_rate": 3.6869793406732636e-05,
"loss": 0.4085,
"step": 163
},
{
"epoch": 0.3273453093812375,
"grad_norm": 0.8221442699432373,
"learning_rate": 3.69140641341283e-05,
"loss": 0.4329,
"step": 164
},
{
"epoch": 0.32934131736526945,
"grad_norm": 0.7562230825424194,
"learning_rate": 3.695806573689844e-05,
"loss": 0.348,
"step": 165
},
{
"epoch": 0.3313373253493014,
"grad_norm": 0.9237514138221741,
"learning_rate": 3.700180146733426e-05,
"loss": 0.4342,
"step": 166
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.9912509918212891,
"learning_rate": 3.704527451912639e-05,
"loss": 0.4809,
"step": 167
},
{
"epoch": 0.33532934131736525,
"grad_norm": 0.8607332110404968,
"learning_rate": 3.708848802876438e-05,
"loss": 0.4586,
"step": 168
},
{
"epoch": 0.3373253493013972,
"grad_norm": 0.8513320684432983,
"learning_rate": 3.7131445076894564e-05,
"loss": 0.4471,
"step": 169
},
{
"epoch": 0.3393213572854291,
"grad_norm": 0.8249276876449585,
"learning_rate": 3.717414868963791e-05,
"loss": 0.3867,
"step": 170
},
{
"epoch": 0.3413173652694611,
"grad_norm": 0.9111822843551636,
"learning_rate": 3.721660183986924e-05,
"loss": 0.4502,
"step": 171
},
{
"epoch": 0.34331337325349304,
"grad_norm": 0.7368887066841125,
"learning_rate": 3.725880744845915e-05,
"loss": 0.3507,
"step": 172
},
{
"epoch": 0.34530938123752497,
"grad_norm": 0.8149043917655945,
"learning_rate": 3.730076838547993e-05,
"loss": 0.3512,
"step": 173
},
{
"epoch": 0.3473053892215569,
"grad_norm": 0.9387815594673157,
"learning_rate": 3.734248747137666e-05,
"loss": 0.4263,
"step": 174
},
{
"epoch": 0.34930139720558884,
"grad_norm": 0.860095202922821,
"learning_rate": 3.738396747810492e-05,
"loss": 0.3923,
"step": 175
},
{
"epoch": 0.35129740518962077,
"grad_norm": 0.7961985468864441,
"learning_rate": 3.7425211130235834e-05,
"loss": 0.3811,
"step": 176
},
{
"epoch": 0.3532934131736527,
"grad_norm": 0.8483626842498779,
"learning_rate": 3.7466221106030115e-05,
"loss": 0.4152,
"step": 177
},
{
"epoch": 0.35528942115768464,
"grad_norm": 0.7593052387237549,
"learning_rate": 3.750700003848157e-05,
"loss": 0.3321,
"step": 178
},
{
"epoch": 0.35728542914171657,
"grad_norm": 0.8519895672798157,
"learning_rate": 3.7547550516331555e-05,
"loss": 0.4008,
"step": 179
},
{
"epoch": 0.3592814371257485,
"grad_norm": 0.8514580726623535,
"learning_rate": 3.75878750850551e-05,
"loss": 0.408,
"step": 180
},
{
"epoch": 0.36127744510978044,
"grad_norm": 0.8409926891326904,
"learning_rate": 3.7627976247819744e-05,
"loss": 0.4076,
"step": 181
},
{
"epoch": 0.36327345309381237,
"grad_norm": 0.7313259840011597,
"learning_rate": 3.766785646641792e-05,
"loss": 0.4311,
"step": 182
},
{
"epoch": 0.3652694610778443,
"grad_norm": 0.7503537535667419,
"learning_rate": 3.770751816217383e-05,
"loss": 0.422,
"step": 183
},
{
"epoch": 0.36726546906187624,
"grad_norm": 0.7808623313903809,
"learning_rate": 3.7746963716825615e-05,
"loss": 0.4475,
"step": 184
},
{
"epoch": 0.36926147704590817,
"grad_norm": 0.6921509504318237,
"learning_rate": 3.778619547338356e-05,
"loss": 0.3981,
"step": 185
},
{
"epoch": 0.3712574850299401,
"grad_norm": 0.7929064035415649,
"learning_rate": 3.782521573696528e-05,
"loss": 0.4482,
"step": 186
},
{
"epoch": 0.37325349301397204,
"grad_norm": 0.7118304371833801,
"learning_rate": 3.786402677560832e-05,
"loss": 0.3413,
"step": 187
},
{
"epoch": 0.37524950099800397,
"grad_norm": 0.7609389424324036,
"learning_rate": 3.790263082106134e-05,
"loss": 0.4207,
"step": 188
},
{
"epoch": 0.3772455089820359,
"grad_norm": 0.8060720562934875,
"learning_rate": 3.794103006955407e-05,
"loss": 0.4521,
"step": 189
},
{
"epoch": 0.37924151696606784,
"grad_norm": 0.8100878596305847,
"learning_rate": 3.797922668254715e-05,
"loss": 0.3653,
"step": 190
},
{
"epoch": 0.3812375249500998,
"grad_norm": 0.8395611047744751,
"learning_rate": 3.801722278746213e-05,
"loss": 0.3662,
"step": 191
},
{
"epoch": 0.38323353293413176,
"grad_norm": 0.7541958093643188,
"learning_rate": 3.8055020478392495e-05,
"loss": 0.2939,
"step": 192
},
{
"epoch": 0.3852295409181637,
"grad_norm": 0.8053567409515381,
"learning_rate": 3.809262181679623e-05,
"loss": 0.4302,
"step": 193
},
{
"epoch": 0.3872255489021956,
"grad_norm": 0.8586562275886536,
"learning_rate": 3.813002883217044e-05,
"loss": 0.3984,
"step": 194
},
{
"epoch": 0.38922155688622756,
"grad_norm": 0.7566971778869629,
"learning_rate": 3.816724352270863e-05,
"loss": 0.3839,
"step": 195
},
{
"epoch": 0.3912175648702595,
"grad_norm": 0.8142690658569336,
"learning_rate": 3.8204267855941266e-05,
"loss": 0.4014,
"step": 196
},
{
"epoch": 0.3932135728542914,
"grad_norm": 0.7769673466682434,
"learning_rate": 3.824110376935989e-05,
"loss": 0.3791,
"step": 197
},
{
"epoch": 0.39520958083832336,
"grad_norm": 0.8861010670661926,
"learning_rate": 3.827775317102552e-05,
"loss": 0.3904,
"step": 198
},
{
"epoch": 0.3972055888223553,
"grad_norm": 0.7677756547927856,
"learning_rate": 3.831421794016178e-05,
"loss": 0.3933,
"step": 199
},
{
"epoch": 0.3992015968063872,
"grad_norm": 0.9913503527641296,
"learning_rate": 3.835049992773302e-05,
"loss": 0.439,
"step": 200
},
{
"epoch": 0.40119760479041916,
"grad_norm": 0.847613513469696,
"learning_rate": 3.838660095700815e-05,
"loss": 0.4462,
"step": 201
},
{
"epoch": 0.4031936127744511,
"grad_norm": 0.8353721499443054,
"learning_rate": 3.84225228241104e-05,
"loss": 0.4621,
"step": 202
},
{
"epoch": 0.405189620758483,
"grad_norm": 0.809059739112854,
"learning_rate": 3.8458267298553554e-05,
"loss": 0.4888,
"step": 203
},
{
"epoch": 0.40718562874251496,
"grad_norm": 0.7496485710144043,
"learning_rate": 3.8493836123764984e-05,
"loss": 0.3836,
"step": 204
},
{
"epoch": 0.4091816367265469,
"grad_norm": 0.9037646651268005,
"learning_rate": 3.852923101759591e-05,
"loss": 0.3993,
"step": 205
},
{
"epoch": 0.4111776447105788,
"grad_norm": 0.8741063475608826,
"learning_rate": 3.856445367281923e-05,
"loss": 0.3948,
"step": 206
},
{
"epoch": 0.41317365269461076,
"grad_norm": 0.8445413112640381,
"learning_rate": 3.859950575761529e-05,
"loss": 0.4305,
"step": 207
},
{
"epoch": 0.4151696606786427,
"grad_norm": 0.9107454419136047,
"learning_rate": 3.8634388916046025e-05,
"loss": 0.4982,
"step": 208
},
{
"epoch": 0.4171656686626746,
"grad_norm": 0.7765053510665894,
"learning_rate": 3.866910476851757e-05,
"loss": 0.4147,
"step": 209
},
{
"epoch": 0.41916167664670656,
"grad_norm": 0.8410398364067078,
"learning_rate": 3.870365491223199e-05,
"loss": 0.4125,
"step": 210
},
{
"epoch": 0.42115768463073855,
"grad_norm": 0.8012726306915283,
"learning_rate": 3.8738040921628215e-05,
"loss": 0.3941,
"step": 211
},
{
"epoch": 0.4231536926147705,
"grad_norm": 0.8541998863220215,
"learning_rate": 3.877226434881253e-05,
"loss": 0.4327,
"step": 212
},
{
"epoch": 0.4251497005988024,
"grad_norm": 0.8243539929389954,
"learning_rate": 3.880632672397897e-05,
"loss": 0.4303,
"step": 213
},
{
"epoch": 0.42714570858283435,
"grad_norm": 0.8121338486671448,
"learning_rate": 3.884022955581985e-05,
"loss": 0.4301,
"step": 214
},
{
"epoch": 0.4291417165668663,
"grad_norm": 0.9100980758666992,
"learning_rate": 3.887397433192676e-05,
"loss": 0.4208,
"step": 215
},
{
"epoch": 0.4311377245508982,
"grad_norm": 0.748666524887085,
"learning_rate": 3.890756251918219e-05,
"loss": 0.3384,
"step": 216
},
{
"epoch": 0.43313373253493015,
"grad_norm": 0.758114755153656,
"learning_rate": 3.894099556414216e-05,
"loss": 0.3797,
"step": 217
},
{
"epoch": 0.4351297405189621,
"grad_norm": 0.8046779632568359,
"learning_rate": 3.897427489341009e-05,
"loss": 0.4325,
"step": 218
},
{
"epoch": 0.437125748502994,
"grad_norm": 0.872130274772644,
"learning_rate": 3.900740191400198e-05,
"loss": 0.4466,
"step": 219
},
{
"epoch": 0.43912175648702595,
"grad_norm": 0.8052610158920288,
"learning_rate": 3.904037801370344e-05,
"loss": 0.4355,
"step": 220
},
{
"epoch": 0.4411177644710579,
"grad_norm": 0.7204791903495789,
"learning_rate": 3.9073204561418514e-05,
"loss": 0.3465,
"step": 221
},
{
"epoch": 0.4431137724550898,
"grad_norm": 0.7979363799095154,
"learning_rate": 3.9105882907510644e-05,
"loss": 0.4004,
"step": 222
},
{
"epoch": 0.44510978043912175,
"grad_norm": 0.7269802093505859,
"learning_rate": 3.913841438413601e-05,
"loss": 0.4261,
"step": 223
},
{
"epoch": 0.4471057884231537,
"grad_norm": 0.6730761528015137,
"learning_rate": 3.917080030556938e-05,
"loss": 0.3192,
"step": 224
},
{
"epoch": 0.4491017964071856,
"grad_norm": 0.8741471767425537,
"learning_rate": 3.9203041968522716e-05,
"loss": 0.4663,
"step": 225
},
{
"epoch": 0.45109780439121755,
"grad_norm": 0.8293672800064087,
"learning_rate": 3.923514065245669e-05,
"loss": 0.4558,
"step": 226
},
{
"epoch": 0.4530938123752495,
"grad_norm": 0.7904106378555298,
"learning_rate": 3.926709761988538e-05,
"loss": 0.4546,
"step": 227
},
{
"epoch": 0.4550898203592814,
"grad_norm": 0.7640888094902039,
"learning_rate": 3.929891411667424e-05,
"loss": 0.3762,
"step": 228
},
{
"epoch": 0.45708582834331335,
"grad_norm": 0.776006281375885,
"learning_rate": 3.933059137233147e-05,
"loss": 0.4447,
"step": 229
},
{
"epoch": 0.4590818363273453,
"grad_norm": 0.8613069653511047,
"learning_rate": 3.9362130600293214e-05,
"loss": 0.4366,
"step": 230
},
{
"epoch": 0.46107784431137727,
"grad_norm": 0.7828835248947144,
"learning_rate": 3.9393532998202405e-05,
"loss": 0.4434,
"step": 231
},
{
"epoch": 0.4630738522954092,
"grad_norm": 0.7422530055046082,
"learning_rate": 3.942479974818166e-05,
"loss": 0.3755,
"step": 232
},
{
"epoch": 0.46506986027944114,
"grad_norm": 0.7256511449813843,
"learning_rate": 3.945593201710032e-05,
"loss": 0.375,
"step": 233
},
{
"epoch": 0.46706586826347307,
"grad_norm": 0.7594771385192871,
"learning_rate": 3.9486930956835724e-05,
"loss": 0.3985,
"step": 234
},
{
"epoch": 0.469061876247505,
"grad_norm": 0.7957077622413635,
"learning_rate": 3.951779770452894e-05,
"loss": 0.421,
"step": 235
},
{
"epoch": 0.47105788423153694,
"grad_norm": 0.7573441863059998,
"learning_rate": 3.954853338283512e-05,
"loss": 0.4592,
"step": 236
},
{
"epoch": 0.47305389221556887,
"grad_norm": 0.7109091877937317,
"learning_rate": 3.9579139100168404e-05,
"loss": 0.3732,
"step": 237
},
{
"epoch": 0.4750499001996008,
"grad_norm": 0.8672693371772766,
"learning_rate": 3.960961595094187e-05,
"loss": 0.4038,
"step": 238
},
{
"epoch": 0.47704590818363274,
"grad_norm": 0.7573640942573547,
"learning_rate": 3.96399650158023e-05,
"loss": 0.4348,
"step": 239
},
{
"epoch": 0.47904191616766467,
"grad_norm": 0.8784688711166382,
"learning_rate": 3.96701873618601e-05,
"loss": 0.4704,
"step": 240
},
{
"epoch": 0.4810379241516966,
"grad_norm": 0.8110889792442322,
"learning_rate": 3.970028404291448e-05,
"loss": 0.381,
"step": 241
},
{
"epoch": 0.48303393213572854,
"grad_norm": 0.8944825530052185,
"learning_rate": 3.9730256099673865e-05,
"loss": 0.3282,
"step": 242
},
{
"epoch": 0.48502994011976047,
"grad_norm": 0.8505921959877014,
"learning_rate": 3.976010455997187e-05,
"loss": 0.3794,
"step": 243
},
{
"epoch": 0.4870259481037924,
"grad_norm": 1.0878411531448364,
"learning_rate": 3.978983043897883e-05,
"loss": 0.4222,
"step": 244
},
{
"epoch": 0.48902195608782434,
"grad_norm": 0.7262081503868103,
"learning_rate": 3.981943473940888e-05,
"loss": 0.3682,
"step": 245
},
{
"epoch": 0.49101796407185627,
"grad_norm": 1.0304243564605713,
"learning_rate": 3.984891845172299e-05,
"loss": 0.3546,
"step": 246
},
{
"epoch": 0.4930139720558882,
"grad_norm": 0.7483956217765808,
"learning_rate": 3.987828255432777e-05,
"loss": 0.3764,
"step": 247
},
{
"epoch": 0.49500998003992014,
"grad_norm": 1.969207525253296,
"learning_rate": 3.9907528013770276e-05,
"loss": 0.4436,
"step": 248
},
{
"epoch": 0.49700598802395207,
"grad_norm": 0.836520254611969,
"learning_rate": 3.993665578492894e-05,
"loss": 0.4477,
"step": 249
},
{
"epoch": 0.499001996007984,
"grad_norm": 0.8878058791160583,
"learning_rate": 3.9965666811200624e-05,
"loss": 0.355,
"step": 250
},
{
"epoch": 0.500998003992016,
"grad_norm": 0.7905710935592651,
"learning_rate": 3.999456202468397e-05,
"loss": 0.4044,
"step": 251
},
{
"epoch": 0.5029940119760479,
"grad_norm": 0.7035382390022278,
"learning_rate": 4.002334234635907e-05,
"loss": 0.3515,
"step": 252
},
{
"epoch": 0.5049900199600799,
"grad_norm": 1.702528476715088,
"learning_rate": 4.005200868626364e-05,
"loss": 0.4055,
"step": 253
},
{
"epoch": 0.5069860279441117,
"grad_norm": 0.7991278171539307,
"learning_rate": 4.008056194366564e-05,
"loss": 0.4327,
"step": 254
},
{
"epoch": 0.5089820359281437,
"grad_norm": 0.803960382938385,
"learning_rate": 4.010900300723259e-05,
"loss": 0.4187,
"step": 255
},
{
"epoch": 0.5109780439121756,
"grad_norm": 0.7045860886573792,
"learning_rate": 4.013733275519749e-05,
"loss": 0.3947,
"step": 256
},
{
"epoch": 0.5129740518962076,
"grad_norm": 0.7627609372138977,
"learning_rate": 4.016555205552158e-05,
"loss": 0.3808,
"step": 257
},
{
"epoch": 0.5149700598802395,
"grad_norm": 0.7807031869888306,
"learning_rate": 4.0193661766053834e-05,
"loss": 0.4408,
"step": 258
},
{
"epoch": 0.5169660678642715,
"grad_norm": 0.7607232332229614,
"learning_rate": 4.022166273468753e-05,
"loss": 0.3826,
"step": 259
},
{
"epoch": 0.5189620758483033,
"grad_norm": 0.738200306892395,
"learning_rate": 4.024955579951363e-05,
"loss": 0.3403,
"step": 260
},
{
"epoch": 0.5209580838323353,
"grad_norm": 0.7401778101921082,
"learning_rate": 4.027734178897136e-05,
"loss": 0.3927,
"step": 261
},
{
"epoch": 0.5229540918163673,
"grad_norm": 0.8561487793922424,
"learning_rate": 4.030502152199576e-05,
"loss": 0.4247,
"step": 262
},
{
"epoch": 0.5249500998003992,
"grad_norm": 0.7845680117607117,
"learning_rate": 4.033259580816264e-05,
"loss": 0.4284,
"step": 263
},
{
"epoch": 0.5269461077844312,
"grad_norm": 0.8121227622032166,
"learning_rate": 4.036006544783052e-05,
"loss": 0.4534,
"step": 264
},
{
"epoch": 0.5289421157684631,
"grad_norm": 0.7015953660011292,
"learning_rate": 4.0387431232280135e-05,
"loss": 0.3404,
"step": 265
},
{
"epoch": 0.530938123752495,
"grad_norm": 0.7971146702766418,
"learning_rate": 4.041469394385112e-05,
"loss": 0.4455,
"step": 266
},
{
"epoch": 0.5329341317365269,
"grad_norm": 0.7655112147331238,
"learning_rate": 4.0441854356076257e-05,
"loss": 0.4636,
"step": 267
},
{
"epoch": 0.5349301397205589,
"grad_norm": 0.8320984840393066,
"learning_rate": 4.046891323381315e-05,
"loss": 0.3777,
"step": 268
},
{
"epoch": 0.5369261477045908,
"grad_norm": 0.9041264057159424,
"learning_rate": 4.049587133337347e-05,
"loss": 0.4006,
"step": 269
},
{
"epoch": 0.5389221556886228,
"grad_norm": 0.8236355185508728,
"learning_rate": 4.0522729402649793e-05,
"loss": 0.418,
"step": 270
},
{
"epoch": 0.5409181636726547,
"grad_norm": 0.9298795461654663,
"learning_rate": 4.0549488181240096e-05,
"loss": 0.3358,
"step": 271
},
{
"epoch": 0.5429141716566867,
"grad_norm": 0.7561654448509216,
"learning_rate": 4.057614840056998e-05,
"loss": 0.4008,
"step": 272
},
{
"epoch": 0.5449101796407185,
"grad_norm": 0.7712647318840027,
"learning_rate": 4.06027107840126e-05,
"loss": 0.3607,
"step": 273
},
{
"epoch": 0.5469061876247505,
"grad_norm": 0.7622309327125549,
"learning_rate": 4.0629176047006474e-05,
"loss": 0.3567,
"step": 274
},
{
"epoch": 0.5489021956087824,
"grad_norm": 0.7064681649208069,
"learning_rate": 4.065554489717105e-05,
"loss": 0.3528,
"step": 275
},
{
"epoch": 0.5508982035928144,
"grad_norm": 0.8189475536346436,
"learning_rate": 4.068181803442029e-05,
"loss": 0.4062,
"step": 276
},
{
"epoch": 0.5528942115768463,
"grad_norm": 0.8143854737281799,
"learning_rate": 4.0707996151074147e-05,
"loss": 0.4374,
"step": 277
},
{
"epoch": 0.5548902195608783,
"grad_norm": 0.7282266616821289,
"learning_rate": 4.073407993196794e-05,
"loss": 0.4121,
"step": 278
},
{
"epoch": 0.5568862275449101,
"grad_norm": 0.7541894316673279,
"learning_rate": 4.076007005455996e-05,
"loss": 0.4702,
"step": 279
},
{
"epoch": 0.5588822355289421,
"grad_norm": 0.7178213596343994,
"learning_rate": 4.0785967189036986e-05,
"loss": 0.3581,
"step": 280
},
{
"epoch": 0.5608782435129741,
"grad_norm": 0.8269951343536377,
"learning_rate": 4.0811771998418e-05,
"loss": 0.414,
"step": 281
},
{
"epoch": 0.562874251497006,
"grad_norm": 0.6949253082275391,
"learning_rate": 4.083748513865602e-05,
"loss": 0.3549,
"step": 282
},
{
"epoch": 0.564870259481038,
"grad_norm": 0.8457996845245361,
"learning_rate": 4.086310725873818e-05,
"loss": 0.4977,
"step": 283
},
{
"epoch": 0.5668662674650699,
"grad_norm": 0.835884690284729,
"learning_rate": 4.0888639000783966e-05,
"loss": 0.4646,
"step": 284
},
{
"epoch": 0.5688622754491018,
"grad_norm": 0.745847225189209,
"learning_rate": 4.0914081000141844e-05,
"loss": 0.4295,
"step": 285
},
{
"epoch": 0.5708582834331337,
"grad_norm": 0.703731119632721,
"learning_rate": 4.0939433885484055e-05,
"loss": 0.3168,
"step": 286
},
{
"epoch": 0.5728542914171657,
"grad_norm": 0.6979167461395264,
"learning_rate": 4.0964698278899874e-05,
"loss": 0.3373,
"step": 287
},
{
"epoch": 0.5748502994011976,
"grad_norm": 0.7321177124977112,
"learning_rate": 4.0989874795987185e-05,
"loss": 0.3705,
"step": 288
},
{
"epoch": 0.5768463073852296,
"grad_norm": 0.6812002658843994,
"learning_rate": 4.1014964045942465e-05,
"loss": 0.366,
"step": 289
},
{
"epoch": 0.5788423153692615,
"grad_norm": 0.8122517466545105,
"learning_rate": 4.103996663164927e-05,
"loss": 0.4435,
"step": 290
},
{
"epoch": 0.5808383233532934,
"grad_norm": 0.7670555710792542,
"learning_rate": 4.106488314976513e-05,
"loss": 0.471,
"step": 291
},
{
"epoch": 0.5828343313373253,
"grad_norm": 0.7457311749458313,
"learning_rate": 4.108971419080698e-05,
"loss": 0.3138,
"step": 292
},
{
"epoch": 0.5848303393213573,
"grad_norm": 0.8164945244789124,
"learning_rate": 4.111446033923516e-05,
"loss": 0.4394,
"step": 293
},
{
"epoch": 0.5868263473053892,
"grad_norm": 0.7513836622238159,
"learning_rate": 4.113912217353596e-05,
"loss": 0.3741,
"step": 294
},
{
"epoch": 0.5888223552894212,
"grad_norm": 0.7199726700782776,
"learning_rate": 4.116370026630272e-05,
"loss": 0.3116,
"step": 295
},
{
"epoch": 0.590818363273453,
"grad_norm": 0.8232783675193787,
"learning_rate": 4.118819518431564e-05,
"loss": 0.4048,
"step": 296
},
{
"epoch": 0.592814371257485,
"grad_norm": 0.7513990998268127,
"learning_rate": 4.121260748862021e-05,
"loss": 0.4346,
"step": 297
},
{
"epoch": 0.5948103792415169,
"grad_norm": 0.6866230368614197,
"learning_rate": 4.123693773460426e-05,
"loss": 0.3629,
"step": 298
},
{
"epoch": 0.5968063872255489,
"grad_norm": 0.7753307223320007,
"learning_rate": 4.126118647207383e-05,
"loss": 0.4248,
"step": 299
},
{
"epoch": 0.5988023952095808,
"grad_norm": 0.6598490476608276,
"learning_rate": 4.1285354245327715e-05,
"loss": 0.2834,
"step": 300
},
{
"epoch": 0.6007984031936128,
"grad_norm": 0.8024352788925171,
"learning_rate": 4.1309441593230726e-05,
"loss": 0.4276,
"step": 301
},
{
"epoch": 0.6027944111776448,
"grad_norm": 0.7745522260665894,
"learning_rate": 4.133344904928585e-05,
"loss": 0.3925,
"step": 302
},
{
"epoch": 0.6047904191616766,
"grad_norm": 0.6376944184303284,
"learning_rate": 4.1357377141705084e-05,
"loss": 0.2589,
"step": 303
},
{
"epoch": 0.6067864271457086,
"grad_norm": 0.6831088662147522,
"learning_rate": 4.1381226393479236e-05,
"loss": 0.3705,
"step": 304
},
{
"epoch": 0.6087824351297405,
"grad_norm": 0.6832078695297241,
"learning_rate": 4.1404997322446435e-05,
"loss": 0.3637,
"step": 305
},
{
"epoch": 0.6107784431137725,
"grad_norm": 0.7155686020851135,
"learning_rate": 4.142869044135967e-05,
"loss": 0.477,
"step": 306
},
{
"epoch": 0.6127744510978044,
"grad_norm": 0.7326770424842834,
"learning_rate": 4.145230625795311e-05,
"loss": 0.4123,
"step": 307
},
{
"epoch": 0.6147704590818364,
"grad_norm": 0.7184780240058899,
"learning_rate": 4.14758452750074e-05,
"loss": 0.3382,
"step": 308
},
{
"epoch": 0.6167664670658682,
"grad_norm": 0.7494658827781677,
"learning_rate": 4.149930799041392e-05,
"loss": 0.4246,
"step": 309
},
{
"epoch": 0.6187624750499002,
"grad_norm": 0.697238028049469,
"learning_rate": 4.152269489723788e-05,
"loss": 0.4338,
"step": 310
},
{
"epoch": 0.6207584830339321,
"grad_norm": 0.6342530846595764,
"learning_rate": 4.1546006483780626e-05,
"loss": 0.3202,
"step": 311
},
{
"epoch": 0.6227544910179641,
"grad_norm": 0.7153366804122925,
"learning_rate": 4.156924323364072e-05,
"loss": 0.3778,
"step": 312
},
{
"epoch": 0.624750499001996,
"grad_norm": 0.6666108965873718,
"learning_rate": 4.1592405625774144e-05,
"loss": 0.346,
"step": 313
},
{
"epoch": 0.626746506986028,
"grad_norm": 0.7076640725135803,
"learning_rate": 4.161549413455358e-05,
"loss": 0.3827,
"step": 314
},
{
"epoch": 0.6287425149700598,
"grad_norm": 0.7535362839698792,
"learning_rate": 4.163850922982668e-05,
"loss": 0.401,
"step": 315
},
{
"epoch": 0.6307385229540918,
"grad_norm": 0.6954286098480225,
"learning_rate": 4.16614513769734e-05,
"loss": 0.376,
"step": 316
},
{
"epoch": 0.6327345309381237,
"grad_norm": 0.6925478577613831,
"learning_rate": 4.1684321036962526e-05,
"loss": 0.3638,
"step": 317
},
{
"epoch": 0.6347305389221557,
"grad_norm": 0.663144588470459,
"learning_rate": 4.170711866640721e-05,
"loss": 0.3558,
"step": 318
},
{
"epoch": 0.6367265469061876,
"grad_norm": 0.7284447550773621,
"learning_rate": 4.1729844717619684e-05,
"loss": 0.4159,
"step": 319
},
{
"epoch": 0.6387225548902196,
"grad_norm": 0.708574652671814,
"learning_rate": 4.17524996386651e-05,
"loss": 0.3942,
"step": 320
},
{
"epoch": 0.6407185628742516,
"grad_norm": 0.6826594471931458,
"learning_rate": 4.177508387341454e-05,
"loss": 0.3563,
"step": 321
},
{
"epoch": 0.6427145708582834,
"grad_norm": 0.7092903256416321,
"learning_rate": 4.179759786159719e-05,
"loss": 0.4169,
"step": 322
},
{
"epoch": 0.6447105788423154,
"grad_norm": 0.6470283269882202,
"learning_rate": 4.182004203885172e-05,
"loss": 0.3595,
"step": 323
},
{
"epoch": 0.6467065868263473,
"grad_norm": 0.6560471057891846,
"learning_rate": 4.184241683677687e-05,
"loss": 0.3945,
"step": 324
},
{
"epoch": 0.6487025948103793,
"grad_norm": 0.7021344900131226,
"learning_rate": 4.1864722682981245e-05,
"loss": 0.3682,
"step": 325
},
{
"epoch": 0.6506986027944112,
"grad_norm": 0.6736760139465332,
"learning_rate": 4.188696000113232e-05,
"loss": 0.4012,
"step": 326
},
{
"epoch": 0.6526946107784432,
"grad_norm": 0.58335942029953,
"learning_rate": 4.190912921100477e-05,
"loss": 0.2982,
"step": 327
},
{
"epoch": 0.654690618762475,
"grad_norm": 0.7224960327148438,
"learning_rate": 4.1931230728527994e-05,
"loss": 0.3767,
"step": 328
},
{
"epoch": 0.656686626746507,
"grad_norm": 0.7125536203384399,
"learning_rate": 4.195326496583291e-05,
"loss": 0.3918,
"step": 329
},
{
"epoch": 0.6586826347305389,
"grad_norm": 0.7161789536476135,
"learning_rate": 4.1975232331298125e-05,
"loss": 0.3727,
"step": 330
},
{
"epoch": 0.6606786427145709,
"grad_norm": 0.7045012712478638,
"learning_rate": 4.1997133229595316e-05,
"loss": 0.4168,
"step": 331
},
{
"epoch": 0.6626746506986028,
"grad_norm": 0.7229664921760559,
"learning_rate": 4.201896806173394e-05,
"loss": 0.406,
"step": 332
},
{
"epoch": 0.6646706586826348,
"grad_norm": 0.6685640811920166,
"learning_rate": 4.2040737225105335e-05,
"loss": 0.3348,
"step": 333
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.6416003108024597,
"learning_rate": 4.206244111352608e-05,
"loss": 0.3134,
"step": 334
},
{
"epoch": 0.6686626746506986,
"grad_norm": 0.6860243082046509,
"learning_rate": 4.2084080117280756e-05,
"loss": 0.3855,
"step": 335
},
{
"epoch": 0.6706586826347305,
"grad_norm": 0.751287579536438,
"learning_rate": 4.210565462316407e-05,
"loss": 0.4388,
"step": 336
},
{
"epoch": 0.6726546906187625,
"grad_norm": 0.7298620939254761,
"learning_rate": 4.2127165014522315e-05,
"loss": 0.4084,
"step": 337
},
{
"epoch": 0.6746506986027944,
"grad_norm": 0.7535167336463928,
"learning_rate": 4.214861167129425e-05,
"loss": 0.3971,
"step": 338
},
{
"epoch": 0.6766467065868264,
"grad_norm": 0.6288606524467468,
"learning_rate": 4.2169994970051365e-05,
"loss": 0.3184,
"step": 339
},
{
"epoch": 0.6786427145708582,
"grad_norm": 0.6942071914672852,
"learning_rate": 4.219131528403759e-05,
"loss": 0.4085,
"step": 340
},
{
"epoch": 0.6806387225548902,
"grad_norm": 0.7049132585525513,
"learning_rate": 4.22125729832083e-05,
"loss": 0.3799,
"step": 341
},
{
"epoch": 0.6826347305389222,
"grad_norm": 0.6633714437484741,
"learning_rate": 4.2233768434268914e-05,
"loss": 0.3615,
"step": 342
},
{
"epoch": 0.6846307385229541,
"grad_norm": 0.7143837809562683,
"learning_rate": 4.225490200071284e-05,
"loss": 0.397,
"step": 343
},
{
"epoch": 0.6866267465069861,
"grad_norm": 0.6334770917892456,
"learning_rate": 4.227597404285883e-05,
"loss": 0.3192,
"step": 344
},
{
"epoch": 0.688622754491018,
"grad_norm": 0.6318526268005371,
"learning_rate": 4.229698491788791e-05,
"loss": 0.3409,
"step": 345
},
{
"epoch": 0.6906187624750499,
"grad_norm": 0.6425897479057312,
"learning_rate": 4.231793497987961e-05,
"loss": 0.3506,
"step": 346
},
{
"epoch": 0.6926147704590818,
"grad_norm": 0.6882063150405884,
"learning_rate": 4.2338824579847904e-05,
"loss": 0.3697,
"step": 347
},
{
"epoch": 0.6946107784431138,
"grad_norm": 0.6814457774162292,
"learning_rate": 4.235965406577636e-05,
"loss": 0.4179,
"step": 348
},
{
"epoch": 0.6966067864271457,
"grad_norm": 0.7089083790779114,
"learning_rate": 4.2380423782653e-05,
"loss": 0.358,
"step": 349
},
{
"epoch": 0.6986027944111777,
"grad_norm": 0.671987771987915,
"learning_rate": 4.240113407250459e-05,
"loss": 0.4223,
"step": 350
},
{
"epoch": 0.7005988023952096,
"grad_norm": 0.6932473182678223,
"learning_rate": 4.24217852744304e-05,
"loss": 0.4283,
"step": 351
},
{
"epoch": 0.7025948103792415,
"grad_norm": 0.6401710510253906,
"learning_rate": 4.244237772463552e-05,
"loss": 0.3277,
"step": 352
},
{
"epoch": 0.7045908183632734,
"grad_norm": 0.5808695554733276,
"learning_rate": 4.246291175646371e-05,
"loss": 0.3153,
"step": 353
},
{
"epoch": 0.7065868263473054,
"grad_norm": 0.5929372310638428,
"learning_rate": 4.24833877004298e-05,
"loss": 0.2934,
"step": 354
},
{
"epoch": 0.7085828343313373,
"grad_norm": 0.6138365864753723,
"learning_rate": 4.250380588425157e-05,
"loss": 0.2647,
"step": 355
},
{
"epoch": 0.7105788423153693,
"grad_norm": 0.69126957654953,
"learning_rate": 4.2524166632881255e-05,
"loss": 0.3777,
"step": 356
},
{
"epoch": 0.7125748502994012,
"grad_norm": 0.618993878364563,
"learning_rate": 4.254447026853656e-05,
"loss": 0.2874,
"step": 357
},
{
"epoch": 0.7145708582834331,
"grad_norm": 0.6197064518928528,
"learning_rate": 4.2564717110731244e-05,
"loss": 0.3137,
"step": 358
},
{
"epoch": 0.716566866267465,
"grad_norm": 0.6574029326438904,
"learning_rate": 4.258490747630532e-05,
"loss": 0.3366,
"step": 359
},
{
"epoch": 0.718562874251497,
"grad_norm": 0.6827244162559509,
"learning_rate": 4.260504167945479e-05,
"loss": 0.367,
"step": 360
},
{
"epoch": 0.720558882235529,
"grad_norm": 0.6920093297958374,
"learning_rate": 4.2625120031760965e-05,
"loss": 0.3473,
"step": 361
},
{
"epoch": 0.7225548902195609,
"grad_norm": 0.6315056085586548,
"learning_rate": 4.264514284221944e-05,
"loss": 0.3477,
"step": 362
},
{
"epoch": 0.7245508982035929,
"grad_norm": 0.6894274950027466,
"learning_rate": 4.266511041726854e-05,
"loss": 0.3818,
"step": 363
},
{
"epoch": 0.7265469061876247,
"grad_norm": 0.7182605266571045,
"learning_rate": 4.26850230608176e-05,
"loss": 0.3959,
"step": 364
},
{
"epoch": 0.7285429141716567,
"grad_norm": 0.6431974172592163,
"learning_rate": 4.2704881074274584e-05,
"loss": 0.3484,
"step": 365
},
{
"epoch": 0.7305389221556886,
"grad_norm": 0.6523058414459229,
"learning_rate": 4.272468475657351e-05,
"loss": 0.3315,
"step": 366
},
{
"epoch": 0.7325349301397206,
"grad_norm": 0.7160993218421936,
"learning_rate": 4.2744434404201497e-05,
"loss": 0.3806,
"step": 367
},
{
"epoch": 0.7345309381237525,
"grad_norm": 0.6819020509719849,
"learning_rate": 4.27641303112253e-05,
"loss": 0.3889,
"step": 368
},
{
"epoch": 0.7365269461077845,
"grad_norm": 0.5881057381629944,
"learning_rate": 4.278377276931767e-05,
"loss": 0.2647,
"step": 369
},
{
"epoch": 0.7385229540918163,
"grad_norm": 1.0767422914505005,
"learning_rate": 4.2803362067783256e-05,
"loss": 0.3912,
"step": 370
},
{
"epoch": 0.7405189620758483,
"grad_norm": 0.6878696084022522,
"learning_rate": 4.2822898493584104e-05,
"loss": 0.4216,
"step": 371
},
{
"epoch": 0.7425149700598802,
"grad_norm": 0.6871569752693176,
"learning_rate": 4.284238233136496e-05,
"loss": 0.395,
"step": 372
},
{
"epoch": 0.7445109780439122,
"grad_norm": 0.6874458193778992,
"learning_rate": 4.286181386347813e-05,
"loss": 0.3683,
"step": 373
},
{
"epoch": 0.7465069860279441,
"grad_norm": 0.6394293308258057,
"learning_rate": 4.288119337000801e-05,
"loss": 0.3518,
"step": 374
},
{
"epoch": 0.7485029940119761,
"grad_norm": 0.67393559217453,
"learning_rate": 4.2900521128795315e-05,
"loss": 0.4018,
"step": 375
},
{
"epoch": 0.7504990019960079,
"grad_norm": 0.6365067958831787,
"learning_rate": 4.291979741546102e-05,
"loss": 0.3719,
"step": 376
},
{
"epoch": 0.7524950099800399,
"grad_norm": 0.6792694926261902,
"learning_rate": 4.293902250342989e-05,
"loss": 0.3623,
"step": 377
},
{
"epoch": 0.7544910179640718,
"grad_norm": 0.794163167476654,
"learning_rate": 4.295819666395376e-05,
"loss": 0.3945,
"step": 378
},
{
"epoch": 0.7564870259481038,
"grad_norm": 0.7103076577186584,
"learning_rate": 4.297732016613454e-05,
"loss": 0.4585,
"step": 379
},
{
"epoch": 0.7584830339321357,
"grad_norm": 0.6877479553222656,
"learning_rate": 4.299639327694684e-05,
"loss": 0.4261,
"step": 380
},
{
"epoch": 0.7604790419161677,
"grad_norm": 0.6512800455093384,
"learning_rate": 4.3015416261260325e-05,
"loss": 0.336,
"step": 381
},
{
"epoch": 0.7624750499001997,
"grad_norm": 0.6555919051170349,
"learning_rate": 4.303438938186182e-05,
"loss": 0.3949,
"step": 382
},
{
"epoch": 0.7644710578842315,
"grad_norm": 0.6375437378883362,
"learning_rate": 4.305331289947705e-05,
"loss": 0.348,
"step": 383
},
{
"epoch": 0.7664670658682635,
"grad_norm": 0.6899069547653198,
"learning_rate": 4.3072187072792184e-05,
"loss": 0.3715,
"step": 384
},
{
"epoch": 0.7684630738522954,
"grad_norm": 0.6571375727653503,
"learning_rate": 4.309101215847502e-05,
"loss": 0.3471,
"step": 385
},
{
"epoch": 0.7704590818363274,
"grad_norm": 0.6866909265518188,
"learning_rate": 4.3109788411195924e-05,
"loss": 0.3721,
"step": 386
},
{
"epoch": 0.7724550898203593,
"grad_norm": 0.6416053175926208,
"learning_rate": 4.312851608364853e-05,
"loss": 0.3501,
"step": 387
},
{
"epoch": 0.7744510978043913,
"grad_norm": 0.6585414409637451,
"learning_rate": 4.314719542657013e-05,
"loss": 0.3446,
"step": 388
},
{
"epoch": 0.7764471057884231,
"grad_norm": 0.6449529528617859,
"learning_rate": 4.3165826688761796e-05,
"loss": 0.31,
"step": 389
},
{
"epoch": 0.7784431137724551,
"grad_norm": 0.6616773009300232,
"learning_rate": 4.318441011710833e-05,
"loss": 0.3356,
"step": 390
},
{
"epoch": 0.780439121756487,
"grad_norm": 0.681754469871521,
"learning_rate": 4.3202945956597786e-05,
"loss": 0.3543,
"step": 391
},
{
"epoch": 0.782435129740519,
"grad_norm": 0.6211993098258972,
"learning_rate": 4.3221434450340956e-05,
"loss": 0.3157,
"step": 392
},
{
"epoch": 0.7844311377245509,
"grad_norm": 0.6262781620025635,
"learning_rate": 4.323987583959045e-05,
"loss": 0.3533,
"step": 393
},
{
"epoch": 0.7864271457085829,
"grad_norm": 0.6640245318412781,
"learning_rate": 4.325827036375957e-05,
"loss": 0.3742,
"step": 394
},
{
"epoch": 0.7884231536926147,
"grad_norm": 0.6164320111274719,
"learning_rate": 4.327661826044101e-05,
"loss": 0.3472,
"step": 395
},
{
"epoch": 0.7904191616766467,
"grad_norm": 0.6439725756645203,
"learning_rate": 4.329491976542521e-05,
"loss": 0.359,
"step": 396
},
{
"epoch": 0.7924151696606786,
"grad_norm": 0.7187615036964417,
"learning_rate": 4.331317511271859e-05,
"loss": 0.4445,
"step": 397
},
{
"epoch": 0.7944111776447106,
"grad_norm": 0.660010039806366,
"learning_rate": 4.333138453456147e-05,
"loss": 0.3213,
"step": 398
},
{
"epoch": 0.7964071856287425,
"grad_norm": 0.7590385675430298,
"learning_rate": 4.334954826144581e-05,
"loss": 0.3359,
"step": 399
},
{
"epoch": 0.7984031936127745,
"grad_norm": 0.6344367861747742,
"learning_rate": 4.336766652213271e-05,
"loss": 0.3542,
"step": 400
},
{
"epoch": 0.8003992015968064,
"grad_norm": 0.6679601073265076,
"learning_rate": 4.338573954366971e-05,
"loss": 0.3642,
"step": 401
},
{
"epoch": 0.8023952095808383,
"grad_norm": 0.6402161121368408,
"learning_rate": 4.340376755140784e-05,
"loss": 0.3603,
"step": 402
},
{
"epoch": 0.8043912175648703,
"grad_norm": 0.7084898948669434,
"learning_rate": 4.342175076901849e-05,
"loss": 0.3817,
"step": 403
},
{
"epoch": 0.8063872255489022,
"grad_norm": 0.6191865801811218,
"learning_rate": 4.343968941851009e-05,
"loss": 0.3017,
"step": 404
},
{
"epoch": 0.8083832335329342,
"grad_norm": 0.6750943660736084,
"learning_rate": 4.345758372024448e-05,
"loss": 0.3949,
"step": 405
},
{
"epoch": 0.810379241516966,
"grad_norm": 0.6468753814697266,
"learning_rate": 4.347543389295324e-05,
"loss": 0.3668,
"step": 406
},
{
"epoch": 0.812375249500998,
"grad_norm": 0.6904520988464355,
"learning_rate": 4.3493240153753666e-05,
"loss": 0.3499,
"step": 407
},
{
"epoch": 0.8143712574850299,
"grad_norm": 0.6204891800880432,
"learning_rate": 4.3511002718164666e-05,
"loss": 0.3304,
"step": 408
},
{
"epoch": 0.8163672654690619,
"grad_norm": 0.6633168458938599,
"learning_rate": 4.352872180012237e-05,
"loss": 0.3337,
"step": 409
},
{
"epoch": 0.8183632734530938,
"grad_norm": 0.8691318035125732,
"learning_rate": 4.35463976119956e-05,
"loss": 0.4502,
"step": 410
},
{
"epoch": 0.8203592814371258,
"grad_norm": 0.7373143434524536,
"learning_rate": 4.356403036460115e-05,
"loss": 0.4128,
"step": 411
},
{
"epoch": 0.8223552894211577,
"grad_norm": 0.6885534524917603,
"learning_rate": 4.3581620267218916e-05,
"loss": 0.3341,
"step": 412
},
{
"epoch": 0.8243512974051896,
"grad_norm": 0.6862485408782959,
"learning_rate": 4.359916752760669e-05,
"loss": 0.3498,
"step": 413
},
{
"epoch": 0.8263473053892215,
"grad_norm": 0.6959711313247681,
"learning_rate": 4.361667235201499e-05,
"loss": 0.3796,
"step": 414
},
{
"epoch": 0.8283433133732535,
"grad_norm": 0.7265036106109619,
"learning_rate": 4.363413494520154e-05,
"loss": 0.3911,
"step": 415
},
{
"epoch": 0.8303393213572854,
"grad_norm": 0.6805566549301147,
"learning_rate": 4.365155551044572e-05,
"loss": 0.367,
"step": 416
},
{
"epoch": 0.8323353293413174,
"grad_norm": 0.6219791173934937,
"learning_rate": 4.366893424956263e-05,
"loss": 0.289,
"step": 417
},
{
"epoch": 0.8343313373253493,
"grad_norm": 0.6582449674606323,
"learning_rate": 4.368627136291726e-05,
"loss": 0.2747,
"step": 418
},
{
"epoch": 0.8363273453093812,
"grad_norm": 0.6985988616943359,
"learning_rate": 4.370356704943825e-05,
"loss": 0.3435,
"step": 419
},
{
"epoch": 0.8383233532934131,
"grad_norm": 0.6607214212417603,
"learning_rate": 4.372082150663168e-05,
"loss": 0.3645,
"step": 420
},
{
"epoch": 0.8403193612774451,
"grad_norm": 0.723174512386322,
"learning_rate": 4.3738034930594475e-05,
"loss": 0.3672,
"step": 421
},
{
"epoch": 0.8423153692614771,
"grad_norm": 0.6832453012466431,
"learning_rate": 4.3755207516027904e-05,
"loss": 0.3806,
"step": 422
},
{
"epoch": 0.844311377245509,
"grad_norm": 0.6922501921653748,
"learning_rate": 4.377233945625071e-05,
"loss": 0.4031,
"step": 423
},
{
"epoch": 0.846307385229541,
"grad_norm": 0.6647071242332458,
"learning_rate": 4.378943094321221e-05,
"loss": 0.3628,
"step": 424
},
{
"epoch": 0.8483033932135728,
"grad_norm": 0.6893953084945679,
"learning_rate": 4.3806482167505196e-05,
"loss": 0.3434,
"step": 425
},
{
"epoch": 0.8502994011976048,
"grad_norm": 0.8566087484359741,
"learning_rate": 4.382349331837866e-05,
"loss": 0.3803,
"step": 426
},
{
"epoch": 0.8522954091816367,
"grad_norm": 0.7948191165924072,
"learning_rate": 4.3840464583750404e-05,
"loss": 0.3627,
"step": 427
},
{
"epoch": 0.8542914171656687,
"grad_norm": 0.6731837391853333,
"learning_rate": 4.385739615021954e-05,
"loss": 0.395,
"step": 428
},
{
"epoch": 0.8562874251497006,
"grad_norm": 0.6760764122009277,
"learning_rate": 4.387428820307874e-05,
"loss": 0.3627,
"step": 429
},
{
"epoch": 0.8582834331337326,
"grad_norm": 0.7794198989868164,
"learning_rate": 4.3891140926326446e-05,
"loss": 0.3166,
"step": 430
},
{
"epoch": 0.8602794411177644,
"grad_norm": 0.7948319911956787,
"learning_rate": 4.390795450267886e-05,
"loss": 0.3634,
"step": 431
},
{
"epoch": 0.8622754491017964,
"grad_norm": 0.6758688688278198,
"learning_rate": 4.3924729113581876e-05,
"loss": 0.3103,
"step": 432
},
{
"epoch": 0.8642714570858283,
"grad_norm": 1.493560791015625,
"learning_rate": 4.394146493922276e-05,
"loss": 0.3551,
"step": 433
},
{
"epoch": 0.8662674650698603,
"grad_norm": 0.6501355171203613,
"learning_rate": 4.395816215854185e-05,
"loss": 0.3433,
"step": 434
},
{
"epoch": 0.8682634730538922,
"grad_norm": 0.7338974475860596,
"learning_rate": 4.397482094924396e-05,
"loss": 0.3748,
"step": 435
},
{
"epoch": 0.8702594810379242,
"grad_norm": 0.7021346688270569,
"learning_rate": 4.399144148780977e-05,
"loss": 0.3988,
"step": 436
},
{
"epoch": 0.872255489021956,
"grad_norm": 0.8264355659484863,
"learning_rate": 4.400802394950703e-05,
"loss": 0.3821,
"step": 437
},
{
"epoch": 0.874251497005988,
"grad_norm": 0.7332090139389038,
"learning_rate": 4.402456850840166e-05,
"loss": 0.3212,
"step": 438
},
{
"epoch": 0.8762475049900199,
"grad_norm": 0.7158175706863403,
"learning_rate": 4.4041075337368695e-05,
"loss": 0.3014,
"step": 439
},
{
"epoch": 0.8782435129740519,
"grad_norm": 0.6871099472045898,
"learning_rate": 4.405754460810312e-05,
"loss": 0.3363,
"step": 440
},
{
"epoch": 0.8802395209580839,
"grad_norm": 0.7581283450126648,
"learning_rate": 4.407397649113065e-05,
"loss": 0.3706,
"step": 441
},
{
"epoch": 0.8822355289421158,
"grad_norm": 0.7075430154800415,
"learning_rate": 4.40903711558182e-05,
"loss": 0.3625,
"step": 442
},
{
"epoch": 0.8842315369261478,
"grad_norm": 0.6902301907539368,
"learning_rate": 4.41067287703845e-05,
"loss": 0.3459,
"step": 443
},
{
"epoch": 0.8862275449101796,
"grad_norm": 0.7632633447647095,
"learning_rate": 4.412304950191033e-05,
"loss": 0.3863,
"step": 444
},
{
"epoch": 0.8882235528942116,
"grad_norm": 0.8091756701469421,
"learning_rate": 4.413933351634886e-05,
"loss": 0.3873,
"step": 445
},
{
"epoch": 0.8902195608782435,
"grad_norm": 0.7229244709014893,
"learning_rate": 4.4155580978535707e-05,
"loss": 0.3199,
"step": 446
},
{
"epoch": 0.8922155688622755,
"grad_norm": 0.6914481520652771,
"learning_rate": 4.417179205219895e-05,
"loss": 0.3679,
"step": 447
},
{
"epoch": 0.8942115768463074,
"grad_norm": 0.6364032030105591,
"learning_rate": 4.418796689996907e-05,
"loss": 0.2962,
"step": 448
},
{
"epoch": 0.8962075848303394,
"grad_norm": 0.7445045113563538,
"learning_rate": 4.420410568338872e-05,
"loss": 0.4021,
"step": 449
},
{
"epoch": 0.8982035928143712,
"grad_norm": 0.6447579264640808,
"learning_rate": 4.42202085629224e-05,
"loss": 0.3129,
"step": 450
},
{
"epoch": 0.9001996007984032,
"grad_norm": 0.7040254473686218,
"learning_rate": 4.423627569796601e-05,
"loss": 0.3672,
"step": 451
},
{
"epoch": 0.9021956087824351,
"grad_norm": 0.6750066876411438,
"learning_rate": 4.425230724685638e-05,
"loss": 0.4024,
"step": 452
},
{
"epoch": 0.9041916167664671,
"grad_norm": 0.7186387181282043,
"learning_rate": 4.4268303366880536e-05,
"loss": 0.355,
"step": 453
},
{
"epoch": 0.906187624750499,
"grad_norm": 0.7389270663261414,
"learning_rate": 4.428426421428507e-05,
"loss": 0.4207,
"step": 454
},
{
"epoch": 0.908183632734531,
"grad_norm": 0.6795611381530762,
"learning_rate": 4.430018994428521e-05,
"loss": 0.3068,
"step": 455
},
{
"epoch": 0.9101796407185628,
"grad_norm": 0.6613329648971558,
"learning_rate": 4.431608071107392e-05,
"loss": 0.3828,
"step": 456
},
{
"epoch": 0.9121756487025948,
"grad_norm": 0.7048102021217346,
"learning_rate": 4.433193666783084e-05,
"loss": 0.3921,
"step": 457
},
{
"epoch": 0.9141716566866267,
"grad_norm": 0.7187650203704834,
"learning_rate": 4.4347757966731156e-05,
"loss": 0.2997,
"step": 458
},
{
"epoch": 0.9161676646706587,
"grad_norm": 0.7008907794952393,
"learning_rate": 4.436354475895436e-05,
"loss": 0.3478,
"step": 459
},
{
"epoch": 0.9181636726546906,
"grad_norm": 0.6574254035949707,
"learning_rate": 4.437929719469291e-05,
"loss": 0.317,
"step": 460
},
{
"epoch": 0.9201596806387226,
"grad_norm": 0.6908730864524841,
"learning_rate": 4.4395015423160807e-05,
"loss": 0.3268,
"step": 461
},
{
"epoch": 0.9221556886227545,
"grad_norm": 0.676114559173584,
"learning_rate": 4.4410699592602094e-05,
"loss": 0.3791,
"step": 462
},
{
"epoch": 0.9241516966067864,
"grad_norm": 0.6226547956466675,
"learning_rate": 4.442634985029922e-05,
"loss": 0.36,
"step": 463
},
{
"epoch": 0.9261477045908184,
"grad_norm": 0.6422531604766846,
"learning_rate": 4.444196634258136e-05,
"loss": 0.379,
"step": 464
},
{
"epoch": 0.9281437125748503,
"grad_norm": 0.7371797561645508,
"learning_rate": 4.4457549214832566e-05,
"loss": 0.3696,
"step": 465
},
{
"epoch": 0.9301397205588823,
"grad_norm": 0.6225396394729614,
"learning_rate": 4.44730986115e-05,
"loss": 0.345,
"step": 466
},
{
"epoch": 0.9321357285429142,
"grad_norm": 0.6568498611450195,
"learning_rate": 4.448861467610187e-05,
"loss": 0.4367,
"step": 467
},
{
"epoch": 0.9341317365269461,
"grad_norm": 0.6361973881721497,
"learning_rate": 4.4504097551235406e-05,
"loss": 0.3615,
"step": 468
},
{
"epoch": 0.936127744510978,
"grad_norm": 0.5645039081573486,
"learning_rate": 4.4519547378584725e-05,
"loss": 0.2511,
"step": 469
},
{
"epoch": 0.93812375249501,
"grad_norm": 1.5839265584945679,
"learning_rate": 4.453496429892863e-05,
"loss": 0.3438,
"step": 470
},
{
"epoch": 0.9401197604790419,
"grad_norm": 0.7127808928489685,
"learning_rate": 4.455034845214827e-05,
"loss": 0.4078,
"step": 471
},
{
"epoch": 0.9421157684630739,
"grad_norm": 0.9536606073379517,
"learning_rate": 4.4565699977234796e-05,
"loss": 0.3297,
"step": 472
},
{
"epoch": 0.9441117764471058,
"grad_norm": 0.6458728313446045,
"learning_rate": 4.458101901229686e-05,
"loss": 0.3305,
"step": 473
},
{
"epoch": 0.9461077844311377,
"grad_norm": 0.7509250640869141,
"learning_rate": 4.459630569456809e-05,
"loss": 0.345,
"step": 474
},
{
"epoch": 0.9481037924151696,
"grad_norm": 2.1286840438842773,
"learning_rate": 4.461156016041444e-05,
"loss": 0.4174,
"step": 475
},
{
"epoch": 0.9500998003992016,
"grad_norm": 0.668644905090332,
"learning_rate": 4.462678254534156e-05,
"loss": 0.3657,
"step": 476
},
{
"epoch": 0.9520958083832335,
"grad_norm": 0.7153406739234924,
"learning_rate": 4.464197298400191e-05,
"loss": 0.3401,
"step": 477
},
{
"epoch": 0.9540918163672655,
"grad_norm": 0.62980717420578,
"learning_rate": 4.4657131610201994e-05,
"loss": 0.316,
"step": 478
},
{
"epoch": 0.9560878243512974,
"grad_norm": 0.733650803565979,
"learning_rate": 4.467225855690939e-05,
"loss": 0.4096,
"step": 479
},
{
"epoch": 0.9580838323353293,
"grad_norm": 0.9371464252471924,
"learning_rate": 4.468735395625979e-05,
"loss": 0.4383,
"step": 480
},
{
"epoch": 0.9600798403193613,
"grad_norm": 0.6547588109970093,
"learning_rate": 4.470241793956387e-05,
"loss": 0.3269,
"step": 481
},
{
"epoch": 0.9620758483033932,
"grad_norm": 0.6767633557319641,
"learning_rate": 4.471745063731416e-05,
"loss": 0.338,
"step": 482
},
{
"epoch": 0.9640718562874252,
"grad_norm": 0.691611111164093,
"learning_rate": 4.473245217919187e-05,
"loss": 0.3583,
"step": 483
},
{
"epoch": 0.9660678642714571,
"grad_norm": 0.6319297552108765,
"learning_rate": 4.474742269407355e-05,
"loss": 0.333,
"step": 484
},
{
"epoch": 0.9680638722554891,
"grad_norm": 0.6804649829864502,
"learning_rate": 4.476236231003773e-05,
"loss": 0.388,
"step": 485
},
{
"epoch": 0.9700598802395209,
"grad_norm": 0.7119168043136597,
"learning_rate": 4.477727115437156e-05,
"loss": 0.3867,
"step": 486
},
{
"epoch": 0.9720558882235529,
"grad_norm": 0.6172801852226257,
"learning_rate": 4.479214935357724e-05,
"loss": 0.312,
"step": 487
},
{
"epoch": 0.9740518962075848,
"grad_norm": 0.8452144265174866,
"learning_rate": 4.480699703337852e-05,
"loss": 0.4059,
"step": 488
},
{
"epoch": 0.9760479041916168,
"grad_norm": 0.6802703142166138,
"learning_rate": 4.4821814318727016e-05,
"loss": 0.3789,
"step": 489
},
{
"epoch": 0.9780439121756487,
"grad_norm": 0.6583143472671509,
"learning_rate": 4.483660133380856e-05,
"loss": 0.3354,
"step": 490
},
{
"epoch": 0.9800399201596807,
"grad_norm": 0.6605017781257629,
"learning_rate": 4.485135820204948e-05,
"loss": 0.3842,
"step": 491
},
{
"epoch": 0.9820359281437125,
"grad_norm": 0.7111901640892029,
"learning_rate": 4.486608504612267e-05,
"loss": 0.432,
"step": 492
},
{
"epoch": 0.9840319361277445,
"grad_norm": 0.6553547978401184,
"learning_rate": 4.488078198795383e-05,
"loss": 0.3503,
"step": 493
},
{
"epoch": 0.9860279441117764,
"grad_norm": 0.8542457818984985,
"learning_rate": 4.489544914872745e-05,
"loss": 0.354,
"step": 494
},
{
"epoch": 0.9880239520958084,
"grad_norm": 0.680438220500946,
"learning_rate": 4.4910086648892815e-05,
"loss": 0.3528,
"step": 495
},
{
"epoch": 0.9900199600798403,
"grad_norm": 0.6407065987586975,
"learning_rate": 4.4924694608169965e-05,
"loss": 0.3698,
"step": 496
},
{
"epoch": 0.9920159680638723,
"grad_norm": 0.6616628170013428,
"learning_rate": 4.4939273145555536e-05,
"loss": 0.3878,
"step": 497
},
{
"epoch": 0.9940119760479041,
"grad_norm": 0.617494523525238,
"learning_rate": 4.495382237932863e-05,
"loss": 0.3155,
"step": 498
},
{
"epoch": 0.9960079840319361,
"grad_norm": 0.672020435333252,
"learning_rate": 4.4968342427056505e-05,
"loss": 0.3425,
"step": 499
},
{
"epoch": 0.998003992015968,
"grad_norm": 0.6575382351875305,
"learning_rate": 4.498283340560031e-05,
"loss": 0.3599,
"step": 500
},
{
"epoch": 1.0,
"grad_norm": 0.6533491015434265,
"learning_rate": 4.499729543112076e-05,
"loss": 0.3201,
"step": 501
},
{
"epoch": 1.0,
"step": 501,
"total_flos": 5.842272600604017e+17,
"train_loss": 0.47904590670458097,
"train_runtime": 1388.8864,
"train_samples_per_second": 2.881,
"train_steps_per_second": 0.361
}
],
"logging_steps": 1.0,
"max_steps": 501,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.842272600604017e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}