{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9967051070840198,
"eval_steps": 500,
"global_step": 606,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032948929159802307,
"grad_norm": 12.442080297711913,
"learning_rate": 1.639344262295082e-07,
"loss": 0.383,
"step": 1
},
{
"epoch": 0.006589785831960461,
"grad_norm": 13.440096974039598,
"learning_rate": 3.278688524590164e-07,
"loss": 0.3663,
"step": 2
},
{
"epoch": 0.009884678747940691,
"grad_norm": 13.796855251219549,
"learning_rate": 4.918032786885246e-07,
"loss": 0.3663,
"step": 3
},
{
"epoch": 0.013179571663920923,
"grad_norm": 12.70572090942759,
"learning_rate": 6.557377049180328e-07,
"loss": 0.3061,
"step": 4
},
{
"epoch": 0.016474464579901153,
"grad_norm": 11.468076158597158,
"learning_rate": 8.196721311475409e-07,
"loss": 0.3387,
"step": 5
},
{
"epoch": 0.019769357495881382,
"grad_norm": 10.302737632617346,
"learning_rate": 9.836065573770493e-07,
"loss": 0.2814,
"step": 6
},
{
"epoch": 0.023064250411861616,
"grad_norm": 9.415064505565756,
"learning_rate": 1.1475409836065575e-06,
"loss": 0.2815,
"step": 7
},
{
"epoch": 0.026359143327841845,
"grad_norm": 8.420756675130125,
"learning_rate": 1.3114754098360657e-06,
"loss": 0.2483,
"step": 8
},
{
"epoch": 0.029654036243822075,
"grad_norm": 5.932373113068906,
"learning_rate": 1.4754098360655739e-06,
"loss": 0.1744,
"step": 9
},
{
"epoch": 0.032948929159802305,
"grad_norm": 4.602925255551132,
"learning_rate": 1.6393442622950819e-06,
"loss": 0.1821,
"step": 10
},
{
"epoch": 0.036243822075782535,
"grad_norm": 4.1437512962542025,
"learning_rate": 1.8032786885245903e-06,
"loss": 0.1557,
"step": 11
},
{
"epoch": 0.039538714991762765,
"grad_norm": 4.1008407646409495,
"learning_rate": 1.9672131147540985e-06,
"loss": 0.1901,
"step": 12
},
{
"epoch": 0.042833607907743,
"grad_norm": 3.2895218176449177,
"learning_rate": 2.1311475409836067e-06,
"loss": 0.1695,
"step": 13
},
{
"epoch": 0.04612850082372323,
"grad_norm": 2.578808113814672,
"learning_rate": 2.295081967213115e-06,
"loss": 0.1431,
"step": 14
},
{
"epoch": 0.04942339373970346,
"grad_norm": 2.0925168309074804,
"learning_rate": 2.459016393442623e-06,
"loss": 0.1459,
"step": 15
},
{
"epoch": 0.05271828665568369,
"grad_norm": 5.776302012227753,
"learning_rate": 2.6229508196721314e-06,
"loss": 0.2116,
"step": 16
},
{
"epoch": 0.05601317957166392,
"grad_norm": 4.863535909815622,
"learning_rate": 2.786885245901639e-06,
"loss": 0.157,
"step": 17
},
{
"epoch": 0.05930807248764415,
"grad_norm": 2.709665323729833,
"learning_rate": 2.9508196721311478e-06,
"loss": 0.1463,
"step": 18
},
{
"epoch": 0.06260296540362438,
"grad_norm": 2.911144968708877,
"learning_rate": 3.114754098360656e-06,
"loss": 0.1283,
"step": 19
},
{
"epoch": 0.06589785831960461,
"grad_norm": 2.4985115549367873,
"learning_rate": 3.2786885245901638e-06,
"loss": 0.13,
"step": 20
},
{
"epoch": 0.06919275123558484,
"grad_norm": 2.158826062926261,
"learning_rate": 3.4426229508196724e-06,
"loss": 0.1135,
"step": 21
},
{
"epoch": 0.07248764415156507,
"grad_norm": 1.8839820680217123,
"learning_rate": 3.6065573770491806e-06,
"loss": 0.1196,
"step": 22
},
{
"epoch": 0.0757825370675453,
"grad_norm": 2.208880925801888,
"learning_rate": 3.7704918032786884e-06,
"loss": 0.1642,
"step": 23
},
{
"epoch": 0.07907742998352553,
"grad_norm": 1.948969438238695,
"learning_rate": 3.934426229508197e-06,
"loss": 0.1057,
"step": 24
},
{
"epoch": 0.08237232289950576,
"grad_norm": 1.5855531759235064,
"learning_rate": 4.098360655737705e-06,
"loss": 0.0908,
"step": 25
},
{
"epoch": 0.085667215815486,
"grad_norm": 1.7484734055235995,
"learning_rate": 4.2622950819672135e-06,
"loss": 0.1088,
"step": 26
},
{
"epoch": 0.08896210873146623,
"grad_norm": 1.7763409124600038,
"learning_rate": 4.426229508196722e-06,
"loss": 0.1264,
"step": 27
},
{
"epoch": 0.09225700164744646,
"grad_norm": 1.943799751354267,
"learning_rate": 4.59016393442623e-06,
"loss": 0.1204,
"step": 28
},
{
"epoch": 0.09555189456342669,
"grad_norm": 2.0024112586646723,
"learning_rate": 4.754098360655738e-06,
"loss": 0.132,
"step": 29
},
{
"epoch": 0.09884678747940692,
"grad_norm": 1.927718117469908,
"learning_rate": 4.918032786885246e-06,
"loss": 0.0988,
"step": 30
},
{
"epoch": 0.10214168039538715,
"grad_norm": 1.8175857487104838,
"learning_rate": 5.0819672131147545e-06,
"loss": 0.0997,
"step": 31
},
{
"epoch": 0.10543657331136738,
"grad_norm": 1.1821292523186457,
"learning_rate": 5.245901639344263e-06,
"loss": 0.0874,
"step": 32
},
{
"epoch": 0.10873146622734761,
"grad_norm": 1.6925074350020268,
"learning_rate": 5.409836065573772e-06,
"loss": 0.115,
"step": 33
},
{
"epoch": 0.11202635914332784,
"grad_norm": 1.4637368762611331,
"learning_rate": 5.573770491803278e-06,
"loss": 0.0916,
"step": 34
},
{
"epoch": 0.11532125205930807,
"grad_norm": 1.4814671153620174,
"learning_rate": 5.737704918032787e-06,
"loss": 0.0853,
"step": 35
},
{
"epoch": 0.1186161449752883,
"grad_norm": 1.243594463126339,
"learning_rate": 5.9016393442622956e-06,
"loss": 0.0903,
"step": 36
},
{
"epoch": 0.12191103789126853,
"grad_norm": 1.2713537957193175,
"learning_rate": 6.065573770491804e-06,
"loss": 0.1271,
"step": 37
},
{
"epoch": 0.12520593080724876,
"grad_norm": 2.2092835366807893,
"learning_rate": 6.229508196721312e-06,
"loss": 0.1137,
"step": 38
},
{
"epoch": 0.128500823723229,
"grad_norm": 1.1846569097593065,
"learning_rate": 6.393442622950821e-06,
"loss": 0.0714,
"step": 39
},
{
"epoch": 0.13179571663920922,
"grad_norm": 3.001321928490275,
"learning_rate": 6.5573770491803276e-06,
"loss": 0.1129,
"step": 40
},
{
"epoch": 0.13509060955518945,
"grad_norm": 1.8758843455974739,
"learning_rate": 6.721311475409837e-06,
"loss": 0.1021,
"step": 41
},
{
"epoch": 0.13838550247116968,
"grad_norm": 2.993102960488022,
"learning_rate": 6.885245901639345e-06,
"loss": 0.1155,
"step": 42
},
{
"epoch": 0.1416803953871499,
"grad_norm": 1.8441651337946723,
"learning_rate": 7.049180327868853e-06,
"loss": 0.1004,
"step": 43
},
{
"epoch": 0.14497528830313014,
"grad_norm": 1.495517808358825,
"learning_rate": 7.213114754098361e-06,
"loss": 0.0868,
"step": 44
},
{
"epoch": 0.14827018121911037,
"grad_norm": 1.8953155374136303,
"learning_rate": 7.3770491803278695e-06,
"loss": 0.1371,
"step": 45
},
{
"epoch": 0.1515650741350906,
"grad_norm": 1.702133714045992,
"learning_rate": 7.540983606557377e-06,
"loss": 0.1597,
"step": 46
},
{
"epoch": 0.15485996705107083,
"grad_norm": 1.0196937438072402,
"learning_rate": 7.704918032786886e-06,
"loss": 0.0892,
"step": 47
},
{
"epoch": 0.15815485996705106,
"grad_norm": 1.2171514922994324,
"learning_rate": 7.868852459016394e-06,
"loss": 0.0778,
"step": 48
},
{
"epoch": 0.1614497528830313,
"grad_norm": 1.0680772396455713,
"learning_rate": 8.032786885245902e-06,
"loss": 0.084,
"step": 49
},
{
"epoch": 0.16474464579901152,
"grad_norm": 2.9252478676320557,
"learning_rate": 8.19672131147541e-06,
"loss": 0.1275,
"step": 50
},
{
"epoch": 0.16803953871499178,
"grad_norm": 2.422192715088673,
"learning_rate": 8.360655737704919e-06,
"loss": 0.1473,
"step": 51
},
{
"epoch": 0.171334431630972,
"grad_norm": 1.8804479896519768,
"learning_rate": 8.524590163934427e-06,
"loss": 0.1036,
"step": 52
},
{
"epoch": 0.17462932454695224,
"grad_norm": 1.568191135105415,
"learning_rate": 8.688524590163935e-06,
"loss": 0.1109,
"step": 53
},
{
"epoch": 0.17792421746293247,
"grad_norm": 1.7822383034585774,
"learning_rate": 8.852459016393443e-06,
"loss": 0.1138,
"step": 54
},
{
"epoch": 0.1812191103789127,
"grad_norm": 2.39128847516323,
"learning_rate": 9.016393442622952e-06,
"loss": 0.0995,
"step": 55
},
{
"epoch": 0.18451400329489293,
"grad_norm": 2.005472201286874,
"learning_rate": 9.18032786885246e-06,
"loss": 0.1071,
"step": 56
},
{
"epoch": 0.18780889621087316,
"grad_norm": 1.771963789348757,
"learning_rate": 9.344262295081968e-06,
"loss": 0.1194,
"step": 57
},
{
"epoch": 0.19110378912685339,
"grad_norm": 1.9883587282426103,
"learning_rate": 9.508196721311476e-06,
"loss": 0.1263,
"step": 58
},
{
"epoch": 0.19439868204283361,
"grad_norm": 1.869337708289752,
"learning_rate": 9.672131147540984e-06,
"loss": 0.0906,
"step": 59
},
{
"epoch": 0.19769357495881384,
"grad_norm": 1.95284840649176,
"learning_rate": 9.836065573770493e-06,
"loss": 0.0941,
"step": 60
},
{
"epoch": 0.20098846787479407,
"grad_norm": 0.9408728760921459,
"learning_rate": 1e-05,
"loss": 0.0892,
"step": 61
},
{
"epoch": 0.2042833607907743,
"grad_norm": 1.4827130661563888,
"learning_rate": 9.999916929744365e-06,
"loss": 0.0897,
"step": 62
},
{
"epoch": 0.20757825370675453,
"grad_norm": 1.7548705917176584,
"learning_rate": 9.999667721737726e-06,
"loss": 0.1052,
"step": 63
},
{
"epoch": 0.21087314662273476,
"grad_norm": 1.5160957230364678,
"learning_rate": 9.999252384260794e-06,
"loss": 0.0959,
"step": 64
},
{
"epoch": 0.214168039538715,
"grad_norm": 1.4961421846399192,
"learning_rate": 9.998670931114443e-06,
"loss": 0.0738,
"step": 65
},
{
"epoch": 0.21746293245469522,
"grad_norm": 1.6241097546909007,
"learning_rate": 9.997923381619257e-06,
"loss": 0.1057,
"step": 66
},
{
"epoch": 0.22075782537067545,
"grad_norm": 1.3354496262353763,
"learning_rate": 9.99700976061489e-06,
"loss": 0.0951,
"step": 67
},
{
"epoch": 0.22405271828665568,
"grad_norm": 1.0847564491580965,
"learning_rate": 9.99593009845923e-06,
"loss": 0.0863,
"step": 68
},
{
"epoch": 0.2273476112026359,
"grad_norm": 1.6871231023520186,
"learning_rate": 9.994684431027407e-06,
"loss": 0.0804,
"step": 69
},
{
"epoch": 0.23064250411861614,
"grad_norm": 1.428280535176863,
"learning_rate": 9.99327279971058e-06,
"loss": 0.0865,
"step": 70
},
{
"epoch": 0.23393739703459637,
"grad_norm": 1.1179352637786575,
"learning_rate": 9.991695251414584e-06,
"loss": 0.0734,
"step": 71
},
{
"epoch": 0.2372322899505766,
"grad_norm": 1.2033588488413391,
"learning_rate": 9.989951838558352e-06,
"loss": 0.105,
"step": 72
},
{
"epoch": 0.24052718286655683,
"grad_norm": 1.465169988779563,
"learning_rate": 9.988042619072185e-06,
"loss": 0.0729,
"step": 73
},
{
"epoch": 0.24382207578253706,
"grad_norm": 0.8979589269956051,
"learning_rate": 9.985967656395823e-06,
"loss": 0.0802,
"step": 74
},
{
"epoch": 0.2471169686985173,
"grad_norm": 1.733084591549866,
"learning_rate": 9.98372701947634e-06,
"loss": 0.1105,
"step": 75
},
{
"epoch": 0.2504118616144975,
"grad_norm": 1.3264470296840676,
"learning_rate": 9.981320782765847e-06,
"loss": 0.0994,
"step": 76
},
{
"epoch": 0.25370675453047775,
"grad_norm": 1.6544247928428517,
"learning_rate": 9.978749026219023e-06,
"loss": 0.0729,
"step": 77
},
{
"epoch": 0.257001647446458,
"grad_norm": 1.7379037158314299,
"learning_rate": 9.976011835290457e-06,
"loss": 0.1084,
"step": 78
},
{
"epoch": 0.2602965403624382,
"grad_norm": 2.1898271902712465,
"learning_rate": 9.973109300931813e-06,
"loss": 0.14,
"step": 79
},
{
"epoch": 0.26359143327841844,
"grad_norm": 1.5180418313244188,
"learning_rate": 9.970041519588797e-06,
"loss": 0.1032,
"step": 80
},
{
"epoch": 0.26688632619439867,
"grad_norm": 0.9058107350669203,
"learning_rate": 9.966808593197959e-06,
"loss": 0.0659,
"step": 81
},
{
"epoch": 0.2701812191103789,
"grad_norm": 1.6740173453146032,
"learning_rate": 9.963410629183311e-06,
"loss": 0.0903,
"step": 82
},
{
"epoch": 0.27347611202635913,
"grad_norm": 1.6927879596763102,
"learning_rate": 9.959847740452746e-06,
"loss": 0.1011,
"step": 83
},
{
"epoch": 0.27677100494233936,
"grad_norm": 1.1858716785831125,
"learning_rate": 9.956120045394297e-06,
"loss": 0.1001,
"step": 84
},
{
"epoch": 0.2800658978583196,
"grad_norm": 1.7472793459338325,
"learning_rate": 9.952227667872197e-06,
"loss": 0.0999,
"step": 85
},
{
"epoch": 0.2833607907742998,
"grad_norm": 1.3355726739748197,
"learning_rate": 9.948170737222763e-06,
"loss": 0.0869,
"step": 86
},
{
"epoch": 0.28665568369028005,
"grad_norm": 1.6554862377061288,
"learning_rate": 9.943949388250102e-06,
"loss": 0.0956,
"step": 87
},
{
"epoch": 0.2899505766062603,
"grad_norm": 1.4959889750287383,
"learning_rate": 9.939563761221628e-06,
"loss": 0.1011,
"step": 88
},
{
"epoch": 0.2932454695222405,
"grad_norm": 1.90960683038771,
"learning_rate": 9.935014001863405e-06,
"loss": 0.086,
"step": 89
},
{
"epoch": 0.29654036243822074,
"grad_norm": 1.33889912510083,
"learning_rate": 9.930300261355305e-06,
"loss": 0.0884,
"step": 90
},
{
"epoch": 0.29983525535420097,
"grad_norm": 1.8364942381661888,
"learning_rate": 9.925422696325976e-06,
"loss": 0.1198,
"step": 91
},
{
"epoch": 0.3031301482701812,
"grad_norm": 0.9853906443215683,
"learning_rate": 9.920381468847648e-06,
"loss": 0.0805,
"step": 92
},
{
"epoch": 0.30642504118616143,
"grad_norm": 1.802022784884664,
"learning_rate": 9.915176746430746e-06,
"loss": 0.1,
"step": 93
},
{
"epoch": 0.30971993410214166,
"grad_norm": 1.6019896189177425,
"learning_rate": 9.909808702018315e-06,
"loss": 0.1063,
"step": 94
},
{
"epoch": 0.3130148270181219,
"grad_norm": 1.7270522226909808,
"learning_rate": 9.904277513980285e-06,
"loss": 0.1009,
"step": 95
},
{
"epoch": 0.3163097199341021,
"grad_norm": 1.355594119054743,
"learning_rate": 9.898583366107539e-06,
"loss": 0.0875,
"step": 96
},
{
"epoch": 0.31960461285008235,
"grad_norm": 1.8076099955595957,
"learning_rate": 9.892726447605803e-06,
"loss": 0.1236,
"step": 97
},
{
"epoch": 0.3228995057660626,
"grad_norm": 1.6099906623254512,
"learning_rate": 9.886706953089364e-06,
"loss": 0.0873,
"step": 98
},
{
"epoch": 0.3261943986820428,
"grad_norm": 1.1035445355215792,
"learning_rate": 9.880525082574604e-06,
"loss": 0.0869,
"step": 99
},
{
"epoch": 0.32948929159802304,
"grad_norm": 1.4670801213842743,
"learning_rate": 9.874181041473344e-06,
"loss": 0.1076,
"step": 100
},
{
"epoch": 0.33278418451400327,
"grad_norm": 1.1434779708887954,
"learning_rate": 9.867675040586035e-06,
"loss": 0.0987,
"step": 101
},
{
"epoch": 0.33607907742998355,
"grad_norm": 1.2329688966692105,
"learning_rate": 9.861007296094736e-06,
"loss": 0.0685,
"step": 102
},
{
"epoch": 0.3393739703459638,
"grad_norm": 1.33973829264815,
"learning_rate": 9.854178029555945e-06,
"loss": 0.0926,
"step": 103
},
{
"epoch": 0.342668863261944,
"grad_norm": 2.4442395017358565,
"learning_rate": 9.847187467893228e-06,
"loss": 0.0942,
"step": 104
},
{
"epoch": 0.34596375617792424,
"grad_norm": 1.5545572705565789,
"learning_rate": 9.840035843389684e-06,
"loss": 0.0849,
"step": 105
},
{
"epoch": 0.34925864909390447,
"grad_norm": 0.8361181058922856,
"learning_rate": 9.832723393680222e-06,
"loss": 0.0678,
"step": 106
},
{
"epoch": 0.3525535420098847,
"grad_norm": 1.1875767399598742,
"learning_rate": 9.825250361743667e-06,
"loss": 0.0922,
"step": 107
},
{
"epoch": 0.35584843492586493,
"grad_norm": 1.018855526493989,
"learning_rate": 9.817616995894694e-06,
"loss": 0.0893,
"step": 108
},
{
"epoch": 0.35914332784184516,
"grad_norm": 0.8793948272396893,
"learning_rate": 9.809823549775559e-06,
"loss": 0.0816,
"step": 109
},
{
"epoch": 0.3624382207578254,
"grad_norm": 1.1843731418180148,
"learning_rate": 9.801870282347686e-06,
"loss": 0.0815,
"step": 110
},
{
"epoch": 0.3657331136738056,
"grad_norm": 1.3513059654506845,
"learning_rate": 9.793757457883062e-06,
"loss": 0.0838,
"step": 111
},
{
"epoch": 0.36902800658978585,
"grad_norm": 0.9877986179117915,
"learning_rate": 9.785485345955446e-06,
"loss": 0.0873,
"step": 112
},
{
"epoch": 0.3723228995057661,
"grad_norm": 0.8970818631280407,
"learning_rate": 9.777054221431418e-06,
"loss": 0.0611,
"step": 113
},
{
"epoch": 0.3756177924217463,
"grad_norm": 0.9226544421763598,
"learning_rate": 9.768464364461248e-06,
"loss": 0.078,
"step": 114
},
{
"epoch": 0.37891268533772654,
"grad_norm": 1.4766754423858683,
"learning_rate": 9.75971606046958e-06,
"loss": 0.0858,
"step": 115
},
{
"epoch": 0.38220757825370677,
"grad_norm": 1.320312770948565,
"learning_rate": 9.750809600145955e-06,
"loss": 0.0872,
"step": 116
},
{
"epoch": 0.385502471169687,
"grad_norm": 0.9205622576015271,
"learning_rate": 9.741745279435144e-06,
"loss": 0.0855,
"step": 117
},
{
"epoch": 0.38879736408566723,
"grad_norm": 2.1130956623266384,
"learning_rate": 9.732523399527328e-06,
"loss": 0.0869,
"step": 118
},
{
"epoch": 0.39209225700164746,
"grad_norm": 1.346773242530782,
"learning_rate": 9.723144266848073e-06,
"loss": 0.0891,
"step": 119
},
{
"epoch": 0.3953871499176277,
"grad_norm": 0.954418724615336,
"learning_rate": 9.713608193048156e-06,
"loss": 0.0927,
"step": 120
},
{
"epoch": 0.3986820428336079,
"grad_norm": 1.1613672266850283,
"learning_rate": 9.703915494993215e-06,
"loss": 0.0946,
"step": 121
},
{
"epoch": 0.40197693574958815,
"grad_norm": 0.9668677562255413,
"learning_rate": 9.694066494753211e-06,
"loss": 0.0828,
"step": 122
},
{
"epoch": 0.4052718286655684,
"grad_norm": 1.1556457477946604,
"learning_rate": 9.684061519591734e-06,
"loss": 0.0926,
"step": 123
},
{
"epoch": 0.4085667215815486,
"grad_norm": 1.0348616873807939,
"learning_rate": 9.673900901955118e-06,
"loss": 0.0942,
"step": 124
},
{
"epoch": 0.41186161449752884,
"grad_norm": 0.954839683053947,
"learning_rate": 9.663584979461407e-06,
"loss": 0.0841,
"step": 125
},
{
"epoch": 0.41515650741350907,
"grad_norm": 0.8951987570639582,
"learning_rate": 9.653114094889128e-06,
"loss": 0.082,
"step": 126
},
{
"epoch": 0.4184514003294893,
"grad_norm": 0.5582851260265275,
"learning_rate": 9.642488596165903e-06,
"loss": 0.0579,
"step": 127
},
{
"epoch": 0.42174629324546953,
"grad_norm": 0.7282031291743011,
"learning_rate": 9.631708836356893e-06,
"loss": 0.0686,
"step": 128
},
{
"epoch": 0.42504118616144976,
"grad_norm": 0.7341380700784487,
"learning_rate": 9.620775173653055e-06,
"loss": 0.0581,
"step": 129
},
{
"epoch": 0.42833607907743,
"grad_norm": 1.0426311808706392,
"learning_rate": 9.609687971359254e-06,
"loss": 0.0863,
"step": 130
},
{
"epoch": 0.4316309719934102,
"grad_norm": 0.7221932362532579,
"learning_rate": 9.598447597882181e-06,
"loss": 0.0904,
"step": 131
},
{
"epoch": 0.43492586490939045,
"grad_norm": 1.057905146517346,
"learning_rate": 9.587054426718117e-06,
"loss": 0.087,
"step": 132
},
{
"epoch": 0.4382207578253707,
"grad_norm": 1.0654844742045295,
"learning_rate": 9.575508836440516e-06,
"loss": 0.0833,
"step": 133
},
{
"epoch": 0.4415156507413509,
"grad_norm": 0.9708392842496616,
"learning_rate": 9.563811210687433e-06,
"loss": 0.07,
"step": 134
},
{
"epoch": 0.44481054365733114,
"grad_norm": 0.869842645454385,
"learning_rate": 9.551961938148772e-06,
"loss": 0.0798,
"step": 135
},
{
"epoch": 0.44810543657331137,
"grad_norm": 0.9291669588809491,
"learning_rate": 9.539961412553375e-06,
"loss": 0.0717,
"step": 136
},
{
"epoch": 0.4514003294892916,
"grad_norm": 0.971502019921878,
"learning_rate": 9.52781003265593e-06,
"loss": 0.0936,
"step": 137
},
{
"epoch": 0.4546952224052718,
"grad_norm": 0.7797238947376094,
"learning_rate": 9.515508202223735e-06,
"loss": 0.0711,
"step": 138
},
{
"epoch": 0.45799011532125206,
"grad_norm": 1.0743726394776476,
"learning_rate": 9.503056330023267e-06,
"loss": 0.0755,
"step": 139
},
{
"epoch": 0.4612850082372323,
"grad_norm": 1.1954773475892864,
"learning_rate": 9.490454829806609e-06,
"loss": 0.1304,
"step": 140
},
{
"epoch": 0.4645799011532125,
"grad_norm": 0.5763939890921029,
"learning_rate": 9.477704120297698e-06,
"loss": 0.0614,
"step": 141
},
{
"epoch": 0.46787479406919275,
"grad_norm": 0.9472151466304063,
"learning_rate": 9.464804625178414e-06,
"loss": 0.0712,
"step": 142
},
{
"epoch": 0.471169686985173,
"grad_norm": 0.886844022382936,
"learning_rate": 9.4517567730745e-06,
"loss": 0.0797,
"step": 143
},
{
"epoch": 0.4744645799011532,
"grad_norm": 1.0134166748412918,
"learning_rate": 9.438560997541319e-06,
"loss": 0.0899,
"step": 144
},
{
"epoch": 0.47775947281713343,
"grad_norm": 0.8070690603660651,
"learning_rate": 9.425217737049452e-06,
"loss": 0.0826,
"step": 145
},
{
"epoch": 0.48105436573311366,
"grad_norm": 0.8619994488550511,
"learning_rate": 9.411727434970121e-06,
"loss": 0.086,
"step": 146
},
{
"epoch": 0.4843492586490939,
"grad_norm": 0.8778816857460797,
"learning_rate": 9.398090539560465e-06,
"loss": 0.0854,
"step": 147
},
{
"epoch": 0.4876441515650741,
"grad_norm": 1.1118672558694775,
"learning_rate": 9.384307503948637e-06,
"loss": 0.1105,
"step": 148
},
{
"epoch": 0.49093904448105435,
"grad_norm": 1.032735807082786,
"learning_rate": 9.370378786118755e-06,
"loss": 0.0783,
"step": 149
},
{
"epoch": 0.4942339373970346,
"grad_norm": 1.0495708384737894,
"learning_rate": 9.356304848895676e-06,
"loss": 0.0815,
"step": 150
},
{
"epoch": 0.4975288303130148,
"grad_norm": 1.0945068947939716,
"learning_rate": 9.342086159929629e-06,
"loss": 0.0875,
"step": 151
},
{
"epoch": 0.500823723228995,
"grad_norm": 1.1286751183737302,
"learning_rate": 9.327723191680666e-06,
"loss": 0.0545,
"step": 152
},
{
"epoch": 0.5041186161449753,
"grad_norm": 0.9183247855730471,
"learning_rate": 9.31321642140296e-06,
"loss": 0.0757,
"step": 153
},
{
"epoch": 0.5074135090609555,
"grad_norm": 1.0502035656993673,
"learning_rate": 9.29856633112896e-06,
"loss": 0.0809,
"step": 154
},
{
"epoch": 0.5107084019769358,
"grad_norm": 0.8738749178267629,
"learning_rate": 9.283773407653363e-06,
"loss": 0.0562,
"step": 155
},
{
"epoch": 0.514003294892916,
"grad_norm": 1.0813438666782247,
"learning_rate": 9.268838142516943e-06,
"loss": 0.085,
"step": 156
},
{
"epoch": 0.5172981878088962,
"grad_norm": 1.0115574109758954,
"learning_rate": 9.253761031990218e-06,
"loss": 0.0749,
"step": 157
},
{
"epoch": 0.5205930807248764,
"grad_norm": 1.375042365212545,
"learning_rate": 9.238542577056957e-06,
"loss": 0.078,
"step": 158
},
{
"epoch": 0.5238879736408567,
"grad_norm": 1.59130187611513,
"learning_rate": 9.223183283397538e-06,
"loss": 0.1029,
"step": 159
},
{
"epoch": 0.5271828665568369,
"grad_norm": 1.1395752837101527,
"learning_rate": 9.20768366137214e-06,
"loss": 0.1128,
"step": 160
},
{
"epoch": 0.5304777594728172,
"grad_norm": 0.8079597092244131,
"learning_rate": 9.19204422600379e-06,
"loss": 0.0527,
"step": 161
},
{
"epoch": 0.5337726523887973,
"grad_norm": 1.0336684369348528,
"learning_rate": 9.176265496961242e-06,
"loss": 0.0828,
"step": 162
},
{
"epoch": 0.5370675453047776,
"grad_norm": 0.8656528491730779,
"learning_rate": 9.160347998541722e-06,
"loss": 0.0704,
"step": 163
},
{
"epoch": 0.5403624382207578,
"grad_norm": 2.086614798853126,
"learning_rate": 9.144292259653493e-06,
"loss": 0.104,
"step": 164
},
{
"epoch": 0.5436573311367381,
"grad_norm": 1.5779000987891205,
"learning_rate": 9.128098813798291e-06,
"loss": 0.0996,
"step": 165
},
{
"epoch": 0.5469522240527183,
"grad_norm": 0.7283818376854391,
"learning_rate": 9.111768199053588e-06,
"loss": 0.0621,
"step": 166
},
{
"epoch": 0.5502471169686985,
"grad_norm": 1.2155979859224575,
"learning_rate": 9.095300958054722e-06,
"loss": 0.0653,
"step": 167
},
{
"epoch": 0.5535420098846787,
"grad_norm": 1.3669964434695867,
"learning_rate": 9.078697637976861e-06,
"loss": 0.1071,
"step": 168
},
{
"epoch": 0.556836902800659,
"grad_norm": 0.659337598200629,
"learning_rate": 9.061958790516821e-06,
"loss": 0.101,
"step": 169
},
{
"epoch": 0.5601317957166392,
"grad_norm": 3.064428601730586,
"learning_rate": 9.045084971874738e-06,
"loss": 0.0631,
"step": 170
},
{
"epoch": 0.5634266886326195,
"grad_norm": 1.8617437169334994,
"learning_rate": 9.028076742735583e-06,
"loss": 0.1062,
"step": 171
},
{
"epoch": 0.5667215815485996,
"grad_norm": 1.005975190266642,
"learning_rate": 9.010934668250533e-06,
"loss": 0.0706,
"step": 172
},
{
"epoch": 0.5700164744645799,
"grad_norm": 1.2704133524125742,
"learning_rate": 8.993659318018191e-06,
"loss": 0.1047,
"step": 173
},
{
"epoch": 0.5733113673805601,
"grad_norm": 1.5091840035688024,
"learning_rate": 8.976251266065663e-06,
"loss": 0.0915,
"step": 174
},
{
"epoch": 0.5766062602965404,
"grad_norm": 0.9983867645520093,
"learning_rate": 8.958711090829477e-06,
"loss": 0.0868,
"step": 175
},
{
"epoch": 0.5799011532125206,
"grad_norm": 0.9695191782522918,
"learning_rate": 8.94103937513637e-06,
"loss": 0.0782,
"step": 176
},
{
"epoch": 0.5831960461285008,
"grad_norm": 1.485821220772562,
"learning_rate": 8.923236706183923e-06,
"loss": 0.088,
"step": 177
},
{
"epoch": 0.586490939044481,
"grad_norm": 0.8778171297271887,
"learning_rate": 8.905303675521031e-06,
"loss": 0.0675,
"step": 178
},
{
"epoch": 0.5897858319604613,
"grad_norm": 1.4148551976101613,
"learning_rate": 8.887240879028276e-06,
"loss": 0.0968,
"step": 179
},
{
"epoch": 0.5930807248764415,
"grad_norm": 1.2313668213255908,
"learning_rate": 8.869048916898109e-06,
"loss": 0.0885,
"step": 180
},
{
"epoch": 0.5963756177924218,
"grad_norm": 0.7935552297095103,
"learning_rate": 8.850728393614903e-06,
"loss": 0.0919,
"step": 181
},
{
"epoch": 0.5996705107084019,
"grad_norm": 0.6219191194067725,
"learning_rate": 8.832279917934881e-06,
"loss": 0.0495,
"step": 182
},
{
"epoch": 0.6029654036243822,
"grad_norm": 0.891323405042387,
"learning_rate": 8.813704102865881e-06,
"loss": 0.1036,
"step": 183
},
{
"epoch": 0.6062602965403624,
"grad_norm": 0.7672391951678563,
"learning_rate": 8.795001565646983e-06,
"loss": 0.0728,
"step": 184
},
{
"epoch": 0.6095551894563427,
"grad_norm": 0.9915439152928615,
"learning_rate": 8.776172927728008e-06,
"loss": 0.0744,
"step": 185
},
{
"epoch": 0.6128500823723229,
"grad_norm": 0.7415850366120278,
"learning_rate": 8.75721881474886e-06,
"loss": 0.0999,
"step": 186
},
{
"epoch": 0.6161449752883031,
"grad_norm": 0.7803751476377428,
"learning_rate": 8.738139856518746e-06,
"loss": 0.084,
"step": 187
},
{
"epoch": 0.6194398682042833,
"grad_norm": 0.6444369328111929,
"learning_rate": 8.718936686995239e-06,
"loss": 0.0632,
"step": 188
},
{
"epoch": 0.6227347611202636,
"grad_norm": 1.063336491675898,
"learning_rate": 8.699609944263219e-06,
"loss": 0.0854,
"step": 189
},
{
"epoch": 0.6260296540362438,
"grad_norm": 0.5254825539939502,
"learning_rate": 8.680160270513671e-06,
"loss": 0.0658,
"step": 190
},
{
"epoch": 0.6293245469522241,
"grad_norm": 2.1957036152343097,
"learning_rate": 8.660588312022345e-06,
"loss": 0.0767,
"step": 191
},
{
"epoch": 0.6326194398682042,
"grad_norm": 1.1425043272398887,
"learning_rate": 8.640894719128274e-06,
"loss": 0.1092,
"step": 192
},
{
"epoch": 0.6359143327841845,
"grad_norm": 0.6811077233527263,
"learning_rate": 8.621080146212181e-06,
"loss": 0.0552,
"step": 193
},
{
"epoch": 0.6392092257001647,
"grad_norm": 1.0217179933628129,
"learning_rate": 8.601145251674718e-06,
"loss": 0.0749,
"step": 194
},
{
"epoch": 0.642504118616145,
"grad_norm": 0.9919309113126284,
"learning_rate": 8.581090697914602e-06,
"loss": 0.0929,
"step": 195
},
{
"epoch": 0.6457990115321252,
"grad_norm": 1.178486792691301,
"learning_rate": 8.560917151306594e-06,
"loss": 0.1023,
"step": 196
},
{
"epoch": 0.6490939044481054,
"grad_norm": 1.155779967707836,
"learning_rate": 8.540625282179364e-06,
"loss": 0.0821,
"step": 197
},
{
"epoch": 0.6523887973640856,
"grad_norm": 1.0278193202776953,
"learning_rate": 8.520215764793214e-06,
"loss": 0.0739,
"step": 198
},
{
"epoch": 0.6556836902800659,
"grad_norm": 5.872968813903652,
"learning_rate": 8.499689277317675e-06,
"loss": 0.0763,
"step": 199
},
{
"epoch": 0.6589785831960461,
"grad_norm": 1.4949053388183082,
"learning_rate": 8.479046501808971e-06,
"loss": 0.0696,
"step": 200
},
{
"epoch": 0.6622734761120264,
"grad_norm": 0.9342591652212858,
"learning_rate": 8.45828812418736e-06,
"loss": 0.0629,
"step": 201
},
{
"epoch": 0.6655683690280065,
"grad_norm": 0.5142904539906094,
"learning_rate": 8.437414834214333e-06,
"loss": 0.0653,
"step": 202
},
{
"epoch": 0.6688632619439868,
"grad_norm": 1.3243894037794444,
"learning_rate": 8.416427325469705e-06,
"loss": 0.1095,
"step": 203
},
{
"epoch": 0.6721581548599671,
"grad_norm": 1.3555237832265656,
"learning_rate": 8.395326295328562e-06,
"loss": 0.1028,
"step": 204
},
{
"epoch": 0.6754530477759473,
"grad_norm": 0.818855723195901,
"learning_rate": 8.374112444938094e-06,
"loss": 0.088,
"step": 205
},
{
"epoch": 0.6787479406919276,
"grad_norm": 0.8852074571699485,
"learning_rate": 8.352786479194288e-06,
"loss": 0.0526,
"step": 206
},
{
"epoch": 0.6820428336079077,
"grad_norm": 1.372137747701387,
"learning_rate": 8.331349106718515e-06,
"loss": 0.0957,
"step": 207
},
{
"epoch": 0.685337726523888,
"grad_norm": 0.8727917319312107,
"learning_rate": 8.309801039833978e-06,
"loss": 0.0895,
"step": 208
},
{
"epoch": 0.6886326194398682,
"grad_norm": 1.0509761315988109,
"learning_rate": 8.28814299454205e-06,
"loss": 0.0996,
"step": 209
},
{
"epoch": 0.6919275123558485,
"grad_norm": 0.8839477803791086,
"learning_rate": 8.266375690498475e-06,
"loss": 0.0865,
"step": 210
},
{
"epoch": 0.6952224052718287,
"grad_norm": 0.7064553470448757,
"learning_rate": 8.244499850989453e-06,
"loss": 0.0728,
"step": 211
},
{
"epoch": 0.6985172981878089,
"grad_norm": 0.9694201547278781,
"learning_rate": 8.22251620290762e-06,
"loss": 0.0581,
"step": 212
},
{
"epoch": 0.7018121911037891,
"grad_norm": 0.724783690835583,
"learning_rate": 8.20042547672788e-06,
"loss": 0.0828,
"step": 213
},
{
"epoch": 0.7051070840197694,
"grad_norm": 0.7137797204098316,
"learning_rate": 8.178228406483145e-06,
"loss": 0.0707,
"step": 214
},
{
"epoch": 0.7084019769357496,
"grad_norm": 0.4606523745916078,
"learning_rate": 8.15592572973993e-06,
"loss": 0.044,
"step": 215
},
{
"epoch": 0.7116968698517299,
"grad_norm": 0.6640092591585782,
"learning_rate": 8.133518187573864e-06,
"loss": 0.0561,
"step": 216
},
{
"epoch": 0.71499176276771,
"grad_norm": 0.8170937626114954,
"learning_rate": 8.111006524545043e-06,
"loss": 0.0823,
"step": 217
},
{
"epoch": 0.7182866556836903,
"grad_norm": 0.6372182616249538,
"learning_rate": 8.088391488673313e-06,
"loss": 0.066,
"step": 218
},
{
"epoch": 0.7215815485996705,
"grad_norm": 0.6375067936631406,
"learning_rate": 8.065673831413396e-06,
"loss": 0.0506,
"step": 219
},
{
"epoch": 0.7248764415156508,
"grad_norm": 0.6423309999365212,
"learning_rate": 8.042854307629932e-06,
"loss": 0.0629,
"step": 220
},
{
"epoch": 0.728171334431631,
"grad_norm": 0.752582677343103,
"learning_rate": 8.019933675572389e-06,
"loss": 0.0722,
"step": 221
},
{
"epoch": 0.7314662273476112,
"grad_norm": 0.8112402585396246,
"learning_rate": 7.996912696849873e-06,
"loss": 0.0842,
"step": 222
},
{
"epoch": 0.7347611202635914,
"grad_norm": 0.6919487336036387,
"learning_rate": 7.97379213640582e-06,
"loss": 0.0684,
"step": 223
},
{
"epoch": 0.7380560131795717,
"grad_norm": 0.7434807539917022,
"learning_rate": 7.950572762492577e-06,
"loss": 0.0682,
"step": 224
},
{
"epoch": 0.7413509060955519,
"grad_norm": 0.6848656249105235,
"learning_rate": 7.927255346645872e-06,
"loss": 0.0546,
"step": 225
},
{
"epoch": 0.7446457990115322,
"grad_norm": 0.7826479624770332,
"learning_rate": 7.903840663659186e-06,
"loss": 0.0684,
"step": 226
},
{
"epoch": 0.7479406919275123,
"grad_norm": 0.6927518800734283,
"learning_rate": 7.880329491557996e-06,
"loss": 0.079,
"step": 227
},
{
"epoch": 0.7512355848434926,
"grad_norm": 0.8763045243203113,
"learning_rate": 7.856722611573938e-06,
"loss": 0.1068,
"step": 228
},
{
"epoch": 0.7545304777594728,
"grad_norm": 0.8300681217056403,
"learning_rate": 7.83302080811883e-06,
"loss": 0.0667,
"step": 229
},
{
"epoch": 0.7578253706754531,
"grad_norm": 0.5437395859594083,
"learning_rate": 7.809224868758621e-06,
"loss": 0.0671,
"step": 230
},
{
"epoch": 0.7611202635914333,
"grad_norm": 0.7134167417475868,
"learning_rate": 7.78533558418722e-06,
"loss": 0.079,
"step": 231
},
{
"epoch": 0.7644151565074135,
"grad_norm": 0.8367162252369527,
"learning_rate": 7.761353748200213e-06,
"loss": 0.075,
"step": 232
},
{
"epoch": 0.7677100494233937,
"grad_norm": 0.6993381735068975,
"learning_rate": 7.737280157668503e-06,
"loss": 0.0665,
"step": 233
},
{
"epoch": 0.771004942339374,
"grad_norm": 0.644489745443266,
"learning_rate": 7.713115612511815e-06,
"loss": 0.0704,
"step": 234
},
{
"epoch": 0.7742998352553542,
"grad_norm": 0.6337392482963783,
"learning_rate": 7.688860915672129e-06,
"loss": 0.0487,
"step": 235
},
{
"epoch": 0.7775947281713345,
"grad_norm": 0.4306575759208823,
"learning_rate": 7.664516873086987e-06,
"loss": 0.0498,
"step": 236
},
{
"epoch": 0.7808896210873146,
"grad_norm": 0.6371209076121114,
"learning_rate": 7.640084293662731e-06,
"loss": 0.0581,
"step": 237
},
{
"epoch": 0.7841845140032949,
"grad_norm": 0.809205628205596,
"learning_rate": 7.615563989247604e-06,
"loss": 0.0886,
"step": 238
},
{
"epoch": 0.7874794069192751,
"grad_norm": 0.6807826450879982,
"learning_rate": 7.590956774604791e-06,
"loss": 0.0824,
"step": 239
},
{
"epoch": 0.7907742998352554,
"grad_norm": 0.9092838195300236,
"learning_rate": 7.566263467385335e-06,
"loss": 0.0703,
"step": 240
},
{
"epoch": 0.7940691927512356,
"grad_norm": 0.736565350841279,
"learning_rate": 7.541484888100974e-06,
"loss": 0.0695,
"step": 241
},
{
"epoch": 0.7973640856672158,
"grad_norm": 0.7220288466907268,
"learning_rate": 7.516621860096873e-06,
"loss": 0.0707,
"step": 242
},
{
"epoch": 0.800658978583196,
"grad_norm": 0.6829838831547227,
"learning_rate": 7.491675209524272e-06,
"loss": 0.0666,
"step": 243
},
{
"epoch": 0.8039538714991763,
"grad_norm": 0.8226949141177504,
"learning_rate": 7.466645765313023e-06,
"loss": 0.0752,
"step": 244
},
{
"epoch": 0.8072487644151565,
"grad_norm": 0.5909405519820083,
"learning_rate": 7.4415343591440604e-06,
"loss": 0.0582,
"step": 245
},
{
"epoch": 0.8105436573311368,
"grad_norm": 0.7318765147109815,
"learning_rate": 7.416341825421755e-06,
"loss": 0.078,
"step": 246
},
{
"epoch": 0.8138385502471169,
"grad_norm": 0.7063912838195767,
"learning_rate": 7.391069001246193e-06,
"loss": 0.0868,
"step": 247
},
{
"epoch": 0.8171334431630972,
"grad_norm": 0.6799477779267012,
"learning_rate": 7.365716726385361e-06,
"loss": 0.0681,
"step": 248
},
{
"epoch": 0.8204283360790774,
"grad_norm": 0.8516971338664023,
"learning_rate": 7.3402858432472416e-06,
"loss": 0.0761,
"step": 249
},
{
"epoch": 0.8237232289950577,
"grad_norm": 0.8051104503646311,
"learning_rate": 7.3147771968518175e-06,
"loss": 0.077,
"step": 250
},
{
"epoch": 0.8270181219110379,
"grad_norm": 0.8417638265928152,
"learning_rate": 7.289191634803002e-06,
"loss": 0.0721,
"step": 251
},
{
"epoch": 0.8303130148270181,
"grad_norm": 0.9280576906426667,
"learning_rate": 7.263530007260466e-06,
"loss": 0.0839,
"step": 252
},
{
"epoch": 0.8336079077429983,
"grad_norm": 0.8205604877193189,
"learning_rate": 7.2377931669113934e-06,
"loss": 0.084,
"step": 253
},
{
"epoch": 0.8369028006589786,
"grad_norm": 0.7347246169190605,
"learning_rate": 7.211981968942147e-06,
"loss": 0.0508,
"step": 254
},
{
"epoch": 0.8401976935749588,
"grad_norm": 0.7727540137134915,
"learning_rate": 7.186097271009852e-06,
"loss": 0.0504,
"step": 255
},
{
"epoch": 0.8434925864909391,
"grad_norm": 0.6116838823458901,
"learning_rate": 7.160139933213899e-06,
"loss": 0.0533,
"step": 256
},
{
"epoch": 0.8467874794069192,
"grad_norm": 0.8518782068127816,
"learning_rate": 7.134110818067361e-06,
"loss": 0.0775,
"step": 257
},
{
"epoch": 0.8500823723228995,
"grad_norm": 0.9449160515812749,
"learning_rate": 7.1080107904683405e-06,
"loss": 0.0721,
"step": 258
},
{
"epoch": 0.8533772652388797,
"grad_norm": 0.6964873142430633,
"learning_rate": 7.08184071767122e-06,
"loss": 0.0673,
"step": 259
},
{
"epoch": 0.85667215815486,
"grad_norm": 0.768104304709271,
"learning_rate": 7.0556014692578554e-06,
"loss": 0.0749,
"step": 260
},
{
"epoch": 0.8599670510708401,
"grad_norm": 0.7599189113700034,
"learning_rate": 7.029293917108678e-06,
"loss": 0.0684,
"step": 261
},
{
"epoch": 0.8632619439868204,
"grad_norm": 0.777387517223909,
"learning_rate": 7.0029189353737195e-06,
"loss": 0.0656,
"step": 262
},
{
"epoch": 0.8665568369028006,
"grad_norm": 0.7045793540209936,
"learning_rate": 6.9764774004435685e-06,
"loss": 0.0619,
"step": 263
},
{
"epoch": 0.8698517298187809,
"grad_norm": 0.6234760268316166,
"learning_rate": 6.949970190920255e-06,
"loss": 0.0708,
"step": 264
},
{
"epoch": 0.8731466227347611,
"grad_norm": 0.7124980322892176,
"learning_rate": 6.9233981875880416e-06,
"loss": 0.0521,
"step": 265
},
{
"epoch": 0.8764415156507414,
"grad_norm": 0.8490902000387839,
"learning_rate": 6.896762273384179e-06,
"loss": 0.0632,
"step": 266
},
{
"epoch": 0.8797364085667215,
"grad_norm": 0.6944201065528963,
"learning_rate": 6.870063333369543e-06,
"loss": 0.0716,
"step": 267
},
{
"epoch": 0.8830313014827018,
"grad_norm": 0.758349126043532,
"learning_rate": 6.8433022546992444e-06,
"loss": 0.0596,
"step": 268
},
{
"epoch": 0.886326194398682,
"grad_norm": 1.2664444257431744,
"learning_rate": 6.81647992659314e-06,
"loss": 0.0628,
"step": 269
},
{
"epoch": 0.8896210873146623,
"grad_norm": 0.8379324844684077,
"learning_rate": 6.789597240306295e-06,
"loss": 0.0674,
"step": 270
},
{
"epoch": 0.8929159802306426,
"grad_norm": 0.8462600835900949,
"learning_rate": 6.762655089099353e-06,
"loss": 0.0659,
"step": 271
},
{
"epoch": 0.8962108731466227,
"grad_norm": 0.9094387161179498,
"learning_rate": 6.735654368208875e-06,
"loss": 0.0623,
"step": 272
},
{
"epoch": 0.899505766062603,
"grad_norm": 0.7877875224066865,
"learning_rate": 6.7085959748175685e-06,
"loss": 0.0696,
"step": 273
},
{
"epoch": 0.9028006589785832,
"grad_norm": 0.6514864513423558,
"learning_rate": 6.681480808024503e-06,
"loss": 0.0766,
"step": 274
},
{
"epoch": 0.9060955518945635,
"grad_norm": 1.148236164352365,
"learning_rate": 6.654309768815208e-06,
"loss": 0.0903,
"step": 275
},
{
"epoch": 0.9093904448105437,
"grad_norm": 0.7078109102899715,
"learning_rate": 6.627083760031755e-06,
"loss": 0.0607,
"step": 276
},
{
"epoch": 0.9126853377265239,
"grad_norm": 0.613094345393223,
"learning_rate": 6.599803686342748e-06,
"loss": 0.0655,
"step": 277
},
{
"epoch": 0.9159802306425041,
"grad_norm": 0.6642339763695972,
"learning_rate": 6.572470454213266e-06,
"loss": 0.0731,
"step": 278
},
{
"epoch": 0.9192751235584844,
"grad_norm": 0.6971630112819691,
"learning_rate": 6.545084971874738e-06,
"loss": 0.0473,
"step": 279
},
{
"epoch": 0.9225700164744646,
"grad_norm": 0.7592858638911076,
"learning_rate": 6.517648149294774e-06,
"loss": 0.0581,
"step": 280
},
{
"epoch": 0.9258649093904449,
"grad_norm": 0.7189143571066544,
"learning_rate": 6.490160898146919e-06,
"loss": 0.0733,
"step": 281
},
{
"epoch": 0.929159802306425,
"grad_norm": 0.8305599945381572,
"learning_rate": 6.4626241317803665e-06,
"loss": 0.0807,
"step": 282
},
{
"epoch": 0.9324546952224053,
"grad_norm": 0.8787944618632045,
"learning_rate": 6.4350387651896025e-06,
"loss": 0.0648,
"step": 283
},
{
"epoch": 0.9357495881383855,
"grad_norm": 0.649270561331511,
"learning_rate": 6.407405714984011e-06,
"loss": 0.0921,
"step": 284
},
{
"epoch": 0.9390444810543658,
"grad_norm": 0.9873611661857511,
"learning_rate": 6.379725899357408e-06,
"loss": 0.0847,
"step": 285
},
{
"epoch": 0.942339373970346,
"grad_norm": 0.8338719043181901,
"learning_rate": 6.3520002380575395e-06,
"loss": 0.0673,
"step": 286
},
{
"epoch": 0.9456342668863262,
"grad_norm": 0.8390156519820746,
"learning_rate": 6.324229652355513e-06,
"loss": 0.0626,
"step": 287
},
{
"epoch": 0.9489291598023064,
"grad_norm": 0.7197773939188823,
"learning_rate": 6.29641506501519e-06,
"loss": 0.0864,
"step": 288
},
{
"epoch": 0.9522240527182867,
"grad_norm": 0.942984980454084,
"learning_rate": 6.2685574002625235e-06,
"loss": 0.0686,
"step": 289
},
{
"epoch": 0.9555189456342669,
"grad_norm": 0.9649936636393807,
"learning_rate": 6.2406575837548455e-06,
"loss": 0.0599,
"step": 290
},
{
"epoch": 0.9588138385502472,
"grad_norm": 0.6889881534410974,
"learning_rate": 6.212716542550112e-06,
"loss": 0.101,
"step": 291
},
{
"epoch": 0.9621087314662273,
"grad_norm": 0.9632795509211302,
"learning_rate": 6.184735205076097e-06,
"loss": 0.0773,
"step": 292
},
{
"epoch": 0.9654036243822076,
"grad_norm": 1.0400767819370376,
"learning_rate": 6.156714501099544e-06,
"loss": 0.0638,
"step": 293
},
{
"epoch": 0.9686985172981878,
"grad_norm": 1.0147243725605253,
"learning_rate": 6.1286553616952705e-06,
"loss": 0.0593,
"step": 294
},
{
"epoch": 0.9719934102141681,
"grad_norm": 0.6613193470791487,
"learning_rate": 6.100558719215228e-06,
"loss": 0.0632,
"step": 295
},
{
"epoch": 0.9752883031301482,
"grad_norm": 1.0408938474730054,
"learning_rate": 6.072425507257528e-06,
"loss": 0.0876,
"step": 296
},
{
"epoch": 0.9785831960461285,
"grad_norm": 0.712701647042842,
"learning_rate": 6.044256660635412e-06,
"loss": 0.0733,
"step": 297
},
{
"epoch": 0.9818780889621087,
"grad_norm": 0.6397491114376809,
"learning_rate": 6.016053115346197e-06,
"loss": 0.0561,
"step": 298
},
{
"epoch": 0.985172981878089,
"grad_norm": 0.7191102659386986,
"learning_rate": 5.987815808540169e-06,
"loss": 0.0791,
"step": 299
},
{
"epoch": 0.9884678747940692,
"grad_norm": 0.4709712102337363,
"learning_rate": 5.959545678489447e-06,
"loss": 0.0475,
"step": 300
},
{
"epoch": 0.9917627677100495,
"grad_norm": 0.8715274588578796,
"learning_rate": 5.931243664556803e-06,
"loss": 0.0771,
"step": 301
},
{
"epoch": 0.9950576606260296,
"grad_norm": 0.7017524340447387,
"learning_rate": 5.902910707164449e-06,
"loss": 0.0712,
"step": 302
},
{
"epoch": 0.9983525535420099,
"grad_norm": 0.7619744594259967,
"learning_rate": 5.874547747762792e-06,
"loss": 0.0585,
"step": 303
},
{
"epoch": 0.9983525535420099,
"eval_loss": 0.07007648050785065,
"eval_runtime": 143.0638,
"eval_samples_per_second": 35.683,
"eval_steps_per_second": 1.118,
"step": 303
},
{
"epoch": 1.00164744645799,
"grad_norm": 0.5917774055195716,
"learning_rate": 5.8461557287991455e-06,
"loss": 0.0686,
"step": 304
},
{
"epoch": 1.0049423393739703,
"grad_norm": 0.4973762275932349,
"learning_rate": 5.81773559368642e-06,
"loss": 0.0524,
"step": 305
},
{
"epoch": 1.0082372322899507,
"grad_norm": 0.5021975231329254,
"learning_rate": 5.7892882867717705e-06,
"loss": 0.0577,
"step": 306
},
{
"epoch": 1.0115321252059308,
"grad_norm": 0.704352626743678,
"learning_rate": 5.7608147533052194e-06,
"loss": 0.0509,
"step": 307
},
{
"epoch": 1.014827018121911,
"grad_norm": 1.189723828759097,
"learning_rate": 5.732315939408251e-06,
"loss": 0.0815,
"step": 308
},
{
"epoch": 1.0181219110378912,
"grad_norm": 0.6036027009145574,
"learning_rate": 5.703792792042363e-06,
"loss": 0.0556,
"step": 309
},
{
"epoch": 1.0214168039538716,
"grad_norm": 0.5342904909103813,
"learning_rate": 5.675246258977617e-06,
"loss": 0.0487,
"step": 310
},
{
"epoch": 1.0247116968698518,
"grad_norm": 0.46763620767148034,
"learning_rate": 5.646677288761132e-06,
"loss": 0.0491,
"step": 311
},
{
"epoch": 1.028006589785832,
"grad_norm": 0.5696375911949768,
"learning_rate": 5.618086830685569e-06,
"loss": 0.047,
"step": 312
},
{
"epoch": 1.031301482701812,
"grad_norm": 0.38600791899244996,
"learning_rate": 5.589475834757595e-06,
"loss": 0.032,
"step": 313
},
{
"epoch": 1.0345963756177925,
"grad_norm": 1.0072710877393638,
"learning_rate": 5.560845251666307e-06,
"loss": 0.063,
"step": 314
},
{
"epoch": 1.0378912685337727,
"grad_norm": 0.663725882779124,
"learning_rate": 5.532196032751647e-06,
"loss": 0.0563,
"step": 315
},
{
"epoch": 1.0411861614497528,
"grad_norm": 0.6135177621912624,
"learning_rate": 5.503529129972792e-06,
"loss": 0.0514,
"step": 316
},
{
"epoch": 1.044481054365733,
"grad_norm": 0.7549455934476204,
"learning_rate": 5.474845495876518e-06,
"loss": 0.0563,
"step": 317
},
{
"epoch": 1.0477759472817134,
"grad_norm": 0.8244910748727189,
"learning_rate": 5.4461460835655535e-06,
"loss": 0.0804,
"step": 318
},
{
"epoch": 1.0510708401976936,
"grad_norm": 0.604488079236042,
"learning_rate": 5.417431846666903e-06,
"loss": 0.0679,
"step": 319
},
{
"epoch": 1.0543657331136738,
"grad_norm": 0.5136265587955748,
"learning_rate": 5.388703739300167e-06,
"loss": 0.0388,
"step": 320
},
{
"epoch": 1.057660626029654,
"grad_norm": 0.6007478171198604,
"learning_rate": 5.359962716045836e-06,
"loss": 0.0632,
"step": 321
},
{
"epoch": 1.0609555189456343,
"grad_norm": 0.4928892879154173,
"learning_rate": 5.331209731913568e-06,
"loss": 0.058,
"step": 322
},
{
"epoch": 1.0642504118616145,
"grad_norm": 0.5300520318408385,
"learning_rate": 5.30244574231046e-06,
"loss": 0.0528,
"step": 323
},
{
"epoch": 1.0675453047775947,
"grad_norm": 0.49159187140329286,
"learning_rate": 5.273671703009301e-06,
"loss": 0.046,
"step": 324
},
{
"epoch": 1.0708401976935749,
"grad_norm": 0.804620211006138,
"learning_rate": 5.2448885701168094e-06,
"loss": 0.0601,
"step": 325
},
{
"epoch": 1.0741350906095553,
"grad_norm": 0.4648011852930538,
"learning_rate": 5.21609730004187e-06,
"loss": 0.0438,
"step": 326
},
{
"epoch": 1.0774299835255354,
"grad_norm": 0.5362596735899865,
"learning_rate": 5.187298849463748e-06,
"loss": 0.0507,
"step": 327
},
{
"epoch": 1.0807248764415156,
"grad_norm": 0.5443586783585722,
"learning_rate": 5.158494175300304e-06,
"loss": 0.053,
"step": 328
},
{
"epoch": 1.084019769357496,
"grad_norm": 0.6076056192307563,
"learning_rate": 5.129684234676195e-06,
"loss": 0.0594,
"step": 329
},
{
"epoch": 1.0873146622734762,
"grad_norm": 0.9033252357763137,
"learning_rate": 5.100869984891077e-06,
"loss": 0.06,
"step": 330
},
{
"epoch": 1.0906095551894563,
"grad_norm": 0.454480847306655,
"learning_rate": 5.072052383387787e-06,
"loss": 0.0424,
"step": 331
},
{
"epoch": 1.0939044481054365,
"grad_norm": 0.46517988927206794,
"learning_rate": 5.043232387720532e-06,
"loss": 0.0443,
"step": 332
},
{
"epoch": 1.0971993410214167,
"grad_norm": 0.4148720401510593,
"learning_rate": 5.014410955523079e-06,
"loss": 0.0387,
"step": 333
},
{
"epoch": 1.100494233937397,
"grad_norm": 0.5146539821704307,
"learning_rate": 4.9855890444769226e-06,
"loss": 0.0563,
"step": 334
},
{
"epoch": 1.1037891268533773,
"grad_norm": 0.5267211782218569,
"learning_rate": 4.956767612279468e-06,
"loss": 0.044,
"step": 335
},
{
"epoch": 1.1070840197693574,
"grad_norm": 0.5731696810590752,
"learning_rate": 4.927947616612216e-06,
"loss": 0.0469,
"step": 336
},
{
"epoch": 1.1103789126853378,
"grad_norm": 0.4606767989043497,
"learning_rate": 4.899130015108923e-06,
"loss": 0.0556,
"step": 337
},
{
"epoch": 1.113673805601318,
"grad_norm": 0.5591348812226693,
"learning_rate": 4.8703157653238055e-06,
"loss": 0.0526,
"step": 338
},
{
"epoch": 1.1169686985172982,
"grad_norm": 0.5103079438074868,
"learning_rate": 4.841505824699697e-06,
"loss": 0.0651,
"step": 339
},
{
"epoch": 1.1202635914332784,
"grad_norm": 0.6163138349681117,
"learning_rate": 4.812701150536254e-06,
"loss": 0.0509,
"step": 340
},
{
"epoch": 1.1235584843492585,
"grad_norm": 0.4842115475256147,
"learning_rate": 4.78390269995813e-06,
"loss": 0.035,
"step": 341
},
{
"epoch": 1.126853377265239,
"grad_norm": 0.4047877822645327,
"learning_rate": 4.755111429883191e-06,
"loss": 0.0342,
"step": 342
},
{
"epoch": 1.130148270181219,
"grad_norm": 0.5782935405242332,
"learning_rate": 4.726328296990699e-06,
"loss": 0.0416,
"step": 343
},
{
"epoch": 1.1334431630971993,
"grad_norm": 0.5846524401590787,
"learning_rate": 4.697554257689541e-06,
"loss": 0.0419,
"step": 344
},
{
"epoch": 1.1367380560131797,
"grad_norm": 0.5096985328650335,
"learning_rate": 4.668790268086432e-06,
"loss": 0.044,
"step": 345
},
{
"epoch": 1.1400329489291599,
"grad_norm": 0.5796683420196656,
"learning_rate": 4.640037283954165e-06,
"loss": 0.0634,
"step": 346
},
{
"epoch": 1.14332784184514,
"grad_norm": 0.5897186824110954,
"learning_rate": 4.611296260699833e-06,
"loss": 0.0511,
"step": 347
},
{
"epoch": 1.1466227347611202,
"grad_norm": 0.5043407904517478,
"learning_rate": 4.582568153333098e-06,
"loss": 0.0474,
"step": 348
},
{
"epoch": 1.1499176276771004,
"grad_norm": 0.49203813884361564,
"learning_rate": 4.553853916434448e-06,
"loss": 0.0399,
"step": 349
},
{
"epoch": 1.1532125205930808,
"grad_norm": 0.6380533145833258,
"learning_rate": 4.525154504123483e-06,
"loss": 0.0628,
"step": 350
},
{
"epoch": 1.156507413509061,
"grad_norm": 0.6307435685302706,
"learning_rate": 4.496470870027209e-06,
"loss": 0.0544,
"step": 351
},
{
"epoch": 1.1598023064250411,
"grad_norm": 0.58051661483701,
"learning_rate": 4.467803967248354e-06,
"loss": 0.0549,
"step": 352
},
{
"epoch": 1.1630971993410215,
"grad_norm": 0.45506940053593953,
"learning_rate": 4.439154748333695e-06,
"loss": 0.0455,
"step": 353
},
{
"epoch": 1.1663920922570017,
"grad_norm": 0.4477960561383021,
"learning_rate": 4.410524165242407e-06,
"loss": 0.0417,
"step": 354
},
{
"epoch": 1.1696869851729819,
"grad_norm": 0.5024790404868378,
"learning_rate": 4.381913169314432e-06,
"loss": 0.0483,
"step": 355
},
{
"epoch": 1.172981878088962,
"grad_norm": 0.43352510094853813,
"learning_rate": 4.3533227112388694e-06,
"loss": 0.0381,
"step": 356
},
{
"epoch": 1.1762767710049424,
"grad_norm": 0.8015757322992388,
"learning_rate": 4.324753741022383e-06,
"loss": 0.0589,
"step": 357
},
{
"epoch": 1.1795716639209226,
"grad_norm": 0.554923192898479,
"learning_rate": 4.296207207957638e-06,
"loss": 0.0469,
"step": 358
},
{
"epoch": 1.1828665568369028,
"grad_norm": 0.4540612599730088,
"learning_rate": 4.26768406059175e-06,
"loss": 0.0469,
"step": 359
},
{
"epoch": 1.186161449752883,
"grad_norm": 0.4977016265485015,
"learning_rate": 4.239185246694781e-06,
"loss": 0.0486,
"step": 360
},
{
"epoch": 1.1894563426688634,
"grad_norm": 0.5773178206107633,
"learning_rate": 4.21071171322823e-06,
"loss": 0.0588,
"step": 361
},
{
"epoch": 1.1927512355848435,
"grad_norm": 0.5714806332591411,
"learning_rate": 4.182264406313582e-06,
"loss": 0.0473,
"step": 362
},
{
"epoch": 1.1960461285008237,
"grad_norm": 0.5399317568380463,
"learning_rate": 4.1538442712008545e-06,
"loss": 0.0515,
"step": 363
},
{
"epoch": 1.1993410214168039,
"grad_norm": 0.5077736606662918,
"learning_rate": 4.12545225223721e-06,
"loss": 0.0473,
"step": 364
},
{
"epoch": 1.2026359143327843,
"grad_norm": 0.65833510309246,
"learning_rate": 4.097089292835551e-06,
"loss": 0.0574,
"step": 365
},
{
"epoch": 1.2059308072487644,
"grad_norm": 0.5750314764693017,
"learning_rate": 4.0687563354431986e-06,
"loss": 0.033,
"step": 366
},
{
"epoch": 1.2092257001647446,
"grad_norm": 0.6672168173906087,
"learning_rate": 4.040454321510554e-06,
"loss": 0.0507,
"step": 367
},
{
"epoch": 1.2125205930807248,
"grad_norm": 0.46572043828398524,
"learning_rate": 4.012184191459832e-06,
"loss": 0.0448,
"step": 368
},
{
"epoch": 1.2158154859967052,
"grad_norm": 0.5294456067061011,
"learning_rate": 3.983946884653804e-06,
"loss": 0.0421,
"step": 369
},
{
"epoch": 1.2191103789126854,
"grad_norm": 0.7181848630920071,
"learning_rate": 3.95574333936459e-06,
"loss": 0.0609,
"step": 370
},
{
"epoch": 1.2224052718286655,
"grad_norm": 0.4872681980462519,
"learning_rate": 3.927574492742473e-06,
"loss": 0.0332,
"step": 371
},
{
"epoch": 1.2257001647446457,
"grad_norm": 0.5978073219647344,
"learning_rate": 3.899441280784773e-06,
"loss": 0.0557,
"step": 372
},
{
"epoch": 1.2289950576606261,
"grad_norm": 0.49268040219816195,
"learning_rate": 3.8713446383047295e-06,
"loss": 0.0539,
"step": 373
},
{
"epoch": 1.2322899505766063,
"grad_norm": 0.553488767277818,
"learning_rate": 3.843285498900457e-06,
"loss": 0.0438,
"step": 374
},
{
"epoch": 1.2355848434925865,
"grad_norm": 0.5769809240481462,
"learning_rate": 3.815264794923903e-06,
"loss": 0.0438,
"step": 375
},
{
"epoch": 1.2388797364085666,
"grad_norm": 0.4680099999633115,
"learning_rate": 3.7872834574498894e-06,
"loss": 0.0391,
"step": 376
},
{
"epoch": 1.242174629324547,
"grad_norm": 0.4990397184455205,
"learning_rate": 3.7593424162451553e-06,
"loss": 0.0513,
"step": 377
},
{
"epoch": 1.2454695222405272,
"grad_norm": 0.5670279278262034,
"learning_rate": 3.731442599737478e-06,
"loss": 0.0611,
"step": 378
},
{
"epoch": 1.2487644151565074,
"grad_norm": 0.4178810778744549,
"learning_rate": 3.70358493498481e-06,
"loss": 0.0461,
"step": 379
},
{
"epoch": 1.2520593080724876,
"grad_norm": 0.5498450231361147,
"learning_rate": 3.6757703476444885e-06,
"loss": 0.0372,
"step": 380
},
{
"epoch": 1.255354200988468,
"grad_norm": 0.45367014770072983,
"learning_rate": 3.6479997619424605e-06,
"loss": 0.0423,
"step": 381
},
{
"epoch": 1.2586490939044481,
"grad_norm": 0.4294200611194709,
"learning_rate": 3.620274100642593e-06,
"loss": 0.0552,
"step": 382
},
{
"epoch": 1.2619439868204283,
"grad_norm": 0.6276700882265509,
"learning_rate": 3.5925942850159895e-06,
"loss": 0.0659,
"step": 383
},
{
"epoch": 1.2652388797364087,
"grad_norm": 0.7113783547292587,
"learning_rate": 3.564961234810399e-06,
"loss": 0.067,
"step": 384
},
{
"epoch": 1.2685337726523889,
"grad_norm": 0.6367177743488461,
"learning_rate": 3.5373758682196347e-06,
"loss": 0.0626,
"step": 385
},
{
"epoch": 1.271828665568369,
"grad_norm": 0.6068919065481327,
"learning_rate": 3.509839101853082e-06,
"loss": 0.0546,
"step": 386
},
{
"epoch": 1.2751235584843492,
"grad_norm": 0.742600911574775,
"learning_rate": 3.4823518507052277e-06,
"loss": 0.061,
"step": 387
},
{
"epoch": 1.2784184514003294,
"grad_norm": 0.4142179254874713,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.0405,
"step": 388
},
{
"epoch": 1.2817133443163098,
"grad_norm": 0.7545310044049625,
"learning_rate": 3.427529545786736e-06,
"loss": 0.055,
"step": 389
},
{
"epoch": 1.28500823723229,
"grad_norm": 0.5556958907162003,
"learning_rate": 3.400196313657253e-06,
"loss": 0.0469,
"step": 390
},
{
"epoch": 1.2883031301482701,
"grad_norm": 0.5082442265119059,
"learning_rate": 3.372916239968246e-06,
"loss": 0.048,
"step": 391
},
{
"epoch": 1.2915980230642505,
"grad_norm": 0.5683891171997948,
"learning_rate": 3.345690231184794e-06,
"loss": 0.0413,
"step": 392
},
{
"epoch": 1.2948929159802307,
"grad_norm": 0.6720011431709395,
"learning_rate": 3.318519191975499e-06,
"loss": 0.0604,
"step": 393
},
{
"epoch": 1.2981878088962109,
"grad_norm": 0.5633138961258451,
"learning_rate": 3.291404025182432e-06,
"loss": 0.065,
"step": 394
},
{
"epoch": 1.301482701812191,
"grad_norm": 0.7184125904469478,
"learning_rate": 3.264345631791127e-06,
"loss": 0.0653,
"step": 395
},
{
"epoch": 1.3047775947281712,
"grad_norm": 0.6249487221408845,
"learning_rate": 3.2373449109006476e-06,
"loss": 0.0476,
"step": 396
},
{
"epoch": 1.3080724876441516,
"grad_norm": 0.8032427870564648,
"learning_rate": 3.210402759693706e-06,
"loss": 0.0731,
"step": 397
},
{
"epoch": 1.3113673805601318,
"grad_norm": 0.56609919677685,
"learning_rate": 3.1835200734068604e-06,
"loss": 0.0484,
"step": 398
},
{
"epoch": 1.314662273476112,
"grad_norm": 0.6259799233731378,
"learning_rate": 3.1566977453007564e-06,
"loss": 0.0621,
"step": 399
},
{
"epoch": 1.3179571663920924,
"grad_norm": 0.5571126132586377,
"learning_rate": 3.1299366666304586e-06,
"loss": 0.0536,
"step": 400
},
{
"epoch": 1.3212520593080725,
"grad_norm": 0.7177292111816991,
"learning_rate": 3.103237726615822e-06,
"loss": 0.0634,
"step": 401
},
{
"epoch": 1.3245469522240527,
"grad_norm": 0.4655411649268851,
"learning_rate": 3.076601812411959e-06,
"loss": 0.0386,
"step": 402
},
{
"epoch": 1.327841845140033,
"grad_norm": 0.5171630532203868,
"learning_rate": 3.0500298090797465e-06,
"loss": 0.0483,
"step": 403
},
{
"epoch": 1.331136738056013,
"grad_norm": 0.624563937797765,
"learning_rate": 3.0235225995564323e-06,
"loss": 0.0556,
"step": 404
},
{
"epoch": 1.3344316309719935,
"grad_norm": 0.5622033908392009,
"learning_rate": 2.9970810646262805e-06,
"loss": 0.0478,
"step": 405
},
{
"epoch": 1.3377265238879736,
"grad_norm": 0.4858535947041361,
"learning_rate": 2.9707060828913226e-06,
"loss": 0.0478,
"step": 406
},
{
"epoch": 1.3410214168039538,
"grad_norm": 0.40196887575712115,
"learning_rate": 2.944398530742144e-06,
"loss": 0.0462,
"step": 407
},
{
"epoch": 1.3443163097199342,
"grad_norm": 0.5650818660979695,
"learning_rate": 2.9181592823287807e-06,
"loss": 0.0655,
"step": 408
},
{
"epoch": 1.3476112026359144,
"grad_norm": 0.5475272072919456,
"learning_rate": 2.8919892095316616e-06,
"loss": 0.0519,
"step": 409
},
{
"epoch": 1.3509060955518946,
"grad_norm": 0.6655092099152591,
"learning_rate": 2.865889181932639e-06,
"loss": 0.0416,
"step": 410
},
{
"epoch": 1.3542009884678747,
"grad_norm": 0.8536644782226072,
"learning_rate": 2.8398600667861032e-06,
"loss": 0.0669,
"step": 411
},
{
"epoch": 1.357495881383855,
"grad_norm": 0.48323829394508544,
"learning_rate": 2.813902728990149e-06,
"loss": 0.0367,
"step": 412
},
{
"epoch": 1.3607907742998353,
"grad_norm": 0.6949809437868909,
"learning_rate": 2.7880180310578546e-06,
"loss": 0.0523,
"step": 413
},
{
"epoch": 1.3640856672158155,
"grad_norm": 0.43770382604271224,
"learning_rate": 2.762206833088608e-06,
"loss": 0.0527,
"step": 414
},
{
"epoch": 1.3673805601317957,
"grad_norm": 0.5081547664603686,
"learning_rate": 2.7364699927395355e-06,
"loss": 0.0613,
"step": 415
},
{
"epoch": 1.370675453047776,
"grad_norm": 0.6539834762026684,
"learning_rate": 2.710808365197e-06,
"loss": 0.0555,
"step": 416
},
{
"epoch": 1.3739703459637562,
"grad_norm": 0.39905302203795334,
"learning_rate": 2.6852228031481837e-06,
"loss": 0.0408,
"step": 417
},
{
"epoch": 1.3772652388797364,
"grad_norm": 0.45958447904532335,
"learning_rate": 2.6597141567527614e-06,
"loss": 0.0503,
"step": 418
},
{
"epoch": 1.3805601317957166,
"grad_norm": 0.4995326285015215,
"learning_rate": 2.6342832736146403e-06,
"loss": 0.0605,
"step": 419
},
{
"epoch": 1.3838550247116967,
"grad_norm": 0.4884609849725302,
"learning_rate": 2.608930998753809e-06,
"loss": 0.0602,
"step": 420
},
{
"epoch": 1.3871499176276771,
"grad_norm": 0.48846990238806653,
"learning_rate": 2.5836581745782474e-06,
"loss": 0.0429,
"step": 421
},
{
"epoch": 1.3904448105436573,
"grad_norm": 0.5807005137969414,
"learning_rate": 2.558465640855943e-06,
"loss": 0.052,
"step": 422
},
{
"epoch": 1.3937397034596375,
"grad_norm": 0.3856666040192254,
"learning_rate": 2.533354234686979e-06,
"loss": 0.0422,
"step": 423
},
{
"epoch": 1.3970345963756179,
"grad_norm": 0.3701363763937253,
"learning_rate": 2.508324790475731e-06,
"loss": 0.0449,
"step": 424
},
{
"epoch": 1.400329489291598,
"grad_norm": 0.5283156125790535,
"learning_rate": 2.4833781399031275e-06,
"loss": 0.0583,
"step": 425
},
{
"epoch": 1.4036243822075782,
"grad_norm": 0.5143083208475716,
"learning_rate": 2.4585151118990286e-06,
"loss": 0.0582,
"step": 426
},
{
"epoch": 1.4069192751235584,
"grad_norm": 0.4580082823859306,
"learning_rate": 2.433736532614666e-06,
"loss": 0.0503,
"step": 427
},
{
"epoch": 1.4102141680395386,
"grad_norm": 0.4733586276806861,
"learning_rate": 2.4090432253952113e-06,
"loss": 0.0595,
"step": 428
},
{
"epoch": 1.413509060955519,
"grad_norm": 0.46027613089003067,
"learning_rate": 2.3844360107523973e-06,
"loss": 0.0334,
"step": 429
},
{
"epoch": 1.4168039538714992,
"grad_norm": 0.618163403358967,
"learning_rate": 2.3599157063372712e-06,
"loss": 0.0505,
"step": 430
},
{
"epoch": 1.4200988467874793,
"grad_norm": 0.5692914543756001,
"learning_rate": 2.3354831269130133e-06,
"loss": 0.047,
"step": 431
},
{
"epoch": 1.4233937397034597,
"grad_norm": 0.5569933619176715,
"learning_rate": 2.3111390843278743e-06,
"loss": 0.0506,
"step": 432
},
{
"epoch": 1.42668863261944,
"grad_norm": 0.4384099356121434,
"learning_rate": 2.2868843874881856e-06,
"loss": 0.0453,
"step": 433
},
{
"epoch": 1.42998352553542,
"grad_norm": 0.5320473584418453,
"learning_rate": 2.2627198423314988e-06,
"loss": 0.0547,
"step": 434
},
{
"epoch": 1.4332784184514002,
"grad_norm": 0.4951776816150561,
"learning_rate": 2.238646251799787e-06,
"loss": 0.0517,
"step": 435
},
{
"epoch": 1.4365733113673804,
"grad_norm": 0.5305051346570233,
"learning_rate": 2.2146644158127827e-06,
"loss": 0.0508,
"step": 436
},
{
"epoch": 1.4398682042833608,
"grad_norm": 0.48235120417487776,
"learning_rate": 2.1907751312413793e-06,
"loss": 0.0498,
"step": 437
},
{
"epoch": 1.443163097199341,
"grad_norm": 0.7575565682766872,
"learning_rate": 2.1669791918811724e-06,
"loss": 0.0482,
"step": 438
},
{
"epoch": 1.4464579901153214,
"grad_norm": 0.6122464829305898,
"learning_rate": 2.1432773884260627e-06,
"loss": 0.0661,
"step": 439
},
{
"epoch": 1.4497528830313016,
"grad_norm": 0.49382428143445756,
"learning_rate": 2.119670508442004e-06,
"loss": 0.0372,
"step": 440
},
{
"epoch": 1.4530477759472817,
"grad_norm": 0.6113296705934868,
"learning_rate": 2.0961593363408154e-06,
"loss": 0.0489,
"step": 441
},
{
"epoch": 1.456342668863262,
"grad_norm": 0.4764803472849658,
"learning_rate": 2.0727446533541302e-06,
"loss": 0.0426,
"step": 442
},
{
"epoch": 1.459637561779242,
"grad_norm": 0.5321931460957434,
"learning_rate": 2.0494272375074247e-06,
"loss": 0.0428,
"step": 443
},
{
"epoch": 1.4629324546952225,
"grad_norm": 0.43368533141343174,
"learning_rate": 2.0262078635941818e-06,
"loss": 0.0377,
"step": 444
},
{
"epoch": 1.4662273476112027,
"grad_norm": 0.5227900476116077,
"learning_rate": 2.0030873031501274e-06,
"loss": 0.048,
"step": 445
},
{
"epoch": 1.4695222405271828,
"grad_norm": 0.40044438580877817,
"learning_rate": 1.980066324427613e-06,
"loss": 0.0367,
"step": 446
},
{
"epoch": 1.4728171334431632,
"grad_norm": 0.42569057497544066,
"learning_rate": 1.9571456923700696e-06,
"loss": 0.0485,
"step": 447
},
{
"epoch": 1.4761120263591434,
"grad_norm": 0.5011955876540544,
"learning_rate": 1.9343261685866054e-06,
"loss": 0.0684,
"step": 448
},
{
"epoch": 1.4794069192751236,
"grad_norm": 0.5257059685422952,
"learning_rate": 1.911608511326688e-06,
"loss": 0.0469,
"step": 449
},
{
"epoch": 1.4827018121911038,
"grad_norm": 0.5330212717649231,
"learning_rate": 1.8889934754549583e-06,
"loss": 0.0615,
"step": 450
},
{
"epoch": 1.485996705107084,
"grad_norm": 0.4377288880184422,
"learning_rate": 1.8664818124261375e-06,
"loss": 0.04,
"step": 451
},
{
"epoch": 1.4892915980230643,
"grad_norm": 0.4821221712040424,
"learning_rate": 1.8440742702600706e-06,
"loss": 0.0496,
"step": 452
},
{
"epoch": 1.4925864909390445,
"grad_norm": 0.42358079608202237,
"learning_rate": 1.8217715935168562e-06,
"loss": 0.0446,
"step": 453
},
{
"epoch": 1.4958813838550247,
"grad_norm": 0.6521628225316723,
"learning_rate": 1.7995745232721207e-06,
"loss": 0.0665,
"step": 454
},
{
"epoch": 1.499176276771005,
"grad_norm": 0.5512352891912379,
"learning_rate": 1.777483797092381e-06,
"loss": 0.0527,
"step": 455
},
{
"epoch": 1.5024711696869852,
"grad_norm": 0.4132207895248971,
"learning_rate": 1.755500149010549e-06,
"loss": 0.0369,
"step": 456
},
{
"epoch": 1.5057660626029654,
"grad_norm": 0.5452488198197322,
"learning_rate": 1.7336243095015271e-06,
"loss": 0.0457,
"step": 457
},
{
"epoch": 1.5090609555189456,
"grad_norm": 1.8013972479013802,
"learning_rate": 1.7118570054579508e-06,
"loss": 0.0788,
"step": 458
},
{
"epoch": 1.5123558484349258,
"grad_norm": 0.6158971711077378,
"learning_rate": 1.6901989601660224e-06,
"loss": 0.0577,
"step": 459
},
{
"epoch": 1.515650741350906,
"grad_norm": 0.6402888520963839,
"learning_rate": 1.6686508932814871e-06,
"loss": 0.0426,
"step": 460
},
{
"epoch": 1.5189456342668863,
"grad_norm": 0.5815365915637473,
"learning_rate": 1.6472135208057128e-06,
"loss": 0.0526,
"step": 461
},
{
"epoch": 1.5222405271828665,
"grad_norm": 0.5219074399966507,
"learning_rate": 1.625887555061907e-06,
"loss": 0.0428,
"step": 462
},
{
"epoch": 1.525535420098847,
"grad_norm": 0.5007230705662209,
"learning_rate": 1.6046737046714366e-06,
"loss": 0.0386,
"step": 463
},
{
"epoch": 1.528830313014827,
"grad_norm": 0.492364769802372,
"learning_rate": 1.5835726745302953e-06,
"loss": 0.0364,
"step": 464
},
{
"epoch": 1.5321252059308073,
"grad_norm": 0.5652000556154251,
"learning_rate": 1.5625851657856666e-06,
"loss": 0.0546,
"step": 465
},
{
"epoch": 1.5354200988467874,
"grad_norm": 0.5993897339775979,
"learning_rate": 1.5417118758126408e-06,
"loss": 0.0579,
"step": 466
},
{
"epoch": 1.5387149917627676,
"grad_norm": 1.3251444571487765,
"learning_rate": 1.520953498191028e-06,
"loss": 0.0747,
"step": 467
},
{
"epoch": 1.5420098846787478,
"grad_norm": 0.440371155414081,
"learning_rate": 1.5003107226823255e-06,
"loss": 0.0495,
"step": 468
},
{
"epoch": 1.5453047775947282,
"grad_norm": 0.5274460518323345,
"learning_rate": 1.479784235206786e-06,
"loss": 0.0457,
"step": 469
},
{
"epoch": 1.5485996705107083,
"grad_norm": 0.4509159507608483,
"learning_rate": 1.459374717820637e-06,
"loss": 0.0441,
"step": 470
},
{
"epoch": 1.5518945634266887,
"grad_norm": 0.5787329784185842,
"learning_rate": 1.439082848693406e-06,
"loss": 0.0455,
"step": 471
},
{
"epoch": 1.555189456342669,
"grad_norm": 0.9428413760935695,
"learning_rate": 1.4189093020853989e-06,
"loss": 0.0635,
"step": 472
},
{
"epoch": 1.558484349258649,
"grad_norm": 0.5810607886116554,
"learning_rate": 1.3988547483252812e-06,
"loss": 0.0591,
"step": 473
},
{
"epoch": 1.5617792421746293,
"grad_norm": 0.5621586581333317,
"learning_rate": 1.3789198537878202e-06,
"loss": 0.048,
"step": 474
},
{
"epoch": 1.5650741350906094,
"grad_norm": 0.614476271893997,
"learning_rate": 1.3591052808717258e-06,
"loss": 0.0574,
"step": 475
},
{
"epoch": 1.5683690280065898,
"grad_norm": 0.4366280882804736,
"learning_rate": 1.339411687977657e-06,
"loss": 0.0387,
"step": 476
},
{
"epoch": 1.57166392092257,
"grad_norm": 1.654229386447125,
"learning_rate": 1.3198397294863285e-06,
"loss": 0.0525,
"step": 477
},
{
"epoch": 1.5749588138385504,
"grad_norm": 0.5124984935464315,
"learning_rate": 1.3003900557367816e-06,
"loss": 0.0586,
"step": 478
},
{
"epoch": 1.5782537067545306,
"grad_norm": 0.5039902746309534,
"learning_rate": 1.281063313004761e-06,
"loss": 0.0409,
"step": 479
},
{
"epoch": 1.5815485996705108,
"grad_norm": 0.4453799136874429,
"learning_rate": 1.261860143481255e-06,
"loss": 0.0437,
"step": 480
},
{
"epoch": 1.584843492586491,
"grad_norm": 0.44736265726220936,
"learning_rate": 1.2427811852511396e-06,
"loss": 0.05,
"step": 481
},
{
"epoch": 1.588138385502471,
"grad_norm": 0.5751552043472024,
"learning_rate": 1.223827072271993e-06,
"loss": 0.0513,
"step": 482
},
{
"epoch": 1.5914332784184513,
"grad_norm": 0.4854076213664054,
"learning_rate": 1.204998434353018e-06,
"loss": 0.0434,
"step": 483
},
{
"epoch": 1.5947281713344317,
"grad_norm": 0.5304616858985192,
"learning_rate": 1.1862958971341199e-06,
"loss": 0.0537,
"step": 484
},
{
"epoch": 1.5980230642504119,
"grad_norm": 0.5357970833666896,
"learning_rate": 1.1677200820651197e-06,
"loss": 0.049,
"step": 485
},
{
"epoch": 1.6013179571663922,
"grad_norm": 0.6703644083736745,
"learning_rate": 1.1492716063850973e-06,
"loss": 0.0553,
"step": 486
},
{
"epoch": 1.6046128500823724,
"grad_norm": 0.5104289346948437,
"learning_rate": 1.1309510831018927e-06,
"loss": 0.0484,
"step": 487
},
{
"epoch": 1.6079077429983526,
"grad_norm": 0.791691486031595,
"learning_rate": 1.112759120971723e-06,
"loss": 0.0516,
"step": 488
},
{
"epoch": 1.6112026359143328,
"grad_norm": 0.5044446696201748,
"learning_rate": 1.09469632447897e-06,
"loss": 0.0412,
"step": 489
},
{
"epoch": 1.614497528830313,
"grad_norm": 0.44744812374789733,
"learning_rate": 1.0767632938160787e-06,
"loss": 0.0441,
"step": 490
},
{
"epoch": 1.6177924217462931,
"grad_norm": 0.689652904031741,
"learning_rate": 1.0589606248636291e-06,
"loss": 0.0468,
"step": 491
},
{
"epoch": 1.6210873146622735,
"grad_norm": 0.4644587386351254,
"learning_rate": 1.0412889091705242e-06,
"loss": 0.0356,
"step": 492
},
{
"epoch": 1.6243822075782537,
"grad_norm": 0.6499107202235256,
"learning_rate": 1.0237487339343382e-06,
"loss": 0.0574,
"step": 493
},
{
"epoch": 1.627677100494234,
"grad_norm": 0.4542177117918383,
"learning_rate": 1.0063406819818106e-06,
"loss": 0.0443,
"step": 494
},
{
"epoch": 1.6309719934102143,
"grad_norm": 0.6343789726555299,
"learning_rate": 9.890653317494686e-07,
"loss": 0.0524,
"step": 495
},
{
"epoch": 1.6342668863261944,
"grad_norm": 0.4208852075289343,
"learning_rate": 9.719232572644189e-07,
"loss": 0.0407,
"step": 496
},
{
"epoch": 1.6375617792421746,
"grad_norm": 0.45018417664569393,
"learning_rate": 9.549150281252633e-07,
"loss": 0.0382,
"step": 497
},
{
"epoch": 1.6408566721581548,
"grad_norm": 0.4664038740894182,
"learning_rate": 9.380412094831809e-07,
"loss": 0.0413,
"step": 498
},
{
"epoch": 1.644151565074135,
"grad_norm": 0.5658115763517576,
"learning_rate": 9.213023620231404e-07,
"loss": 0.055,
"step": 499
},
{
"epoch": 1.6474464579901154,
"grad_norm": 0.42505631586549236,
"learning_rate": 9.046990419452795e-07,
"loss": 0.0374,
"step": 500
},
{
"epoch": 1.6507413509060955,
"grad_norm": 0.5554370313066022,
"learning_rate": 8.882318009464124e-07,
"loss": 0.0758,
"step": 501
},
{
"epoch": 1.654036243822076,
"grad_norm": 0.6820183163387327,
"learning_rate": 8.719011862017108e-07,
"loss": 0.067,
"step": 502
},
{
"epoch": 1.657331136738056,
"grad_norm": 0.8815741831945997,
"learning_rate": 8.557077403465069e-07,
"loss": 0.0635,
"step": 503
},
{
"epoch": 1.6606260296540363,
"grad_norm": 0.6530261534927284,
"learning_rate": 8.396520014582798e-07,
"loss": 0.0564,
"step": 504
},
{
"epoch": 1.6639209225700164,
"grad_norm": 0.5563910425802013,
"learning_rate": 8.237345030387589e-07,
"loss": 0.0568,
"step": 505
},
{
"epoch": 1.6672158154859966,
"grad_norm": 1.6616538016948608,
"learning_rate": 8.079557739962129e-07,
"loss": 0.0433,
"step": 506
},
{
"epoch": 1.6705107084019768,
"grad_norm": 0.4729743527848457,
"learning_rate": 7.923163386278615e-07,
"loss": 0.0477,
"step": 507
},
{
"epoch": 1.6738056013179572,
"grad_norm": 0.484207261501026,
"learning_rate": 7.768167166024637e-07,
"loss": 0.0393,
"step": 508
},
{
"epoch": 1.6771004942339374,
"grad_norm": 0.4347790202564516,
"learning_rate": 7.614574229430432e-07,
"loss": 0.0348,
"step": 509
},
{
"epoch": 1.6803953871499178,
"grad_norm": 0.5159230901740568,
"learning_rate": 7.462389680097831e-07,
"loss": 0.0511,
"step": 510
},
{
"epoch": 1.683690280065898,
"grad_norm": 0.7846302974584749,
"learning_rate": 7.31161857483057e-07,
"loss": 0.0428,
"step": 511
},
{
"epoch": 1.6869851729818781,
"grad_norm": 0.5154541689981792,
"learning_rate": 7.162265923466383e-07,
"loss": 0.0481,
"step": 512
},
{
"epoch": 1.6902800658978583,
"grad_norm": 0.5103769113667321,
"learning_rate": 7.014336688710411e-07,
"loss": 0.0559,
"step": 513
},
{
"epoch": 1.6935749588138385,
"grad_norm": 0.5601518054326986,
"learning_rate": 6.867835785970417e-07,
"loss": 0.0383,
"step": 514
},
{
"epoch": 1.6968698517298186,
"grad_norm": 0.452690076025677,
"learning_rate": 6.722768083193354e-07,
"loss": 0.0393,
"step": 515
},
{
"epoch": 1.700164744645799,
"grad_norm": 0.46134612749678455,
"learning_rate": 6.579138400703716e-07,
"loss": 0.0515,
"step": 516
},
{
"epoch": 1.7034596375617792,
"grad_norm": 0.41746324511751276,
"learning_rate": 6.436951511043243e-07,
"loss": 0.0445,
"step": 517
},
{
"epoch": 1.7067545304777596,
"grad_norm": 0.45014253067830906,
"learning_rate": 6.296212138812474e-07,
"loss": 0.0438,
"step": 518
},
{
"epoch": 1.7100494233937398,
"grad_norm": 0.49892377023307155,
"learning_rate": 6.156924960513638e-07,
"loss": 0.0452,
"step": 519
},
{
"epoch": 1.71334431630972,
"grad_norm": 0.47105725038366275,
"learning_rate": 6.019094604395359e-07,
"loss": 0.054,
"step": 520
},
{
"epoch": 1.7166392092257001,
"grad_norm": 0.4259013974972623,
"learning_rate": 5.882725650298787e-07,
"loss": 0.0377,
"step": 521
},
{
"epoch": 1.7199341021416803,
"grad_norm": 0.7122055487087868,
"learning_rate": 5.747822629505484e-07,
"loss": 0.0606,
"step": 522
},
{
"epoch": 1.7232289950576605,
"grad_norm": 0.6463022317948001,
"learning_rate": 5.614390024586808e-07,
"loss": 0.0948,
"step": 523
},
{
"epoch": 1.7265238879736409,
"grad_norm": 0.6557315175026269,
"learning_rate": 5.482432269255011e-07,
"loss": 0.0594,
"step": 524
},
{
"epoch": 1.729818780889621,
"grad_norm": 0.48159732485479767,
"learning_rate": 5.351953748215872e-07,
"loss": 0.0562,
"step": 525
},
{
"epoch": 1.7331136738056014,
"grad_norm": 0.5429610524246544,
"learning_rate": 5.222958797023036e-07,
"loss": 0.0469,
"step": 526
},
{
"epoch": 1.7364085667215816,
"grad_norm": 0.4703329992162841,
"learning_rate": 5.095451701933923e-07,
"loss": 0.0495,
"step": 527
},
{
"epoch": 1.7397034596375618,
"grad_norm": 0.4436763771779832,
"learning_rate": 4.969436699767344e-07,
"loss": 0.0354,
"step": 528
},
{
"epoch": 1.742998352553542,
"grad_norm": 0.5025190325700492,
"learning_rate": 4.844917977762653e-07,
"loss": 0.056,
"step": 529
},
{
"epoch": 1.7462932454695221,
"grad_norm": 0.661761525862607,
"learning_rate": 4.721899673440694e-07,
"loss": 0.0436,
"step": 530
},
{
"epoch": 1.7495881383855023,
"grad_norm": 0.5762138308981635,
"learning_rate": 4.6003858744662564e-07,
"loss": 0.0552,
"step": 531
},
{
"epoch": 1.7528830313014827,
"grad_norm": 0.5387051088420545,
"learning_rate": 4.4803806185122866e-07,
"loss": 0.0479,
"step": 532
},
{
"epoch": 1.7561779242174629,
"grad_norm": 0.537079178923195,
"learning_rate": 4.361887893125677e-07,
"loss": 0.0565,
"step": 533
},
{
"epoch": 1.7594728171334433,
"grad_norm": 0.5780295144664594,
"learning_rate": 4.244911635594856e-07,
"loss": 0.0555,
"step": 534
},
{
"epoch": 1.7627677100494235,
"grad_norm": 0.6971174595241891,
"learning_rate": 4.1294557328188376e-07,
"loss": 0.0468,
"step": 535
},
{
"epoch": 1.7660626029654036,
"grad_norm": 0.5714544842706962,
"learning_rate": 4.0155240211781966e-07,
"loss": 0.069,
"step": 536
},
{
"epoch": 1.7693574958813838,
"grad_norm": 0.5560933792076089,
"learning_rate": 3.9031202864074634e-07,
"loss": 0.0526,
"step": 537
},
{
"epoch": 1.772652388797364,
"grad_norm": 0.5076535062447541,
"learning_rate": 3.7922482634694667e-07,
"loss": 0.0495,
"step": 538
},
{
"epoch": 1.7759472817133442,
"grad_norm": 0.4733107950145234,
"learning_rate": 3.6829116364310914e-07,
"loss": 0.048,
"step": 539
},
{
"epoch": 1.7792421746293245,
"grad_norm": 0.5645137566585913,
"learning_rate": 3.575114038340977e-07,
"loss": 0.0503,
"step": 540
},
{
"epoch": 1.782537067545305,
"grad_norm": 0.5224355605212421,
"learning_rate": 3.4688590511087304e-07,
"loss": 0.0553,
"step": 541
},
{
"epoch": 1.7858319604612851,
"grad_norm": 0.495152224806279,
"learning_rate": 3.3641502053859355e-07,
"loss": 0.0304,
"step": 542
},
{
"epoch": 1.7891268533772653,
"grad_norm": 0.40973303317244536,
"learning_rate": 3.2609909804488195e-07,
"loss": 0.0313,
"step": 543
},
{
"epoch": 1.7924217462932455,
"grad_norm": 0.47058046777642437,
"learning_rate": 3.159384804082666e-07,
"loss": 0.0526,
"step": 544
},
{
"epoch": 1.7957166392092256,
"grad_norm": 0.4838577514212764,
"learning_rate": 3.0593350524678823e-07,
"loss": 0.0371,
"step": 545
},
{
"epoch": 1.7990115321252058,
"grad_norm": 0.6690590772761237,
"learning_rate": 2.9608450500678566e-07,
"loss": 0.0604,
"step": 546
},
{
"epoch": 1.8023064250411862,
"grad_norm": 0.4109056760035354,
"learning_rate": 2.863918069518451e-07,
"loss": 0.0331,
"step": 547
},
{
"epoch": 1.8056013179571664,
"grad_norm": 0.7159007380510568,
"learning_rate": 2.7685573315192895e-07,
"loss": 0.0721,
"step": 548
},
{
"epoch": 1.8088962108731468,
"grad_norm": 0.4865674890840018,
"learning_rate": 2.67476600472672e-07,
"loss": 0.0451,
"step": 549
},
{
"epoch": 1.812191103789127,
"grad_norm": 0.5942738860891101,
"learning_rate": 2.5825472056485556e-07,
"loss": 0.062,
"step": 550
},
{
"epoch": 1.8154859967051071,
"grad_norm": 0.5357536121944303,
"learning_rate": 2.4919039985404626e-07,
"loss": 0.0609,
"step": 551
},
{
"epoch": 1.8187808896210873,
"grad_norm": 0.3866866163849165,
"learning_rate": 2.4028393953042074e-07,
"loss": 0.0296,
"step": 552
},
{
"epoch": 1.8220757825370675,
"grad_norm": 0.49750309911350266,
"learning_rate": 2.315356355387527e-07,
"loss": 0.0444,
"step": 553
},
{
"epoch": 1.8253706754530477,
"grad_norm": 0.6734726196583376,
"learning_rate": 2.2294577856858236e-07,
"loss": 0.0552,
"step": 554
},
{
"epoch": 1.828665568369028,
"grad_norm": 0.4599661443293077,
"learning_rate": 2.1451465404455473e-07,
"loss": 0.041,
"step": 555
},
{
"epoch": 1.8319604612850082,
"grad_norm": 0.691277736971953,
"learning_rate": 2.0624254211693894e-07,
"loss": 0.061,
"step": 556
},
{
"epoch": 1.8352553542009886,
"grad_norm": 0.42780093096680555,
"learning_rate": 1.9812971765231394e-07,
"loss": 0.0364,
"step": 557
},
{
"epoch": 1.8385502471169688,
"grad_norm": 0.578589972444535,
"learning_rate": 1.901764502244424e-07,
"loss": 0.0613,
"step": 558
},
{
"epoch": 1.841845140032949,
"grad_norm": 0.4763862013898226,
"learning_rate": 1.823830041053065e-07,
"loss": 0.0437,
"step": 559
},
{
"epoch": 1.8451400329489291,
"grad_norm": 0.45718093901396817,
"learning_rate": 1.7474963825633185e-07,
"loss": 0.043,
"step": 560
},
{
"epoch": 1.8484349258649093,
"grad_norm": 0.6366482024659451,
"learning_rate": 1.6727660631977894e-07,
"loss": 0.0556,
"step": 561
},
{
"epoch": 1.8517298187808895,
"grad_norm": 0.7471772626827173,
"learning_rate": 1.5996415661031662e-07,
"loss": 0.0425,
"step": 562
},
{
"epoch": 1.8550247116968699,
"grad_norm": 0.3521582825319659,
"learning_rate": 1.528125321067725e-07,
"loss": 0.0382,
"step": 563
},
{
"epoch": 1.85831960461285,
"grad_norm": 0.5973061078363581,
"learning_rate": 1.4582197044405556e-07,
"loss": 0.0509,
"step": 564
},
{
"epoch": 1.8616144975288305,
"grad_norm": 0.5093523377380892,
"learning_rate": 1.389927039052652e-07,
"loss": 0.0444,
"step": 565
},
{
"epoch": 1.8649093904448106,
"grad_norm": 0.44171165477963154,
"learning_rate": 1.323249594139664e-07,
"loss": 0.0468,
"step": 566
},
{
"epoch": 1.8682042833607908,
"grad_norm": 0.44675757705445573,
"learning_rate": 1.2581895852665671e-07,
"loss": 0.0374,
"step": 567
},
{
"epoch": 1.871499176276771,
"grad_norm": 0.6020514933209717,
"learning_rate": 1.1947491742539841e-07,
"loss": 0.0503,
"step": 568
},
{
"epoch": 1.8747940691927512,
"grad_norm": 0.42586058680868216,
"learning_rate": 1.1329304691063692e-07,
"loss": 0.0392,
"step": 569
},
{
"epoch": 1.8780889621087313,
"grad_norm": 0.4423611658945744,
"learning_rate": 1.0727355239419868e-07,
"loss": 0.0469,
"step": 570
},
{
"epoch": 1.8813838550247117,
"grad_norm": 0.424883733072018,
"learning_rate": 1.014166338924627e-07,
"loss": 0.0475,
"step": 571
},
{
"epoch": 1.884678747940692,
"grad_norm": 0.508275043696402,
"learning_rate": 9.572248601971646e-08,
"loss": 0.0588,
"step": 572
},
{
"epoch": 1.8879736408566723,
"grad_norm": 0.532268589710946,
"learning_rate": 9.019129798168658e-08,
"loss": 0.0413,
"step": 573
},
{
"epoch": 1.8912685337726525,
"grad_norm": 0.49075007985093444,
"learning_rate": 8.482325356925614e-08,
"loss": 0.0438,
"step": 574
},
{
"epoch": 1.8945634266886326,
"grad_norm": 0.6405783999237776,
"learning_rate": 7.96185311523523e-08,
"loss": 0.0517,
"step": 575
},
{
"epoch": 1.8978583196046128,
"grad_norm": 0.4374824821968711,
"learning_rate": 7.45773036740255e-08,
"loss": 0.0594,
"step": 576
},
{
"epoch": 1.901153212520593,
"grad_norm": 0.4881691605292657,
"learning_rate": 6.969973864469626e-08,
"loss": 0.0478,
"step": 577
},
{
"epoch": 1.9044481054365732,
"grad_norm": 0.4169109199296669,
"learning_rate": 6.498599813659524e-08,
"loss": 0.0329,
"step": 578
},
{
"epoch": 1.9077429983525536,
"grad_norm": 0.7757102581030492,
"learning_rate": 6.043623877837301e-08,
"loss": 0.0459,
"step": 579
},
{
"epoch": 1.9110378912685337,
"grad_norm": 0.6871632648014142,
"learning_rate": 5.6050611749899896e-08,
"loss": 0.0499,
"step": 580
},
{
"epoch": 1.9143327841845141,
"grad_norm": 0.4909555169664079,
"learning_rate": 5.182926277723821e-08,
"loss": 0.0439,
"step": 581
},
{
"epoch": 1.9176276771004943,
"grad_norm": 0.5085351517406403,
"learning_rate": 4.777233212780396e-08,
"loss": 0.043,
"step": 582
},
{
"epoch": 1.9209225700164745,
"grad_norm": 0.4133261269232907,
"learning_rate": 4.387995460570282e-08,
"loss": 0.0422,
"step": 583
},
{
"epoch": 1.9242174629324547,
"grad_norm": 0.3457486280213094,
"learning_rate": 4.015225954725421e-08,
"loss": 0.0302,
"step": 584
},
{
"epoch": 1.9275123558484348,
"grad_norm": 0.5306123017488412,
"learning_rate": 3.658937081669034e-08,
"loss": 0.0347,
"step": 585
},
{
"epoch": 1.930807248764415,
"grad_norm": 0.6170841106215647,
"learning_rate": 3.3191406802041693e-08,
"loss": 0.0427,
"step": 586
},
{
"epoch": 1.9341021416803954,
"grad_norm": 0.45743745755641063,
"learning_rate": 2.9958480411204086e-08,
"loss": 0.0487,
"step": 587
},
{
"epoch": 1.9373970345963756,
"grad_norm": 0.7985516650647139,
"learning_rate": 2.6890699068187197e-08,
"loss": 0.0598,
"step": 588
},
{
"epoch": 1.940691927512356,
"grad_norm": 0.969435591211712,
"learning_rate": 2.3988164709542462e-08,
"loss": 0.046,
"step": 589
},
{
"epoch": 1.9439868204283361,
"grad_norm": 0.5530499902423203,
"learning_rate": 2.1250973780977957e-08,
"loss": 0.0626,
"step": 590
},
{
"epoch": 1.9472817133443163,
"grad_norm": 0.4726855774076432,
"learning_rate": 1.8679217234154335e-08,
"loss": 0.0442,
"step": 591
},
{
"epoch": 1.9505766062602965,
"grad_norm": 0.5910043256950622,
"learning_rate": 1.627298052366111e-08,
"loss": 0.0533,
"step": 592
},
{
"epoch": 1.9538714991762767,
"grad_norm": 0.37046514170506584,
"learning_rate": 1.4032343604177267e-08,
"loss": 0.0436,
"step": 593
},
{
"epoch": 1.9571663920922568,
"grad_norm": 0.45020576348906904,
"learning_rate": 1.1957380927816176e-08,
"loss": 0.0392,
"step": 594
},
{
"epoch": 1.9604612850082372,
"grad_norm": 0.4456283814258553,
"learning_rate": 1.0048161441649217e-08,
"loss": 0.0464,
"step": 595
},
{
"epoch": 1.9637561779242174,
"grad_norm": 0.5191136185767354,
"learning_rate": 8.304748585417077e-09,
"loss": 0.0432,
"step": 596
},
{
"epoch": 1.9670510708401978,
"grad_norm": 0.7541935440652175,
"learning_rate": 6.72720028942031e-09,
"loss": 0.0417,
"step": 597
},
{
"epoch": 1.970345963756178,
"grad_norm": 0.4930059149169314,
"learning_rate": 5.315568972594775e-09,
"loss": 0.0522,
"step": 598
},
{
"epoch": 1.9736408566721582,
"grad_norm": 0.43182626877243907,
"learning_rate": 4.0699015407702495e-09,
"loss": 0.0426,
"step": 599
},
{
"epoch": 1.9769357495881383,
"grad_norm": 0.5518177594295087,
"learning_rate": 2.990239385112226e-09,
"loss": 0.0565,
"step": 600
},
{
"epoch": 1.9802306425041185,
"grad_norm": 0.5188892218788691,
"learning_rate": 2.076618380744133e-09,
"loss": 0.0684,
"step": 601
},
{
"epoch": 1.9835255354200987,
"grad_norm": 0.43199397069665996,
"learning_rate": 1.3290688855588374e-09,
"loss": 0.0396,
"step": 602
},
{
"epoch": 1.986820428336079,
"grad_norm": 0.4470921544911916,
"learning_rate": 7.476157392072303e-10,
"loss": 0.0385,
"step": 603
},
{
"epoch": 1.9901153212520593,
"grad_norm": 0.5332848840635471,
"learning_rate": 3.322782622738885e-10,
"loss": 0.0585,
"step": 604
},
{
"epoch": 1.9934102141680397,
"grad_norm": 0.6284366325020901,
"learning_rate": 8.307025563536464e-11,
"loss": 0.0517,
"step": 605
},
{
"epoch": 1.9967051070840198,
"grad_norm": 0.6056246452486423,
"learning_rate": 0.0,
"loss": 0.048,
"step": 606
},
{
"epoch": 1.9967051070840198,
"eval_loss": 0.0587012954056263,
"eval_runtime": 144.0131,
"eval_samples_per_second": 35.448,
"eval_steps_per_second": 1.111,
"step": 606
},
{
"epoch": 1.9967051070840198,
"step": 606,
"total_flos": 1.811911707237417e+17,
"train_loss": 0.0716345354195426,
"train_runtime": 6724.9995,
"train_samples_per_second": 11.537,
"train_steps_per_second": 0.09
}
],
"logging_steps": 1,
"max_steps": 606,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.811911707237417e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}