{
"best_metric": 0.32943063974380493,
"best_model_checkpoint": "miner_id_24/checkpoint-350",
"epoch": 2.908713692946058,
"eval_steps": 50,
"global_step": 350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008298755186721992,
"grad_norm": 11.30918025970459,
"learning_rate": 1e-05,
"loss": 4.6137,
"step": 1
},
{
"epoch": 0.008298755186721992,
"eval_loss": 5.352054595947266,
"eval_runtime": 7.0594,
"eval_samples_per_second": 28.756,
"eval_steps_per_second": 7.224,
"step": 1
},
{
"epoch": 0.016597510373443983,
"grad_norm": 13.656156539916992,
"learning_rate": 2e-05,
"loss": 4.4637,
"step": 2
},
{
"epoch": 0.024896265560165973,
"grad_norm": 12.059760093688965,
"learning_rate": 3e-05,
"loss": 4.3758,
"step": 3
},
{
"epoch": 0.03319502074688797,
"grad_norm": 10.070697784423828,
"learning_rate": 4e-05,
"loss": 4.6825,
"step": 4
},
{
"epoch": 0.04149377593360996,
"grad_norm": 10.277752876281738,
"learning_rate": 5e-05,
"loss": 4.4531,
"step": 5
},
{
"epoch": 0.04979253112033195,
"grad_norm": 8.195782661437988,
"learning_rate": 6e-05,
"loss": 3.9268,
"step": 6
},
{
"epoch": 0.058091286307053944,
"grad_norm": 9.387673377990723,
"learning_rate": 7e-05,
"loss": 3.7347,
"step": 7
},
{
"epoch": 0.06639004149377593,
"grad_norm": 8.870927810668945,
"learning_rate": 8e-05,
"loss": 3.4846,
"step": 8
},
{
"epoch": 0.07468879668049792,
"grad_norm": 6.948357582092285,
"learning_rate": 9e-05,
"loss": 3.3592,
"step": 9
},
{
"epoch": 0.08298755186721991,
"grad_norm": 5.063608169555664,
"learning_rate": 0.0001,
"loss": 2.8956,
"step": 10
},
{
"epoch": 0.0912863070539419,
"grad_norm": 14.618287086486816,
"learning_rate": 9.999799726899262e-05,
"loss": 2.92,
"step": 11
},
{
"epoch": 0.0995850622406639,
"grad_norm": 18.60139274597168,
"learning_rate": 9.999198923640774e-05,
"loss": 3.0395,
"step": 12
},
{
"epoch": 0.1078838174273859,
"grad_norm": 16.003385543823242,
"learning_rate": 9.998197638354428e-05,
"loss": 2.8771,
"step": 13
},
{
"epoch": 0.11618257261410789,
"grad_norm": 7.36771297454834,
"learning_rate": 9.996795951252427e-05,
"loss": 2.5778,
"step": 14
},
{
"epoch": 0.12448132780082988,
"grad_norm": 7.226404190063477,
"learning_rate": 9.994993974622862e-05,
"loss": 2.9167,
"step": 15
},
{
"epoch": 0.13278008298755187,
"grad_norm": 9.766325950622559,
"learning_rate": 9.992791852820709e-05,
"loss": 2.9431,
"step": 16
},
{
"epoch": 0.14107883817427386,
"grad_norm": 5.852901935577393,
"learning_rate": 9.990189762256276e-05,
"loss": 2.3363,
"step": 17
},
{
"epoch": 0.14937759336099585,
"grad_norm": 5.548067569732666,
"learning_rate": 9.987187911381059e-05,
"loss": 2.237,
"step": 18
},
{
"epoch": 0.15767634854771784,
"grad_norm": 6.030773639678955,
"learning_rate": 9.983786540671051e-05,
"loss": 2.1454,
"step": 19
},
{
"epoch": 0.16597510373443983,
"grad_norm": 5.069052696228027,
"learning_rate": 9.979985922607475e-05,
"loss": 1.971,
"step": 20
},
{
"epoch": 0.17427385892116182,
"grad_norm": 5.834778308868408,
"learning_rate": 9.97578636165496e-05,
"loss": 2.5553,
"step": 21
},
{
"epoch": 0.1825726141078838,
"grad_norm": 4.660792350769043,
"learning_rate": 9.97118819423714e-05,
"loss": 2.2076,
"step": 22
},
{
"epoch": 0.1908713692946058,
"grad_norm": 4.532413959503174,
"learning_rate": 9.966191788709716e-05,
"loss": 2.1984,
"step": 23
},
{
"epoch": 0.1991701244813278,
"grad_norm": 6.078835964202881,
"learning_rate": 9.960797545330936e-05,
"loss": 2.5873,
"step": 24
},
{
"epoch": 0.2074688796680498,
"grad_norm": 5.118812084197998,
"learning_rate": 9.955005896229543e-05,
"loss": 2.7898,
"step": 25
},
{
"epoch": 0.2157676348547718,
"grad_norm": 5.2867231369018555,
"learning_rate": 9.948817305370143e-05,
"loss": 2.2812,
"step": 26
},
{
"epoch": 0.22406639004149378,
"grad_norm": 4.749370098114014,
"learning_rate": 9.94223226851605e-05,
"loss": 2.4715,
"step": 27
},
{
"epoch": 0.23236514522821577,
"grad_norm": 4.7184977531433105,
"learning_rate": 9.935251313189564e-05,
"loss": 2.3676,
"step": 28
},
{
"epoch": 0.24066390041493776,
"grad_norm": 5.203877925872803,
"learning_rate": 9.927874998629714e-05,
"loss": 2.4597,
"step": 29
},
{
"epoch": 0.24896265560165975,
"grad_norm": 7.009748935699463,
"learning_rate": 9.92010391574745e-05,
"loss": 2.7419,
"step": 30
},
{
"epoch": 0.2572614107883817,
"grad_norm": 4.794604301452637,
"learning_rate": 9.911938687078324e-05,
"loss": 2.4883,
"step": 31
},
{
"epoch": 0.26556016597510373,
"grad_norm": 4.864479064941406,
"learning_rate": 9.9033799667326e-05,
"loss": 2.2448,
"step": 32
},
{
"epoch": 0.27385892116182575,
"grad_norm": 4.177216529846191,
"learning_rate": 9.89442844034286e-05,
"loss": 2.1125,
"step": 33
},
{
"epoch": 0.2821576763485477,
"grad_norm": 3.661107063293457,
"learning_rate": 9.885084825009086e-05,
"loss": 1.8155,
"step": 34
},
{
"epoch": 0.29045643153526973,
"grad_norm": 4.322318077087402,
"learning_rate": 9.875349869241201e-05,
"loss": 2.1389,
"step": 35
},
{
"epoch": 0.2987551867219917,
"grad_norm": 4.690096378326416,
"learning_rate": 9.865224352899119e-05,
"loss": 2.0913,
"step": 36
},
{
"epoch": 0.3070539419087137,
"grad_norm": 4.660186767578125,
"learning_rate": 9.85470908713026e-05,
"loss": 2.1136,
"step": 37
},
{
"epoch": 0.3153526970954357,
"grad_norm": 4.324281215667725,
"learning_rate": 9.843804914304577e-05,
"loss": 2.2243,
"step": 38
},
{
"epoch": 0.3236514522821577,
"grad_norm": 4.355935096740723,
"learning_rate": 9.83251270794707e-05,
"loss": 2.0655,
"step": 39
},
{
"epoch": 0.33195020746887965,
"grad_norm": 4.537855625152588,
"learning_rate": 9.820833372667812e-05,
"loss": 2.4219,
"step": 40
},
{
"epoch": 0.34024896265560167,
"grad_norm": 3.879166603088379,
"learning_rate": 9.80876784408948e-05,
"loss": 2.0775,
"step": 41
},
{
"epoch": 0.34854771784232363,
"grad_norm": 4.542738437652588,
"learning_rate": 9.796317088772403e-05,
"loss": 2.3691,
"step": 42
},
{
"epoch": 0.35684647302904565,
"grad_norm": 3.76499342918396,
"learning_rate": 9.783482104137127e-05,
"loss": 1.6421,
"step": 43
},
{
"epoch": 0.3651452282157676,
"grad_norm": 4.638901233673096,
"learning_rate": 9.770263918384524e-05,
"loss": 2.1609,
"step": 44
},
{
"epoch": 0.37344398340248963,
"grad_norm": 4.964886665344238,
"learning_rate": 9.75666359041341e-05,
"loss": 1.5742,
"step": 45
},
{
"epoch": 0.3817427385892116,
"grad_norm": 4.453831672668457,
"learning_rate": 9.742682209735727e-05,
"loss": 2.2456,
"step": 46
},
{
"epoch": 0.3900414937759336,
"grad_norm": 5.158775806427002,
"learning_rate": 9.728320896389263e-05,
"loss": 2.1847,
"step": 47
},
{
"epoch": 0.3983402489626556,
"grad_norm": 5.11721134185791,
"learning_rate": 9.713580800847916e-05,
"loss": 1.7405,
"step": 48
},
{
"epoch": 0.4066390041493776,
"grad_norm": 5.306658744812012,
"learning_rate": 9.698463103929542e-05,
"loss": 1.6412,
"step": 49
},
{
"epoch": 0.4149377593360996,
"grad_norm": 5.626243591308594,
"learning_rate": 9.682969016701358e-05,
"loss": 1.9538,
"step": 50
},
{
"epoch": 0.4149377593360996,
"eval_loss": 1.9951374530792236,
"eval_runtime": 7.1371,
"eval_samples_per_second": 28.443,
"eval_steps_per_second": 7.146,
"step": 50
},
{
"epoch": 0.42323651452282157,
"grad_norm": 5.736331462860107,
"learning_rate": 9.66709978038292e-05,
"loss": 2.2465,
"step": 51
},
{
"epoch": 0.4315352697095436,
"grad_norm": 5.308639049530029,
"learning_rate": 9.650856666246693e-05,
"loss": 2.1472,
"step": 52
},
{
"epoch": 0.43983402489626555,
"grad_norm": 4.070306301116943,
"learning_rate": 9.634240975516209e-05,
"loss": 1.5106,
"step": 53
},
{
"epoch": 0.44813278008298757,
"grad_norm": 4.719001293182373,
"learning_rate": 9.617254039261834e-05,
"loss": 1.5839,
"step": 54
},
{
"epoch": 0.45643153526970953,
"grad_norm": 8.284409523010254,
"learning_rate": 9.599897218294122e-05,
"loss": 1.9048,
"step": 55
},
{
"epoch": 0.46473029045643155,
"grad_norm": 5.499932765960693,
"learning_rate": 9.582171903054816e-05,
"loss": 2.0164,
"step": 56
},
{
"epoch": 0.4730290456431535,
"grad_norm": 4.232677936553955,
"learning_rate": 9.564079513505455e-05,
"loss": 1.6968,
"step": 57
},
{
"epoch": 0.48132780082987553,
"grad_norm": 6.6102776527404785,
"learning_rate": 9.54562149901362e-05,
"loss": 2.1707,
"step": 58
},
{
"epoch": 0.4896265560165975,
"grad_norm": 5.499170303344727,
"learning_rate": 9.526799338236827e-05,
"loss": 2.1749,
"step": 59
},
{
"epoch": 0.4979253112033195,
"grad_norm": 6.410165309906006,
"learning_rate": 9.507614539004082e-05,
"loss": 2.0269,
"step": 60
},
{
"epoch": 0.5062240663900415,
"grad_norm": 4.395339488983154,
"learning_rate": 9.48806863819507e-05,
"loss": 1.5799,
"step": 61
},
{
"epoch": 0.5145228215767634,
"grad_norm": 5.214993953704834,
"learning_rate": 9.468163201617062e-05,
"loss": 1.4865,
"step": 62
},
{
"epoch": 0.5228215767634855,
"grad_norm": 7.040699005126953,
"learning_rate": 9.447899823879456e-05,
"loss": 2.0153,
"step": 63
},
{
"epoch": 0.5311203319502075,
"grad_norm": 5.472765922546387,
"learning_rate": 9.42728012826605e-05,
"loss": 1.9123,
"step": 64
},
{
"epoch": 0.5394190871369294,
"grad_norm": 5.13813591003418,
"learning_rate": 9.406305766604995e-05,
"loss": 1.8829,
"step": 65
},
{
"epoch": 0.5477178423236515,
"grad_norm": 5.717247486114502,
"learning_rate": 9.384978419136468e-05,
"loss": 2.0948,
"step": 66
},
{
"epoch": 0.5560165975103735,
"grad_norm": 5.990386962890625,
"learning_rate": 9.363299794378073e-05,
"loss": 1.8932,
"step": 67
},
{
"epoch": 0.5643153526970954,
"grad_norm": 7.660478591918945,
"learning_rate": 9.341271628987968e-05,
"loss": 2.3816,
"step": 68
},
{
"epoch": 0.5726141078838174,
"grad_norm": 4.8774027824401855,
"learning_rate": 9.318895687625751e-05,
"loss": 1.6324,
"step": 69
},
{
"epoch": 0.5809128630705395,
"grad_norm": 5.0475568771362305,
"learning_rate": 9.296173762811085e-05,
"loss": 1.5918,
"step": 70
},
{
"epoch": 0.5892116182572614,
"grad_norm": 4.739609718322754,
"learning_rate": 9.273107674780102e-05,
"loss": 1.3994,
"step": 71
},
{
"epoch": 0.5975103734439834,
"grad_norm": 6.396079063415527,
"learning_rate": 9.249699271339593e-05,
"loss": 1.8884,
"step": 72
},
{
"epoch": 0.6058091286307054,
"grad_norm": 6.702268600463867,
"learning_rate": 9.225950427718975e-05,
"loss": 2.0607,
"step": 73
},
{
"epoch": 0.6141078838174274,
"grad_norm": 4.683436393737793,
"learning_rate": 9.201863046420065e-05,
"loss": 1.458,
"step": 74
},
{
"epoch": 0.6224066390041494,
"grad_norm": 5.798123359680176,
"learning_rate": 9.177439057064683e-05,
"loss": 1.9928,
"step": 75
},
{
"epoch": 0.6307053941908713,
"grad_norm": 6.20752477645874,
"learning_rate": 9.152680416240059e-05,
"loss": 2.0453,
"step": 76
},
{
"epoch": 0.6390041493775933,
"grad_norm": 4.803196430206299,
"learning_rate": 9.1275891073421e-05,
"loss": 1.6836,
"step": 77
},
{
"epoch": 0.6473029045643154,
"grad_norm": 5.217916011810303,
"learning_rate": 9.102167140416503e-05,
"loss": 1.5624,
"step": 78
},
{
"epoch": 0.6556016597510373,
"grad_norm": 5.689030170440674,
"learning_rate": 9.076416551997721e-05,
"loss": 1.2365,
"step": 79
},
{
"epoch": 0.6639004149377593,
"grad_norm": 5.20029878616333,
"learning_rate": 9.050339404945833e-05,
"loss": 1.5095,
"step": 80
},
{
"epoch": 0.6721991701244814,
"grad_norm": 7.432442665100098,
"learning_rate": 9.023937788281278e-05,
"loss": 2.1264,
"step": 81
},
{
"epoch": 0.6804979253112033,
"grad_norm": 5.681693077087402,
"learning_rate": 8.997213817017507e-05,
"loss": 1.5059,
"step": 82
},
{
"epoch": 0.6887966804979253,
"grad_norm": 6.367238521575928,
"learning_rate": 8.970169631991556e-05,
"loss": 1.5183,
"step": 83
},
{
"epoch": 0.6970954356846473,
"grad_norm": 7.021786212921143,
"learning_rate": 8.942807399692543e-05,
"loss": 1.8657,
"step": 84
},
{
"epoch": 0.7053941908713693,
"grad_norm": 6.1124587059021,
"learning_rate": 8.915129312088112e-05,
"loss": 1.7342,
"step": 85
},
{
"epoch": 0.7136929460580913,
"grad_norm": 6.746445655822754,
"learning_rate": 8.88713758644883e-05,
"loss": 1.7254,
"step": 86
},
{
"epoch": 0.7219917012448133,
"grad_norm": 5.889866828918457,
"learning_rate": 8.858834465170576e-05,
"loss": 1.6482,
"step": 87
},
{
"epoch": 0.7302904564315352,
"grad_norm": 6.273972511291504,
"learning_rate": 8.83022221559489e-05,
"loss": 1.5594,
"step": 88
},
{
"epoch": 0.7385892116182573,
"grad_norm": 8.849343299865723,
"learning_rate": 8.801303129827352e-05,
"loss": 1.4801,
"step": 89
},
{
"epoch": 0.7468879668049793,
"grad_norm": 6.7875566482543945,
"learning_rate": 8.772079524553951e-05,
"loss": 1.6455,
"step": 90
},
{
"epoch": 0.7551867219917012,
"grad_norm": 5.853145599365234,
"learning_rate": 8.742553740855506e-05,
"loss": 1.3349,
"step": 91
},
{
"epoch": 0.7634854771784232,
"grad_norm": 7.308794021606445,
"learning_rate": 8.712728144020118e-05,
"loss": 1.8087,
"step": 92
},
{
"epoch": 0.7717842323651453,
"grad_norm": 7.031892776489258,
"learning_rate": 8.682605123353685e-05,
"loss": 1.3419,
"step": 93
},
{
"epoch": 0.7800829875518672,
"grad_norm": 6.652479648590088,
"learning_rate": 8.652187091988517e-05,
"loss": 1.645,
"step": 94
},
{
"epoch": 0.7883817427385892,
"grad_norm": 8.24259090423584,
"learning_rate": 8.621476486689992e-05,
"loss": 1.6243,
"step": 95
},
{
"epoch": 0.7966804979253111,
"grad_norm": 5.5381975173950195,
"learning_rate": 8.59047576766137e-05,
"loss": 1.282,
"step": 96
},
{
"epoch": 0.8049792531120332,
"grad_norm": 6.3423309326171875,
"learning_rate": 8.559187418346703e-05,
"loss": 1.3701,
"step": 97
},
{
"epoch": 0.8132780082987552,
"grad_norm": 6.963042736053467,
"learning_rate": 8.527613945231885e-05,
"loss": 1.1813,
"step": 98
},
{
"epoch": 0.8215767634854771,
"grad_norm": 8.486031532287598,
"learning_rate": 8.495757877643858e-05,
"loss": 1.5802,
"step": 99
},
{
"epoch": 0.8298755186721992,
"grad_norm": 8.076821327209473,
"learning_rate": 8.463621767547998e-05,
"loss": 1.6718,
"step": 100
},
{
"epoch": 0.8298755186721992,
"eval_loss": 1.3937488794326782,
"eval_runtime": 7.2083,
"eval_samples_per_second": 28.162,
"eval_steps_per_second": 7.075,
"step": 100
},
{
"epoch": 0.8381742738589212,
"grad_norm": 4.582700252532959,
"learning_rate": 8.43120818934367e-05,
"loss": 1.2412,
"step": 101
},
{
"epoch": 0.8464730290456431,
"grad_norm": 11.859997749328613,
"learning_rate": 8.398519739657996e-05,
"loss": 1.9913,
"step": 102
},
{
"epoch": 0.8547717842323651,
"grad_norm": 7.4468841552734375,
"learning_rate": 8.365559037137852e-05,
"loss": 1.9243,
"step": 103
},
{
"epoch": 0.8630705394190872,
"grad_norm": 7.206920146942139,
"learning_rate": 8.332328722240073e-05,
"loss": 1.7109,
"step": 104
},
{
"epoch": 0.8713692946058091,
"grad_norm": 6.83170223236084,
"learning_rate": 8.298831457019942e-05,
"loss": 1.3447,
"step": 105
},
{
"epoch": 0.8796680497925311,
"grad_norm": 7.21415376663208,
"learning_rate": 8.265069924917925e-05,
"loss": 1.4393,
"step": 106
},
{
"epoch": 0.8879668049792531,
"grad_norm": 9.88570785522461,
"learning_rate": 8.231046830544716e-05,
"loss": 2.0253,
"step": 107
},
{
"epoch": 0.8962655601659751,
"grad_norm": 6.444344520568848,
"learning_rate": 8.196764899464551e-05,
"loss": 1.3066,
"step": 108
},
{
"epoch": 0.9045643153526971,
"grad_norm": 5.2003326416015625,
"learning_rate": 8.162226877976887e-05,
"loss": 0.9973,
"step": 109
},
{
"epoch": 0.9128630705394191,
"grad_norm": 5.82247257232666,
"learning_rate": 8.127435532896388e-05,
"loss": 1.2795,
"step": 110
},
{
"epoch": 0.921161825726141,
"grad_norm": 8.667819023132324,
"learning_rate": 8.092393651331275e-05,
"loss": 1.9045,
"step": 111
},
{
"epoch": 0.9294605809128631,
"grad_norm": 7.90700626373291,
"learning_rate": 8.057104040460062e-05,
"loss": 1.3316,
"step": 112
},
{
"epoch": 0.9377593360995851,
"grad_norm": 5.512808322906494,
"learning_rate": 8.021569527306662e-05,
"loss": 1.0711,
"step": 113
},
{
"epoch": 0.946058091286307,
"grad_norm": 8.198836326599121,
"learning_rate": 7.985792958513931e-05,
"loss": 1.7327,
"step": 114
},
{
"epoch": 0.9543568464730291,
"grad_norm": 9.416217803955078,
"learning_rate": 7.949777200115616e-05,
"loss": 1.6381,
"step": 115
},
{
"epoch": 0.9626556016597511,
"grad_norm": 7.902495384216309,
"learning_rate": 7.913525137306756e-05,
"loss": 1.4719,
"step": 116
},
{
"epoch": 0.970954356846473,
"grad_norm": 6.16163444519043,
"learning_rate": 7.877039674212569e-05,
"loss": 1.1846,
"step": 117
},
{
"epoch": 0.979253112033195,
"grad_norm": 7.932492256164551,
"learning_rate": 7.840323733655778e-05,
"loss": 1.3338,
"step": 118
},
{
"epoch": 0.9875518672199171,
"grad_norm": 6.864604949951172,
"learning_rate": 7.803380256922495e-05,
"loss": 1.2608,
"step": 119
},
{
"epoch": 0.995850622406639,
"grad_norm": 8.558213233947754,
"learning_rate": 7.76621220352657e-05,
"loss": 1.271,
"step": 120
},
{
"epoch": 1.0062240663900415,
"grad_norm": 14.87498664855957,
"learning_rate": 7.728822550972523e-05,
"loss": 1.9047,
"step": 121
},
{
"epoch": 1.0145228215767634,
"grad_norm": 4.546281337738037,
"learning_rate": 7.69121429451702e-05,
"loss": 0.6296,
"step": 122
},
{
"epoch": 1.0228215767634854,
"grad_norm": 5.022426128387451,
"learning_rate": 7.653390446928909e-05,
"loss": 0.7982,
"step": 123
},
{
"epoch": 1.0311203319502074,
"grad_norm": 6.32510232925415,
"learning_rate": 7.615354038247888e-05,
"loss": 0.895,
"step": 124
},
{
"epoch": 1.0394190871369295,
"grad_norm": 5.873676300048828,
"learning_rate": 7.577108115541761e-05,
"loss": 0.7959,
"step": 125
},
{
"epoch": 1.0477178423236515,
"grad_norm": 6.671182632446289,
"learning_rate": 7.53865574266234e-05,
"loss": 0.7174,
"step": 126
},
{
"epoch": 1.0560165975103735,
"grad_norm": 5.916552543640137,
"learning_rate": 7.500000000000001e-05,
"loss": 0.516,
"step": 127
},
{
"epoch": 1.0643153526970954,
"grad_norm": 5.885247230529785,
"learning_rate": 7.461143984236924e-05,
"loss": 0.5581,
"step": 128
},
{
"epoch": 1.0726141078838174,
"grad_norm": 5.941064834594727,
"learning_rate": 7.422090808099014e-05,
"loss": 0.5006,
"step": 129
},
{
"epoch": 1.0809128630705394,
"grad_norm": 6.601113319396973,
"learning_rate": 7.38284360010654e-05,
"loss": 0.8121,
"step": 130
},
{
"epoch": 1.0892116182572613,
"grad_norm": 6.470757484436035,
"learning_rate": 7.343405504323519e-05,
"loss": 0.7346,
"step": 131
},
{
"epoch": 1.0975103734439835,
"grad_norm": 5.690014839172363,
"learning_rate": 7.303779680105843e-05,
"loss": 0.5846,
"step": 132
},
{
"epoch": 1.1058091286307055,
"grad_norm": 7.497930526733398,
"learning_rate": 7.263969301848188e-05,
"loss": 1.1355,
"step": 133
},
{
"epoch": 1.1141078838174274,
"grad_norm": 5.633259296417236,
"learning_rate": 7.223977558729706e-05,
"loss": 0.4784,
"step": 134
},
{
"epoch": 1.1224066390041494,
"grad_norm": 5.755221843719482,
"learning_rate": 7.183807654458564e-05,
"loss": 0.6109,
"step": 135
},
{
"epoch": 1.1307053941908713,
"grad_norm": 5.960432052612305,
"learning_rate": 7.143462807015271e-05,
"loss": 0.7443,
"step": 136
},
{
"epoch": 1.1390041493775933,
"grad_norm": 6.3896403312683105,
"learning_rate": 7.102946248394909e-05,
"loss": 0.7038,
"step": 137
},
{
"epoch": 1.1473029045643153,
"grad_norm": 4.847652435302734,
"learning_rate": 7.062261224348203e-05,
"loss": 0.4339,
"step": 138
},
{
"epoch": 1.1556016597510372,
"grad_norm": 3.1685574054718018,
"learning_rate": 7.021410994121525e-05,
"loss": 0.2151,
"step": 139
},
{
"epoch": 1.1639004149377594,
"grad_norm": 4.961888790130615,
"learning_rate": 6.980398830195785e-05,
"loss": 0.4,
"step": 140
},
{
"epoch": 1.1721991701244814,
"grad_norm": 5.239101886749268,
"learning_rate": 6.939228018024275e-05,
"loss": 0.6228,
"step": 141
},
{
"epoch": 1.1804979253112033,
"grad_norm": 5.622098922729492,
"learning_rate": 6.897901855769483e-05,
"loss": 0.3817,
"step": 142
},
{
"epoch": 1.1887966804979253,
"grad_norm": 4.465243816375732,
"learning_rate": 6.856423654038868e-05,
"loss": 0.4005,
"step": 143
},
{
"epoch": 1.1970954356846473,
"grad_norm": 6.165435791015625,
"learning_rate": 6.814796735619663e-05,
"loss": 0.4155,
"step": 144
},
{
"epoch": 1.2053941908713692,
"grad_norm": 8.859424591064453,
"learning_rate": 6.773024435212678e-05,
"loss": 0.8074,
"step": 145
},
{
"epoch": 1.2136929460580912,
"grad_norm": 6.36492919921875,
"learning_rate": 6.731110099165164e-05,
"loss": 0.4823,
"step": 146
},
{
"epoch": 1.2219917012448134,
"grad_norm": 5.6012678146362305,
"learning_rate": 6.689057085202737e-05,
"loss": 0.4968,
"step": 147
},
{
"epoch": 1.2302904564315353,
"grad_norm": 7.916935920715332,
"learning_rate": 6.646868762160399e-05,
"loss": 0.5695,
"step": 148
},
{
"epoch": 1.2385892116182573,
"grad_norm": 5.010058403015137,
"learning_rate": 6.604548509712658e-05,
"loss": 0.3488,
"step": 149
},
{
"epoch": 1.2468879668049793,
"grad_norm": 9.163707733154297,
"learning_rate": 6.562099718102788e-05,
"loss": 0.6332,
"step": 150
},
{
"epoch": 1.2468879668049793,
"eval_loss": 0.9936810731887817,
"eval_runtime": 7.1349,
"eval_samples_per_second": 28.452,
"eval_steps_per_second": 7.148,
"step": 150
},
{
"epoch": 1.2551867219917012,
"grad_norm": 7.536319255828857,
"learning_rate": 6.519525787871235e-05,
"loss": 0.6467,
"step": 151
},
{
"epoch": 1.2634854771784232,
"grad_norm": 7.516266345977783,
"learning_rate": 6.476830129583206e-05,
"loss": 0.5713,
"step": 152
},
{
"epoch": 1.2717842323651452,
"grad_norm": 7.401801109313965,
"learning_rate": 6.434016163555452e-05,
"loss": 0.6164,
"step": 153
},
{
"epoch": 1.2800829875518671,
"grad_norm": 6.253715515136719,
"learning_rate": 6.391087319582264e-05,
"loss": 0.5391,
"step": 154
},
{
"epoch": 1.288381742738589,
"grad_norm": 6.459432125091553,
"learning_rate": 6.34804703666072e-05,
"loss": 0.3725,
"step": 155
},
{
"epoch": 1.2966804979253113,
"grad_norm": 8.033556938171387,
"learning_rate": 6.304898762715186e-05,
"loss": 0.578,
"step": 156
},
{
"epoch": 1.3049792531120332,
"grad_norm": 6.300682067871094,
"learning_rate": 6.261645954321109e-05,
"loss": 0.4584,
"step": 157
},
{
"epoch": 1.3132780082987552,
"grad_norm": 6.387180328369141,
"learning_rate": 6.21829207642811e-05,
"loss": 0.4923,
"step": 158
},
{
"epoch": 1.3215767634854771,
"grad_norm": 7.160585880279541,
"learning_rate": 6.174840602082412e-05,
"loss": 0.5705,
"step": 159
},
{
"epoch": 1.329875518672199,
"grad_norm": 6.173452854156494,
"learning_rate": 6.131295012148612e-05,
"loss": 0.4962,
"step": 160
},
{
"epoch": 1.3381742738589213,
"grad_norm": 4.847928047180176,
"learning_rate": 6.087658795030837e-05,
"loss": 0.3703,
"step": 161
},
{
"epoch": 1.3464730290456433,
"grad_norm": 7.544972896575928,
"learning_rate": 6.043935446393294e-05,
"loss": 0.433,
"step": 162
},
{
"epoch": 1.3547717842323652,
"grad_norm": 6.9235358238220215,
"learning_rate": 6.0001284688802226e-05,
"loss": 0.6781,
"step": 163
},
{
"epoch": 1.3630705394190872,
"grad_norm": 7.10660457611084,
"learning_rate": 5.956241371835312e-05,
"loss": 0.5983,
"step": 164
},
{
"epoch": 1.3713692946058091,
"grad_norm": 4.3860955238342285,
"learning_rate": 5.912277671020564e-05,
"loss": 0.3219,
"step": 165
},
{
"epoch": 1.379668049792531,
"grad_norm": 6.816618919372559,
"learning_rate": 5.868240888334653e-05,
"loss": 0.4972,
"step": 166
},
{
"epoch": 1.387966804979253,
"grad_norm": 5.665396690368652,
"learning_rate": 5.824134551530783e-05,
"loss": 0.4513,
"step": 167
},
{
"epoch": 1.396265560165975,
"grad_norm": 8.053374290466309,
"learning_rate": 5.7799621939340896e-05,
"loss": 0.3688,
"step": 168
},
{
"epoch": 1.404564315352697,
"grad_norm": 5.979791164398193,
"learning_rate": 5.735727354158581e-05,
"loss": 0.3598,
"step": 169
},
{
"epoch": 1.412863070539419,
"grad_norm": 4.926863193511963,
"learning_rate": 5.691433575823666e-05,
"loss": 0.3195,
"step": 170
},
{
"epoch": 1.4211618257261411,
"grad_norm": 6.836885452270508,
"learning_rate": 5.6470844072702764e-05,
"loss": 0.6218,
"step": 171
},
{
"epoch": 1.429460580912863,
"grad_norm": 7.920914649963379,
"learning_rate": 5.602683401276615e-05,
"loss": 0.4448,
"step": 172
},
{
"epoch": 1.437759336099585,
"grad_norm": 3.9926419258117676,
"learning_rate": 5.55823411477354e-05,
"loss": 0.208,
"step": 173
},
{
"epoch": 1.446058091286307,
"grad_norm": 5.798760414123535,
"learning_rate": 5.513740108559622e-05,
"loss": 0.3248,
"step": 174
},
{
"epoch": 1.454356846473029,
"grad_norm": 7.048202991485596,
"learning_rate": 5.469204947015897e-05,
"loss": 0.4915,
"step": 175
},
{
"epoch": 1.4626556016597512,
"grad_norm": 4.450428485870361,
"learning_rate": 5.424632197820324e-05,
"loss": 0.2565,
"step": 176
},
{
"epoch": 1.4709543568464731,
"grad_norm": 7.559440612792969,
"learning_rate": 5.3800254316619806e-05,
"loss": 0.5286,
"step": 177
},
{
"epoch": 1.479253112033195,
"grad_norm": 7.2518086433410645,
"learning_rate": 5.335388221955012e-05,
"loss": 0.5451,
"step": 178
},
{
"epoch": 1.487551867219917,
"grad_norm": 5.655428886413574,
"learning_rate": 5.290724144552379e-05,
"loss": 0.3642,
"step": 179
},
{
"epoch": 1.495850622406639,
"grad_norm": 6.303919792175293,
"learning_rate": 5.246036777459391e-05,
"loss": 0.4304,
"step": 180
},
{
"epoch": 1.504149377593361,
"grad_norm": 9.382453918457031,
"learning_rate": 5.201329700547076e-05,
"loss": 0.5932,
"step": 181
},
{
"epoch": 1.512448132780083,
"grad_norm": 5.657342910766602,
"learning_rate": 5.1566064952654014e-05,
"loss": 0.431,
"step": 182
},
{
"epoch": 1.520746887966805,
"grad_norm": 5.608704566955566,
"learning_rate": 5.1118707443563665e-05,
"loss": 0.4258,
"step": 183
},
{
"epoch": 1.5290456431535269,
"grad_norm": 6.01896333694458,
"learning_rate": 5.0671260315669875e-05,
"loss": 0.3907,
"step": 184
},
{
"epoch": 1.5373443983402488,
"grad_norm": 8.329726219177246,
"learning_rate": 5.022375941362217e-05,
"loss": 0.3328,
"step": 185
},
{
"epoch": 1.5456431535269708,
"grad_norm": 6.260307788848877,
"learning_rate": 4.977624058637783e-05,
"loss": 0.3374,
"step": 186
},
{
"epoch": 1.553941908713693,
"grad_norm": 6.407407760620117,
"learning_rate": 4.9328739684330137e-05,
"loss": 0.432,
"step": 187
},
{
"epoch": 1.562240663900415,
"grad_norm": 5.673023700714111,
"learning_rate": 4.8881292556436354e-05,
"loss": 0.2581,
"step": 188
},
{
"epoch": 1.570539419087137,
"grad_norm": 6.290989875793457,
"learning_rate": 4.8433935047346e-05,
"loss": 0.387,
"step": 189
},
{
"epoch": 1.578838174273859,
"grad_norm": 5.269549369812012,
"learning_rate": 4.798670299452926e-05,
"loss": 0.3035,
"step": 190
},
{
"epoch": 1.587136929460581,
"grad_norm": 7.64688777923584,
"learning_rate": 4.7539632225406095e-05,
"loss": 0.3908,
"step": 191
},
{
"epoch": 1.595435684647303,
"grad_norm": 11.037405014038086,
"learning_rate": 4.709275855447621e-05,
"loss": 0.8535,
"step": 192
},
{
"epoch": 1.603734439834025,
"grad_norm": 5.834881782531738,
"learning_rate": 4.6646117780449876e-05,
"loss": 0.2963,
"step": 193
},
{
"epoch": 1.612033195020747,
"grad_norm": 5.2544403076171875,
"learning_rate": 4.6199745683380206e-05,
"loss": 0.2959,
"step": 194
},
{
"epoch": 1.620331950207469,
"grad_norm": 8.141179084777832,
"learning_rate": 4.5753678021796755e-05,
"loss": 0.634,
"step": 195
},
{
"epoch": 1.6286307053941909,
"grad_norm": 4.7402191162109375,
"learning_rate": 4.530795052984104e-05,
"loss": 0.2887,
"step": 196
},
{
"epoch": 1.6369294605809128,
"grad_norm": 6.756740093231201,
"learning_rate": 4.48625989144038e-05,
"loss": 0.4239,
"step": 197
},
{
"epoch": 1.6452282157676348,
"grad_norm": 6.466264247894287,
"learning_rate": 4.4417658852264614e-05,
"loss": 0.2381,
"step": 198
},
{
"epoch": 1.6535269709543567,
"grad_norm": 5.09955358505249,
"learning_rate": 4.397316598723385e-05,
"loss": 0.3166,
"step": 199
},
{
"epoch": 1.6618257261410787,
"grad_norm": 5.525445461273193,
"learning_rate": 4.352915592729723e-05,
"loss": 0.3016,
"step": 200
},
{
"epoch": 1.6618257261410787,
"eval_loss": 0.6099563241004944,
"eval_runtime": 7.1625,
"eval_samples_per_second": 28.342,
"eval_steps_per_second": 7.12,
"step": 200
},
{
"epoch": 1.6701244813278007,
"grad_norm": 7.207304954528809,
"learning_rate": 4.308566424176336e-05,
"loss": 0.5757,
"step": 201
},
{
"epoch": 1.6784232365145229,
"grad_norm": 8.592957496643066,
"learning_rate": 4.264272645841419e-05,
"loss": 0.487,
"step": 202
},
{
"epoch": 1.6867219917012448,
"grad_norm": 4.523548603057861,
"learning_rate": 4.2200378060659116e-05,
"loss": 0.1985,
"step": 203
},
{
"epoch": 1.6950207468879668,
"grad_norm": 3.577657699584961,
"learning_rate": 4.1758654484692186e-05,
"loss": 0.217,
"step": 204
},
{
"epoch": 1.703319502074689,
"grad_norm": 5.221662998199463,
"learning_rate": 4.131759111665349e-05,
"loss": 0.3242,
"step": 205
},
{
"epoch": 1.711618257261411,
"grad_norm": 4.403975009918213,
"learning_rate": 4.087722328979438e-05,
"loss": 0.2363,
"step": 206
},
{
"epoch": 1.7199170124481329,
"grad_norm": 4.955993175506592,
"learning_rate": 4.043758628164688e-05,
"loss": 0.4589,
"step": 207
},
{
"epoch": 1.7282157676348548,
"grad_norm": 5.290609836578369,
"learning_rate": 3.9998715311197785e-05,
"loss": 0.304,
"step": 208
},
{
"epoch": 1.7365145228215768,
"grad_norm": 4.659958362579346,
"learning_rate": 3.956064553606708e-05,
"loss": 0.1433,
"step": 209
},
{
"epoch": 1.7448132780082988,
"grad_norm": 6.113132953643799,
"learning_rate": 3.912341204969164e-05,
"loss": 0.2607,
"step": 210
},
{
"epoch": 1.7531120331950207,
"grad_norm": 7.161942005157471,
"learning_rate": 3.86870498785139e-05,
"loss": 0.5274,
"step": 211
},
{
"epoch": 1.7614107883817427,
"grad_norm": 7.1513671875,
"learning_rate": 3.825159397917589e-05,
"loss": 0.4634,
"step": 212
},
{
"epoch": 1.7697095435684647,
"grad_norm": 5.32570219039917,
"learning_rate": 3.781707923571891e-05,
"loss": 0.1875,
"step": 213
},
{
"epoch": 1.7780082987551866,
"grad_norm": 4.187331199645996,
"learning_rate": 3.738354045678891e-05,
"loss": 0.1516,
"step": 214
},
{
"epoch": 1.7863070539419086,
"grad_norm": 7.296047687530518,
"learning_rate": 3.695101237284815e-05,
"loss": 0.3847,
"step": 215
},
{
"epoch": 1.7946058091286305,
"grad_norm": 5.951739311218262,
"learning_rate": 3.651952963339282e-05,
"loss": 0.2656,
"step": 216
},
{
"epoch": 1.8029045643153527,
"grad_norm": 6.873353958129883,
"learning_rate": 3.608912680417737e-05,
"loss": 0.165,
"step": 217
},
{
"epoch": 1.8112033195020747,
"grad_norm": 6.497419834136963,
"learning_rate": 3.5659838364445505e-05,
"loss": 0.2504,
"step": 218
},
{
"epoch": 1.8195020746887967,
"grad_norm": 7.605969429016113,
"learning_rate": 3.523169870416795e-05,
"loss": 0.3794,
"step": 219
},
{
"epoch": 1.8278008298755186,
"grad_norm": 7.117875576019287,
"learning_rate": 3.480474212128766e-05,
"loss": 0.3687,
"step": 220
},
{
"epoch": 1.8360995850622408,
"grad_norm": 4.442015647888184,
"learning_rate": 3.4379002818972124e-05,
"loss": 0.2042,
"step": 221
},
{
"epoch": 1.8443983402489628,
"grad_norm": 8.238639831542969,
"learning_rate": 3.3954514902873425e-05,
"loss": 0.5648,
"step": 222
},
{
"epoch": 1.8526970954356847,
"grad_norm": 7.620614051818848,
"learning_rate": 3.3531312378396026e-05,
"loss": 0.3973,
"step": 223
},
{
"epoch": 1.8609958506224067,
"grad_norm": 5.214080333709717,
"learning_rate": 3.310942914797265e-05,
"loss": 0.2133,
"step": 224
},
{
"epoch": 1.8692946058091287,
"grad_norm": 5.911538600921631,
"learning_rate": 3.2688899008348386e-05,
"loss": 0.219,
"step": 225
},
{
"epoch": 1.8775933609958506,
"grad_norm": 10.658268928527832,
"learning_rate": 3.226975564787322e-05,
"loss": 0.338,
"step": 226
},
{
"epoch": 1.8858921161825726,
"grad_norm": 6.180187702178955,
"learning_rate": 3.185203264380338e-05,
"loss": 0.2234,
"step": 227
},
{
"epoch": 1.8941908713692945,
"grad_norm": 5.702350616455078,
"learning_rate": 3.143576345961132e-05,
"loss": 0.2215,
"step": 228
},
{
"epoch": 1.9024896265560165,
"grad_norm": 2.7787413597106934,
"learning_rate": 3.1020981442305184e-05,
"loss": 0.0856,
"step": 229
},
{
"epoch": 1.9107883817427385,
"grad_norm": 5.721746921539307,
"learning_rate": 3.060771981975726e-05,
"loss": 0.3339,
"step": 230
},
{
"epoch": 1.9190871369294604,
"grad_norm": 6.0323567390441895,
"learning_rate": 3.019601169804216e-05,
"loss": 0.1757,
"step": 231
},
{
"epoch": 1.9273858921161826,
"grad_norm": 8.973467826843262,
"learning_rate": 2.978589005878476e-05,
"loss": 0.262,
"step": 232
},
{
"epoch": 1.9356846473029046,
"grad_norm": 3.4507062435150146,
"learning_rate": 2.9377387756517982e-05,
"loss": 0.1235,
"step": 233
},
{
"epoch": 1.9439834024896265,
"grad_norm": 5.919799327850342,
"learning_rate": 2.897053751605093e-05,
"loss": 0.2862,
"step": 234
},
{
"epoch": 1.9522821576763485,
"grad_norm": 4.9692487716674805,
"learning_rate": 2.8565371929847284e-05,
"loss": 0.1764,
"step": 235
},
{
"epoch": 1.9605809128630707,
"grad_norm": 5.740837574005127,
"learning_rate": 2.8161923455414367e-05,
"loss": 0.1705,
"step": 236
},
{
"epoch": 1.9688796680497926,
"grad_norm": 5.595933437347412,
"learning_rate": 2.776022441270295e-05,
"loss": 0.2613,
"step": 237
},
{
"epoch": 1.9771784232365146,
"grad_norm": 4.591209888458252,
"learning_rate": 2.7360306981518146e-05,
"loss": 0.1906,
"step": 238
},
{
"epoch": 1.9854771784232366,
"grad_norm": 7.699284553527832,
"learning_rate": 2.6962203198941587e-05,
"loss": 0.4312,
"step": 239
},
{
"epoch": 1.9937759336099585,
"grad_norm": 6.995236873626709,
"learning_rate": 2.656594495676482e-05,
"loss": 0.3351,
"step": 240
},
{
"epoch": 2.004149377593361,
"grad_norm": 6.956718921661377,
"learning_rate": 2.6171563998934605e-05,
"loss": 0.2213,
"step": 241
},
{
"epoch": 2.012448132780083,
"grad_norm": 3.839200258255005,
"learning_rate": 2.5779091919009877e-05,
"loss": 0.2139,
"step": 242
},
{
"epoch": 2.020746887966805,
"grad_norm": 1.65491783618927,
"learning_rate": 2.5388560157630765e-05,
"loss": 0.0616,
"step": 243
},
{
"epoch": 2.029045643153527,
"grad_norm": 3.041247606277466,
"learning_rate": 2.500000000000001e-05,
"loss": 0.0922,
"step": 244
},
{
"epoch": 2.037344398340249,
"grad_norm": 1.837149739265442,
"learning_rate": 2.461344257337662e-05,
"loss": 0.0552,
"step": 245
},
{
"epoch": 2.045643153526971,
"grad_norm": 3.2361204624176025,
"learning_rate": 2.422891884458241e-05,
"loss": 0.0536,
"step": 246
},
{
"epoch": 2.0539419087136928,
"grad_norm": 4.026633262634277,
"learning_rate": 2.3846459617521128e-05,
"loss": 0.2692,
"step": 247
},
{
"epoch": 2.0622406639004147,
"grad_norm": 3.3415215015411377,
"learning_rate": 2.346609553071093e-05,
"loss": 0.0902,
"step": 248
},
{
"epoch": 2.070539419087137,
"grad_norm": 0.9877287745475769,
"learning_rate": 2.308785705482982e-05,
"loss": 0.0222,
"step": 249
},
{
"epoch": 2.078838174273859,
"grad_norm": 1.6253547668457031,
"learning_rate": 2.2711774490274766e-05,
"loss": 0.0311,
"step": 250
},
{
"epoch": 2.078838174273859,
"eval_loss": 0.4097523093223572,
"eval_runtime": 7.1697,
"eval_samples_per_second": 28.314,
"eval_steps_per_second": 7.113,
"step": 250
},
{
"epoch": 2.087136929460581,
"grad_norm": 4.143763065338135,
"learning_rate": 2.233787796473432e-05,
"loss": 0.0556,
"step": 251
},
{
"epoch": 2.095435684647303,
"grad_norm": 4.3261847496032715,
"learning_rate": 2.1966197430775053e-05,
"loss": 0.1249,
"step": 252
},
{
"epoch": 2.103734439834025,
"grad_norm": 4.310375690460205,
"learning_rate": 2.1596762663442218e-05,
"loss": 0.0994,
"step": 253
},
{
"epoch": 2.112033195020747,
"grad_norm": 1.23174250125885,
"learning_rate": 2.122960325787432e-05,
"loss": 0.0315,
"step": 254
},
{
"epoch": 2.120331950207469,
"grad_norm": 1.8426965475082397,
"learning_rate": 2.086474862693244e-05,
"loss": 0.0491,
"step": 255
},
{
"epoch": 2.128630705394191,
"grad_norm": 3.062448501586914,
"learning_rate": 2.050222799884387e-05,
"loss": 0.0968,
"step": 256
},
{
"epoch": 2.136929460580913,
"grad_norm": 2.411597967147827,
"learning_rate": 2.0142070414860704e-05,
"loss": 0.0283,
"step": 257
},
{
"epoch": 2.145228215767635,
"grad_norm": 2.3557469844818115,
"learning_rate": 1.9784304726933383e-05,
"loss": 0.0897,
"step": 258
},
{
"epoch": 2.1535269709543567,
"grad_norm": 8.79699420928955,
"learning_rate": 1.942895959539939e-05,
"loss": 0.0328,
"step": 259
},
{
"epoch": 2.1618257261410787,
"grad_norm": 1.4276204109191895,
"learning_rate": 1.9076063486687256e-05,
"loss": 0.016,
"step": 260
},
{
"epoch": 2.1701244813278007,
"grad_norm": 2.912771701812744,
"learning_rate": 1.8725644671036126e-05,
"loss": 0.1233,
"step": 261
},
{
"epoch": 2.1784232365145226,
"grad_norm": 4.887458801269531,
"learning_rate": 1.837773122023114e-05,
"loss": 0.0717,
"step": 262
},
{
"epoch": 2.186721991701245,
"grad_norm": 1.242602825164795,
"learning_rate": 1.803235100535452e-05,
"loss": 0.0208,
"step": 263
},
{
"epoch": 2.195020746887967,
"grad_norm": 0.9755971431732178,
"learning_rate": 1.7689531694552863e-05,
"loss": 0.0192,
"step": 264
},
{
"epoch": 2.203319502074689,
"grad_norm": 2.5533509254455566,
"learning_rate": 1.734930075082076e-05,
"loss": 0.0384,
"step": 265
},
{
"epoch": 2.211618257261411,
"grad_norm": 0.9231969118118286,
"learning_rate": 1.7011685429800595e-05,
"loss": 0.0152,
"step": 266
},
{
"epoch": 2.219917012448133,
"grad_norm": 1.7765648365020752,
"learning_rate": 1.6676712777599273e-05,
"loss": 0.041,
"step": 267
},
{
"epoch": 2.228215767634855,
"grad_norm": 2.0227739810943604,
"learning_rate": 1.6344409628621484e-05,
"loss": 0.1056,
"step": 268
},
{
"epoch": 2.236514522821577,
"grad_norm": 1.8746280670166016,
"learning_rate": 1.6014802603420044e-05,
"loss": 0.0542,
"step": 269
},
{
"epoch": 2.2448132780082988,
"grad_norm": 1.8291066884994507,
"learning_rate": 1.5687918106563326e-05,
"loss": 0.0206,
"step": 270
},
{
"epoch": 2.2531120331950207,
"grad_norm": 7.360498905181885,
"learning_rate": 1.536378232452003e-05,
"loss": 0.1436,
"step": 271
},
{
"epoch": 2.2614107883817427,
"grad_norm": 4.359367847442627,
"learning_rate": 1.5042421223561432e-05,
"loss": 0.075,
"step": 272
},
{
"epoch": 2.2697095435684647,
"grad_norm": 1.2709429264068604,
"learning_rate": 1.4723860547681162e-05,
"loss": 0.0172,
"step": 273
},
{
"epoch": 2.2780082987551866,
"grad_norm": 3.57598876953125,
"learning_rate": 1.440812581653298e-05,
"loss": 0.0619,
"step": 274
},
{
"epoch": 2.2863070539419086,
"grad_norm": 1.036085605621338,
"learning_rate": 1.4095242323386303e-05,
"loss": 0.0604,
"step": 275
},
{
"epoch": 2.2946058091286305,
"grad_norm": 2.2307868003845215,
"learning_rate": 1.3785235133100088e-05,
"loss": 0.037,
"step": 276
},
{
"epoch": 2.3029045643153525,
"grad_norm": 1.6419095993041992,
"learning_rate": 1.3478129080114848e-05,
"loss": 0.0735,
"step": 277
},
{
"epoch": 2.3112033195020745,
"grad_norm": 2.744889259338379,
"learning_rate": 1.3173948766463145e-05,
"loss": 0.0294,
"step": 278
},
{
"epoch": 2.3195020746887964,
"grad_norm": 3.0023012161254883,
"learning_rate": 1.2872718559798853e-05,
"loss": 0.0446,
"step": 279
},
{
"epoch": 2.327800829875519,
"grad_norm": 4.369894027709961,
"learning_rate": 1.257446259144494e-05,
"loss": 0.0467,
"step": 280
},
{
"epoch": 2.336099585062241,
"grad_norm": 1.3997019529342651,
"learning_rate": 1.2279204754460493e-05,
"loss": 0.0205,
"step": 281
},
{
"epoch": 2.3443983402489628,
"grad_norm": 3.3227059841156006,
"learning_rate": 1.1986968701726491e-05,
"loss": 0.083,
"step": 282
},
{
"epoch": 2.3526970954356847,
"grad_norm": 2.5781474113464355,
"learning_rate": 1.1697777844051105e-05,
"loss": 0.0682,
"step": 283
},
{
"epoch": 2.3609958506224067,
"grad_norm": 0.8888638019561768,
"learning_rate": 1.1411655348294247e-05,
"loss": 0.0169,
"step": 284
},
{
"epoch": 2.3692946058091287,
"grad_norm": 3.076343059539795,
"learning_rate": 1.1128624135511712e-05,
"loss": 0.0746,
"step": 285
},
{
"epoch": 2.3775933609958506,
"grad_norm": 2.557579755783081,
"learning_rate": 1.0848706879118892e-05,
"loss": 0.0715,
"step": 286
},
{
"epoch": 2.3858921161825726,
"grad_norm": 3.306203842163086,
"learning_rate": 1.0571926003074561e-05,
"loss": 0.063,
"step": 287
},
{
"epoch": 2.3941908713692945,
"grad_norm": 1.7703243494033813,
"learning_rate": 1.0298303680084448e-05,
"loss": 0.0303,
"step": 288
},
{
"epoch": 2.4024896265560165,
"grad_norm": 3.1793651580810547,
"learning_rate": 1.0027861829824952e-05,
"loss": 0.0253,
"step": 289
},
{
"epoch": 2.4107883817427385,
"grad_norm": 2.1019341945648193,
"learning_rate": 9.760622117187235e-06,
"loss": 0.0759,
"step": 290
},
{
"epoch": 2.4190871369294604,
"grad_norm": 1.9458683729171753,
"learning_rate": 9.496605950541676e-06,
"loss": 0.0259,
"step": 291
},
{
"epoch": 2.4273858921161824,
"grad_norm": 3.2663235664367676,
"learning_rate": 9.235834480022787e-06,
"loss": 0.071,
"step": 292
},
{
"epoch": 2.435684647302905,
"grad_norm": 2.9131743907928467,
"learning_rate": 8.978328595834984e-06,
"loss": 0.0265,
"step": 293
},
{
"epoch": 2.4439834024896268,
"grad_norm": 4.122836589813232,
"learning_rate": 8.724108926578999e-06,
"loss": 0.0449,
"step": 294
},
{
"epoch": 2.4522821576763487,
"grad_norm": 2.857977867126465,
"learning_rate": 8.473195837599418e-06,
"loss": 0.0248,
"step": 295
},
{
"epoch": 2.4605809128630707,
"grad_norm": 4.170780181884766,
"learning_rate": 8.225609429353187e-06,
"loss": 0.0373,
"step": 296
},
{
"epoch": 2.4688796680497926,
"grad_norm": 2.490182638168335,
"learning_rate": 7.981369535799354e-06,
"loss": 0.0751,
"step": 297
},
{
"epoch": 2.4771784232365146,
"grad_norm": 3.713991165161133,
"learning_rate": 7.740495722810271e-06,
"loss": 0.0964,
"step": 298
},
{
"epoch": 2.4854771784232366,
"grad_norm": 3.0123841762542725,
"learning_rate": 7.503007286604069e-06,
"loss": 0.0414,
"step": 299
},
{
"epoch": 2.4937759336099585,
"grad_norm": 4.080419063568115,
"learning_rate": 7.268923252198989e-06,
"loss": 0.1049,
"step": 300
},
{
"epoch": 2.4937759336099585,
"eval_loss": 0.3370027542114258,
"eval_runtime": 7.1786,
"eval_samples_per_second": 28.278,
"eval_steps_per_second": 7.104,
"step": 300
},
{
"epoch": 2.5020746887966805,
"grad_norm": 5.102431774139404,
"learning_rate": 7.038262371889159e-06,
"loss": 0.1422,
"step": 301
},
{
"epoch": 2.5103734439834025,
"grad_norm": 1.8823308944702148,
"learning_rate": 6.811043123742494e-06,
"loss": 0.0404,
"step": 302
},
{
"epoch": 2.5186721991701244,
"grad_norm": 1.6759247779846191,
"learning_rate": 6.587283710120324e-06,
"loss": 0.0273,
"step": 303
},
{
"epoch": 2.5269709543568464,
"grad_norm": 1.534498929977417,
"learning_rate": 6.367002056219284e-06,
"loss": 0.0274,
"step": 304
},
{
"epoch": 2.5352697095435683,
"grad_norm": 1.8221664428710938,
"learning_rate": 6.150215808635335e-06,
"loss": 0.1055,
"step": 305
},
{
"epoch": 2.5435684647302903,
"grad_norm": 2.2624080181121826,
"learning_rate": 5.936942333950063e-06,
"loss": 0.0522,
"step": 306
},
{
"epoch": 2.5518672199170123,
"grad_norm": 1.1444363594055176,
"learning_rate": 5.727198717339511e-06,
"loss": 0.0714,
"step": 307
},
{
"epoch": 2.5601659751037342,
"grad_norm": 1.0299466848373413,
"learning_rate": 5.521001761205441e-06,
"loss": 0.017,
"step": 308
},
{
"epoch": 2.568464730290456,
"grad_norm": 1.962306022644043,
"learning_rate": 5.318367983829392e-06,
"loss": 0.0308,
"step": 309
},
{
"epoch": 2.576763485477178,
"grad_norm": 1.333689570426941,
"learning_rate": 5.1193136180493095e-06,
"loss": 0.0488,
"step": 310
},
{
"epoch": 2.5850622406639006,
"grad_norm": 1.2325903177261353,
"learning_rate": 4.9238546099592e-06,
"loss": 0.0345,
"step": 311
},
{
"epoch": 2.5933609958506225,
"grad_norm": 1.9375916719436646,
"learning_rate": 4.732006617631729e-06,
"loss": 0.0257,
"step": 312
},
{
"epoch": 2.6016597510373445,
"grad_norm": 7.875622749328613,
"learning_rate": 4.54378500986381e-06,
"loss": 0.2938,
"step": 313
},
{
"epoch": 2.6099585062240664,
"grad_norm": 1.0272877216339111,
"learning_rate": 4.3592048649454594e-06,
"loss": 0.0183,
"step": 314
},
{
"epoch": 2.6182572614107884,
"grad_norm": 3.053288459777832,
"learning_rate": 4.178280969451853e-06,
"loss": 0.0555,
"step": 315
},
{
"epoch": 2.6265560165975104,
"grad_norm": 1.3870058059692383,
"learning_rate": 4.001027817058789e-06,
"loss": 0.0233,
"step": 316
},
{
"epoch": 2.6348547717842323,
"grad_norm": 2.4694643020629883,
"learning_rate": 3.827459607381678e-06,
"loss": 0.0883,
"step": 317
},
{
"epoch": 2.6431535269709543,
"grad_norm": 1.914157748222351,
"learning_rate": 3.6575902448379105e-06,
"loss": 0.0428,
"step": 318
},
{
"epoch": 2.6514522821576763,
"grad_norm": 2.42271089553833,
"learning_rate": 3.4914333375330898e-06,
"loss": 0.0574,
"step": 319
},
{
"epoch": 2.659751037344398,
"grad_norm": 1.2127854824066162,
"learning_rate": 3.329002196170816e-06,
"loss": 0.0504,
"step": 320
},
{
"epoch": 2.66804979253112,
"grad_norm": 2.0530266761779785,
"learning_rate": 3.1703098329864233e-06,
"loss": 0.0504,
"step": 321
},
{
"epoch": 2.6763485477178426,
"grad_norm": 1.7026387453079224,
"learning_rate": 3.0153689607045845e-06,
"loss": 0.1294,
"step": 322
},
{
"epoch": 2.6846473029045645,
"grad_norm": 2.17067813873291,
"learning_rate": 2.864191991520848e-06,
"loss": 0.0188,
"step": 323
},
{
"epoch": 2.6929460580912865,
"grad_norm": 0.5803589820861816,
"learning_rate": 2.71679103610738e-06,
"loss": 0.0468,
"step": 324
},
{
"epoch": 2.7012448132780085,
"grad_norm": 2.2889926433563232,
"learning_rate": 2.573177902642726e-06,
"loss": 0.1123,
"step": 325
},
{
"epoch": 2.7095435684647304,
"grad_norm": 1.7318047285079956,
"learning_rate": 2.4333640958659143e-06,
"loss": 0.027,
"step": 326
},
{
"epoch": 2.7178423236514524,
"grad_norm": 2.7389817237854004,
"learning_rate": 2.2973608161547753e-06,
"loss": 0.08,
"step": 327
},
{
"epoch": 2.7261410788381744,
"grad_norm": 1.5119000673294067,
"learning_rate": 2.1651789586287442e-06,
"loss": 0.1249,
"step": 328
},
{
"epoch": 2.7344398340248963,
"grad_norm": 0.7453382015228271,
"learning_rate": 2.03682911227599e-06,
"loss": 0.0185,
"step": 329
},
{
"epoch": 2.7427385892116183,
"grad_norm": 1.6104263067245483,
"learning_rate": 1.9123215591052013e-06,
"loss": 0.0319,
"step": 330
},
{
"epoch": 2.7510373443983402,
"grad_norm": 2.6384763717651367,
"learning_rate": 1.7916662733218847e-06,
"loss": 0.1004,
"step": 331
},
{
"epoch": 2.759336099585062,
"grad_norm": 4.27977180480957,
"learning_rate": 1.6748729205293023e-06,
"loss": 0.1176,
"step": 332
},
{
"epoch": 2.767634854771784,
"grad_norm": 2.4761962890625,
"learning_rate": 1.5619508569542363e-06,
"loss": 0.0396,
"step": 333
},
{
"epoch": 2.775933609958506,
"grad_norm": 2.1308181285858154,
"learning_rate": 1.4529091286973995e-06,
"loss": 0.0385,
"step": 334
},
{
"epoch": 2.784232365145228,
"grad_norm": 1.79630446434021,
"learning_rate": 1.3477564710088098e-06,
"loss": 0.0193,
"step": 335
},
{
"epoch": 2.79253112033195,
"grad_norm": 6.298658847808838,
"learning_rate": 1.2465013075879883e-06,
"loss": 0.0399,
"step": 336
},
{
"epoch": 2.800829875518672,
"grad_norm": 1.5001986026763916,
"learning_rate": 1.1491517499091498e-06,
"loss": 0.0225,
"step": 337
},
{
"epoch": 2.809128630705394,
"grad_norm": 1.3318809270858765,
"learning_rate": 1.055715596571405e-06,
"loss": 0.0212,
"step": 338
},
{
"epoch": 2.817427385892116,
"grad_norm": 0.8632059693336487,
"learning_rate": 9.662003326740166e-07,
"loss": 0.0136,
"step": 339
},
{
"epoch": 2.825726141078838,
"grad_norm": 2.7577383518218994,
"learning_rate": 8.806131292167618e-07,
"loss": 0.068,
"step": 340
},
{
"epoch": 2.8340248962655603,
"grad_norm": 1.1640796661376953,
"learning_rate": 7.989608425254924e-07,
"loss": 0.0268,
"step": 341
},
{
"epoch": 2.8423236514522823,
"grad_norm": 2.3888018131256104,
"learning_rate": 7.212500137028788e-07,
"loss": 0.0754,
"step": 342
},
{
"epoch": 2.8506224066390042,
"grad_norm": 3.372357130050659,
"learning_rate": 6.474868681043578e-07,
"loss": 0.1583,
"step": 343
},
{
"epoch": 2.858921161825726,
"grad_norm": 0.9823101162910461,
"learning_rate": 5.776773148394976e-07,
"loss": 0.0163,
"step": 344
},
{
"epoch": 2.867219917012448,
"grad_norm": 1.4585671424865723,
"learning_rate": 5.118269462985714e-07,
"loss": 0.0248,
"step": 345
},
{
"epoch": 2.87551867219917,
"grad_norm": 2.173081636428833,
"learning_rate": 4.4994103770457653e-07,
"loss": 0.0443,
"step": 346
},
{
"epoch": 2.883817427385892,
"grad_norm": 1.831309199333191,
"learning_rate": 3.920245466906391e-07,
"loss": 0.0713,
"step": 347
},
{
"epoch": 2.892116182572614,
"grad_norm": 1.361579179763794,
"learning_rate": 3.380821129028489e-07,
"loss": 0.0208,
"step": 348
},
{
"epoch": 2.900414937759336,
"grad_norm": 1.126133680343628,
"learning_rate": 2.8811805762860576e-07,
"loss": 0.01,
"step": 349
},
{
"epoch": 2.908713692946058,
"grad_norm": 0.6526811122894287,
"learning_rate": 2.421363834504087e-07,
"loss": 0.0093,
"step": 350
},
{
"epoch": 2.908713692946058,
"eval_loss": 0.32943063974380493,
"eval_runtime": 7.1396,
"eval_samples_per_second": 28.433,
"eval_steps_per_second": 7.143,
"step": 350
}
],
"logging_steps": 1,
"max_steps": 361,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.390780807643136e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}