Taiwan-ELM-1_1B / trainer_state.json
liswei's picture
End of training
b428ad2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 13550,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007380073800738007,
"grad_norm": 100.91170501708984,
"learning_rate": 7.380073800738008e-07,
"loss": 15.7032,
"step": 10
},
{
"epoch": 0.0014760147601476014,
"grad_norm": 53.92641830444336,
"learning_rate": 1.4760147601476015e-06,
"loss": 12.3986,
"step": 20
},
{
"epoch": 0.002214022140221402,
"grad_norm": 22.292367935180664,
"learning_rate": 2.2140221402214023e-06,
"loss": 10.8644,
"step": 30
},
{
"epoch": 0.002952029520295203,
"grad_norm": 9.494361877441406,
"learning_rate": 2.952029520295203e-06,
"loss": 10.0464,
"step": 40
},
{
"epoch": 0.0036900369003690036,
"grad_norm": 5.657174110412598,
"learning_rate": 3.690036900369004e-06,
"loss": 9.8267,
"step": 50
},
{
"epoch": 0.004428044280442804,
"grad_norm": 11.480326652526855,
"learning_rate": 4.428044280442805e-06,
"loss": 9.6732,
"step": 60
},
{
"epoch": 0.0051660516605166054,
"grad_norm": 32.78021240234375,
"learning_rate": 5.166051660516605e-06,
"loss": 9.6283,
"step": 70
},
{
"epoch": 0.005904059040590406,
"grad_norm": 26.172266006469727,
"learning_rate": 5.904059040590406e-06,
"loss": 9.4751,
"step": 80
},
{
"epoch": 0.006642066420664207,
"grad_norm": 33.70742416381836,
"learning_rate": 6.642066420664207e-06,
"loss": 9.3959,
"step": 90
},
{
"epoch": 0.007380073800738007,
"grad_norm": 63.11279296875,
"learning_rate": 7.380073800738008e-06,
"loss": 9.3828,
"step": 100
},
{
"epoch": 0.008118081180811807,
"grad_norm": 15.8975191116333,
"learning_rate": 8.118081180811808e-06,
"loss": 9.2352,
"step": 110
},
{
"epoch": 0.008856088560885609,
"grad_norm": 12.312295913696289,
"learning_rate": 8.85608856088561e-06,
"loss": 9.1436,
"step": 120
},
{
"epoch": 0.00959409594095941,
"grad_norm": 10.606693267822266,
"learning_rate": 9.59409594095941e-06,
"loss": 8.8854,
"step": 130
},
{
"epoch": 0.010332103321033211,
"grad_norm": 16.000524520874023,
"learning_rate": 1.033210332103321e-05,
"loss": 8.703,
"step": 140
},
{
"epoch": 0.01107011070110701,
"grad_norm": 14.297750473022461,
"learning_rate": 1.1070110701107012e-05,
"loss": 8.5243,
"step": 150
},
{
"epoch": 0.011808118081180811,
"grad_norm": 11.472665786743164,
"learning_rate": 1.1808118081180812e-05,
"loss": 8.232,
"step": 160
},
{
"epoch": 0.012546125461254613,
"grad_norm": 7.633975028991699,
"learning_rate": 1.2546125461254612e-05,
"loss": 8.0453,
"step": 170
},
{
"epoch": 0.013284132841328414,
"grad_norm": 7.606258869171143,
"learning_rate": 1.3284132841328414e-05,
"loss": 7.9445,
"step": 180
},
{
"epoch": 0.014022140221402213,
"grad_norm": 13.680715560913086,
"learning_rate": 1.4022140221402214e-05,
"loss": 7.9335,
"step": 190
},
{
"epoch": 0.014760147601476014,
"grad_norm": 10.28775405883789,
"learning_rate": 1.4760147601476015e-05,
"loss": 7.7923,
"step": 200
},
{
"epoch": 0.015498154981549815,
"grad_norm": 7.461697101593018,
"learning_rate": 1.5498154981549817e-05,
"loss": 7.763,
"step": 210
},
{
"epoch": 0.016236162361623615,
"grad_norm": 4.384743690490723,
"learning_rate": 1.6236162361623615e-05,
"loss": 7.7702,
"step": 220
},
{
"epoch": 0.016974169741697416,
"grad_norm": 5.806989669799805,
"learning_rate": 1.6974169741697417e-05,
"loss": 7.76,
"step": 230
},
{
"epoch": 0.017712177121771217,
"grad_norm": 5.75732421875,
"learning_rate": 1.771217712177122e-05,
"loss": 7.6101,
"step": 240
},
{
"epoch": 0.01845018450184502,
"grad_norm": 3.3278968334198,
"learning_rate": 1.845018450184502e-05,
"loss": 7.5669,
"step": 250
},
{
"epoch": 0.01918819188191882,
"grad_norm": 5.252697467803955,
"learning_rate": 1.918819188191882e-05,
"loss": 7.3905,
"step": 260
},
{
"epoch": 0.01992619926199262,
"grad_norm": 3.135658025741577,
"learning_rate": 1.992619926199262e-05,
"loss": 7.3472,
"step": 270
},
{
"epoch": 0.020664206642066422,
"grad_norm": 5.030785083770752,
"learning_rate": 2.066420664206642e-05,
"loss": 7.2426,
"step": 280
},
{
"epoch": 0.021402214022140223,
"grad_norm": 4.882932186126709,
"learning_rate": 2.140221402214022e-05,
"loss": 7.0541,
"step": 290
},
{
"epoch": 0.02214022140221402,
"grad_norm": 2.2638933658599854,
"learning_rate": 2.2140221402214025e-05,
"loss": 7.0113,
"step": 300
},
{
"epoch": 0.022878228782287822,
"grad_norm": 4.782796859741211,
"learning_rate": 2.2878228782287826e-05,
"loss": 6.8661,
"step": 310
},
{
"epoch": 0.023616236162361623,
"grad_norm": 1.9799453020095825,
"learning_rate": 2.3616236162361624e-05,
"loss": 7.0323,
"step": 320
},
{
"epoch": 0.024354243542435424,
"grad_norm": 4.8417558670043945,
"learning_rate": 2.4354243542435426e-05,
"loss": 6.8865,
"step": 330
},
{
"epoch": 0.025092250922509225,
"grad_norm": 4.531852722167969,
"learning_rate": 2.5092250922509224e-05,
"loss": 6.7982,
"step": 340
},
{
"epoch": 0.025830258302583026,
"grad_norm": 3.0997111797332764,
"learning_rate": 2.5830258302583026e-05,
"loss": 6.79,
"step": 350
},
{
"epoch": 0.026568265682656828,
"grad_norm": 3.2981128692626953,
"learning_rate": 2.6568265682656828e-05,
"loss": 6.7459,
"step": 360
},
{
"epoch": 0.02730627306273063,
"grad_norm": 3.313589572906494,
"learning_rate": 2.730627306273063e-05,
"loss": 6.637,
"step": 370
},
{
"epoch": 0.028044280442804426,
"grad_norm": 2.1940433979034424,
"learning_rate": 2.8044280442804427e-05,
"loss": 6.5645,
"step": 380
},
{
"epoch": 0.028782287822878228,
"grad_norm": 3.6912360191345215,
"learning_rate": 2.878228782287823e-05,
"loss": 6.4398,
"step": 390
},
{
"epoch": 0.02952029520295203,
"grad_norm": 3.37406325340271,
"learning_rate": 2.952029520295203e-05,
"loss": 6.4774,
"step": 400
},
{
"epoch": 0.03025830258302583,
"grad_norm": 3.22963285446167,
"learning_rate": 3.0258302583025832e-05,
"loss": 6.3106,
"step": 410
},
{
"epoch": 0.03099630996309963,
"grad_norm": 2.419431686401367,
"learning_rate": 3.0996309963099634e-05,
"loss": 6.3172,
"step": 420
},
{
"epoch": 0.03173431734317343,
"grad_norm": 2.677661895751953,
"learning_rate": 3.173431734317343e-05,
"loss": 6.1359,
"step": 430
},
{
"epoch": 0.03247232472324723,
"grad_norm": 2.795398712158203,
"learning_rate": 3.247232472324723e-05,
"loss": 6.2412,
"step": 440
},
{
"epoch": 0.033210332103321034,
"grad_norm": 2.9979288578033447,
"learning_rate": 3.3210332103321035e-05,
"loss": 6.2192,
"step": 450
},
{
"epoch": 0.03394833948339483,
"grad_norm": 3.352975845336914,
"learning_rate": 3.3948339483394833e-05,
"loss": 6.1654,
"step": 460
},
{
"epoch": 0.03468634686346864,
"grad_norm": 2.6526570320129395,
"learning_rate": 3.468634686346864e-05,
"loss": 6.0669,
"step": 470
},
{
"epoch": 0.035424354243542434,
"grad_norm": 2.950063467025757,
"learning_rate": 3.542435424354244e-05,
"loss": 6.0332,
"step": 480
},
{
"epoch": 0.03616236162361624,
"grad_norm": 4.221488952636719,
"learning_rate": 3.6162361623616235e-05,
"loss": 5.9708,
"step": 490
},
{
"epoch": 0.03690036900369004,
"grad_norm": 2.6405985355377197,
"learning_rate": 3.690036900369004e-05,
"loss": 5.76,
"step": 500
},
{
"epoch": 0.037638376383763834,
"grad_norm": 4.019585132598877,
"learning_rate": 3.763837638376384e-05,
"loss": 5.9036,
"step": 510
},
{
"epoch": 0.03837638376383764,
"grad_norm": 2.687580108642578,
"learning_rate": 3.837638376383764e-05,
"loss": 5.808,
"step": 520
},
{
"epoch": 0.03911439114391144,
"grad_norm": 3.339268207550049,
"learning_rate": 3.911439114391144e-05,
"loss": 5.7391,
"step": 530
},
{
"epoch": 0.03985239852398524,
"grad_norm": 3.0441882610321045,
"learning_rate": 3.985239852398524e-05,
"loss": 5.7045,
"step": 540
},
{
"epoch": 0.04059040590405904,
"grad_norm": 2.8957650661468506,
"learning_rate": 4.0590405904059045e-05,
"loss": 5.6956,
"step": 550
},
{
"epoch": 0.041328413284132844,
"grad_norm": 3.6834869384765625,
"learning_rate": 4.132841328413284e-05,
"loss": 5.6103,
"step": 560
},
{
"epoch": 0.04206642066420664,
"grad_norm": 3.4573564529418945,
"learning_rate": 4.206642066420665e-05,
"loss": 5.4107,
"step": 570
},
{
"epoch": 0.042804428044280446,
"grad_norm": 3.2341487407684326,
"learning_rate": 4.280442804428044e-05,
"loss": 5.4208,
"step": 580
},
{
"epoch": 0.043542435424354244,
"grad_norm": 3.6147806644439697,
"learning_rate": 4.3542435424354244e-05,
"loss": 5.4217,
"step": 590
},
{
"epoch": 0.04428044280442804,
"grad_norm": 3.6139488220214844,
"learning_rate": 4.428044280442805e-05,
"loss": 5.3994,
"step": 600
},
{
"epoch": 0.045018450184501846,
"grad_norm": 3.277580499649048,
"learning_rate": 4.501845018450185e-05,
"loss": 5.3605,
"step": 610
},
{
"epoch": 0.045756457564575644,
"grad_norm": 2.5641043186187744,
"learning_rate": 4.575645756457565e-05,
"loss": 5.1682,
"step": 620
},
{
"epoch": 0.04649446494464945,
"grad_norm": 2.422578811645508,
"learning_rate": 4.6494464944649444e-05,
"loss": 5.1585,
"step": 630
},
{
"epoch": 0.047232472324723246,
"grad_norm": 4.027858257293701,
"learning_rate": 4.723247232472325e-05,
"loss": 5.2147,
"step": 640
},
{
"epoch": 0.04797047970479705,
"grad_norm": 2.401747226715088,
"learning_rate": 4.797047970479705e-05,
"loss": 5.0839,
"step": 650
},
{
"epoch": 0.04870848708487085,
"grad_norm": 2.9220759868621826,
"learning_rate": 4.870848708487085e-05,
"loss": 5.1216,
"step": 660
},
{
"epoch": 0.04944649446494465,
"grad_norm": 2.4891719818115234,
"learning_rate": 4.944649446494466e-05,
"loss": 4.9789,
"step": 670
},
{
"epoch": 0.05018450184501845,
"grad_norm": 2.279683828353882,
"learning_rate": 5.018450184501845e-05,
"loss": 4.9611,
"step": 680
},
{
"epoch": 0.05092250922509225,
"grad_norm": 2.045536518096924,
"learning_rate": 5.0922509225092254e-05,
"loss": 4.8993,
"step": 690
},
{
"epoch": 0.05166051660516605,
"grad_norm": 1.9132373332977295,
"learning_rate": 5.166051660516605e-05,
"loss": 4.8083,
"step": 700
},
{
"epoch": 0.05239852398523985,
"grad_norm": 2.304215669631958,
"learning_rate": 5.239852398523986e-05,
"loss": 4.828,
"step": 710
},
{
"epoch": 0.053136531365313655,
"grad_norm": 2.2891597747802734,
"learning_rate": 5.3136531365313655e-05,
"loss": 4.7838,
"step": 720
},
{
"epoch": 0.05387453874538745,
"grad_norm": 2.411600351333618,
"learning_rate": 5.387453874538746e-05,
"loss": 4.6931,
"step": 730
},
{
"epoch": 0.05461254612546126,
"grad_norm": 1.6772541999816895,
"learning_rate": 5.461254612546126e-05,
"loss": 4.7027,
"step": 740
},
{
"epoch": 0.055350553505535055,
"grad_norm": 1.7979137897491455,
"learning_rate": 5.535055350553506e-05,
"loss": 4.7452,
"step": 750
},
{
"epoch": 0.05608856088560885,
"grad_norm": 2.3298912048339844,
"learning_rate": 5.6088560885608855e-05,
"loss": 4.7062,
"step": 760
},
{
"epoch": 0.05682656826568266,
"grad_norm": 1.986875295639038,
"learning_rate": 5.682656826568265e-05,
"loss": 4.6142,
"step": 770
},
{
"epoch": 0.057564575645756455,
"grad_norm": 1.8501532077789307,
"learning_rate": 5.756457564575646e-05,
"loss": 4.4524,
"step": 780
},
{
"epoch": 0.05830258302583026,
"grad_norm": 1.5959872007369995,
"learning_rate": 5.830258302583026e-05,
"loss": 4.5299,
"step": 790
},
{
"epoch": 0.05904059040590406,
"grad_norm": 2.339456796646118,
"learning_rate": 5.904059040590406e-05,
"loss": 4.4703,
"step": 800
},
{
"epoch": 0.05977859778597786,
"grad_norm": 1.6436880826950073,
"learning_rate": 5.9778597785977866e-05,
"loss": 4.463,
"step": 810
},
{
"epoch": 0.06051660516605166,
"grad_norm": 1.7336505651474,
"learning_rate": 6.0516605166051664e-05,
"loss": 4.4363,
"step": 820
},
{
"epoch": 0.061254612546125464,
"grad_norm": 1.691726565361023,
"learning_rate": 6.125461254612547e-05,
"loss": 4.4099,
"step": 830
},
{
"epoch": 0.06199261992619926,
"grad_norm": 1.5019862651824951,
"learning_rate": 6.199261992619927e-05,
"loss": 4.4153,
"step": 840
},
{
"epoch": 0.06273062730627306,
"grad_norm": 1.4851793050765991,
"learning_rate": 6.273062730627307e-05,
"loss": 4.4721,
"step": 850
},
{
"epoch": 0.06346863468634686,
"grad_norm": 1.4793798923492432,
"learning_rate": 6.346863468634686e-05,
"loss": 4.3287,
"step": 860
},
{
"epoch": 0.06420664206642067,
"grad_norm": 1.5791796445846558,
"learning_rate": 6.420664206642066e-05,
"loss": 4.3766,
"step": 870
},
{
"epoch": 0.06494464944649446,
"grad_norm": 1.5449219942092896,
"learning_rate": 6.494464944649446e-05,
"loss": 4.2412,
"step": 880
},
{
"epoch": 0.06568265682656826,
"grad_norm": 1.229464054107666,
"learning_rate": 6.568265682656827e-05,
"loss": 4.1558,
"step": 890
},
{
"epoch": 0.06642066420664207,
"grad_norm": 1.5863291025161743,
"learning_rate": 6.642066420664207e-05,
"loss": 4.2074,
"step": 900
},
{
"epoch": 0.06715867158671587,
"grad_norm": 1.319446086883545,
"learning_rate": 6.715867158671587e-05,
"loss": 4.258,
"step": 910
},
{
"epoch": 0.06789667896678966,
"grad_norm": 1.3132025003433228,
"learning_rate": 6.789667896678967e-05,
"loss": 4.1444,
"step": 920
},
{
"epoch": 0.06863468634686347,
"grad_norm": 1.5694645643234253,
"learning_rate": 6.863468634686348e-05,
"loss": 4.1201,
"step": 930
},
{
"epoch": 0.06937269372693727,
"grad_norm": 1.4163988828659058,
"learning_rate": 6.937269372693728e-05,
"loss": 4.1113,
"step": 940
},
{
"epoch": 0.07011070110701106,
"grad_norm": 1.487798810005188,
"learning_rate": 7.011070110701108e-05,
"loss": 4.0805,
"step": 950
},
{
"epoch": 0.07084870848708487,
"grad_norm": 1.2213908433914185,
"learning_rate": 7.084870848708487e-05,
"loss": 4.0324,
"step": 960
},
{
"epoch": 0.07158671586715867,
"grad_norm": 1.332588791847229,
"learning_rate": 7.158671586715867e-05,
"loss": 4.0455,
"step": 970
},
{
"epoch": 0.07232472324723248,
"grad_norm": 1.212963342666626,
"learning_rate": 7.232472324723247e-05,
"loss": 4.0044,
"step": 980
},
{
"epoch": 0.07306273062730627,
"grad_norm": 1.0928430557250977,
"learning_rate": 7.306273062730628e-05,
"loss": 4.0215,
"step": 990
},
{
"epoch": 0.07380073800738007,
"grad_norm": 1.1430400609970093,
"learning_rate": 7.380073800738008e-05,
"loss": 4.0744,
"step": 1000
},
{
"epoch": 0.07453874538745388,
"grad_norm": 0.9975944757461548,
"learning_rate": 7.453874538745388e-05,
"loss": 3.9955,
"step": 1010
},
{
"epoch": 0.07527675276752767,
"grad_norm": 1.1288777589797974,
"learning_rate": 7.527675276752768e-05,
"loss": 3.9916,
"step": 1020
},
{
"epoch": 0.07601476014760147,
"grad_norm": 1.0064688920974731,
"learning_rate": 7.601476014760149e-05,
"loss": 3.9025,
"step": 1030
},
{
"epoch": 0.07675276752767528,
"grad_norm": 1.329229474067688,
"learning_rate": 7.675276752767529e-05,
"loss": 3.9822,
"step": 1040
},
{
"epoch": 0.07749077490774908,
"grad_norm": 1.022760033607483,
"learning_rate": 7.749077490774908e-05,
"loss": 3.8762,
"step": 1050
},
{
"epoch": 0.07822878228782287,
"grad_norm": 1.0934398174285889,
"learning_rate": 7.822878228782288e-05,
"loss": 3.7911,
"step": 1060
},
{
"epoch": 0.07896678966789668,
"grad_norm": 1.008171796798706,
"learning_rate": 7.896678966789668e-05,
"loss": 3.8972,
"step": 1070
},
{
"epoch": 0.07970479704797048,
"grad_norm": 1.1563254594802856,
"learning_rate": 7.970479704797048e-05,
"loss": 3.7901,
"step": 1080
},
{
"epoch": 0.08044280442804429,
"grad_norm": 1.06783926486969,
"learning_rate": 8.044280442804428e-05,
"loss": 3.9768,
"step": 1090
},
{
"epoch": 0.08118081180811808,
"grad_norm": 1.0809143781661987,
"learning_rate": 8.118081180811809e-05,
"loss": 3.7534,
"step": 1100
},
{
"epoch": 0.08191881918819188,
"grad_norm": 1.040157675743103,
"learning_rate": 8.191881918819189e-05,
"loss": 3.7609,
"step": 1110
},
{
"epoch": 0.08265682656826569,
"grad_norm": 1.0198458433151245,
"learning_rate": 8.265682656826569e-05,
"loss": 3.8031,
"step": 1120
},
{
"epoch": 0.08339483394833948,
"grad_norm": 1.3017419576644897,
"learning_rate": 8.339483394833948e-05,
"loss": 3.8526,
"step": 1130
},
{
"epoch": 0.08413284132841328,
"grad_norm": 0.9285693168640137,
"learning_rate": 8.41328413284133e-05,
"loss": 3.7551,
"step": 1140
},
{
"epoch": 0.08487084870848709,
"grad_norm": 0.9603882431983948,
"learning_rate": 8.48708487084871e-05,
"loss": 3.834,
"step": 1150
},
{
"epoch": 0.08560885608856089,
"grad_norm": 0.9195291996002197,
"learning_rate": 8.560885608856088e-05,
"loss": 3.7804,
"step": 1160
},
{
"epoch": 0.08634686346863468,
"grad_norm": 1.0526838302612305,
"learning_rate": 8.634686346863469e-05,
"loss": 3.8757,
"step": 1170
},
{
"epoch": 0.08708487084870849,
"grad_norm": 0.8891322612762451,
"learning_rate": 8.708487084870849e-05,
"loss": 3.7513,
"step": 1180
},
{
"epoch": 0.08782287822878229,
"grad_norm": 0.9467900395393372,
"learning_rate": 8.782287822878229e-05,
"loss": 3.7555,
"step": 1190
},
{
"epoch": 0.08856088560885608,
"grad_norm": 1.0294831991195679,
"learning_rate": 8.85608856088561e-05,
"loss": 3.834,
"step": 1200
},
{
"epoch": 0.08929889298892989,
"grad_norm": 1.0832924842834473,
"learning_rate": 8.92988929889299e-05,
"loss": 3.6299,
"step": 1210
},
{
"epoch": 0.09003690036900369,
"grad_norm": 0.9595062732696533,
"learning_rate": 9.00369003690037e-05,
"loss": 3.7695,
"step": 1220
},
{
"epoch": 0.0907749077490775,
"grad_norm": 0.8714928030967712,
"learning_rate": 9.077490774907749e-05,
"loss": 3.7346,
"step": 1230
},
{
"epoch": 0.09151291512915129,
"grad_norm": 0.9189225435256958,
"learning_rate": 9.15129151291513e-05,
"loss": 3.7646,
"step": 1240
},
{
"epoch": 0.09225092250922509,
"grad_norm": 1.0212230682373047,
"learning_rate": 9.22509225092251e-05,
"loss": 3.6518,
"step": 1250
},
{
"epoch": 0.0929889298892989,
"grad_norm": 0.8631012439727783,
"learning_rate": 9.298892988929889e-05,
"loss": 3.6702,
"step": 1260
},
{
"epoch": 0.09372693726937269,
"grad_norm": 0.76339191198349,
"learning_rate": 9.37269372693727e-05,
"loss": 3.6757,
"step": 1270
},
{
"epoch": 0.09446494464944649,
"grad_norm": 0.8459323048591614,
"learning_rate": 9.44649446494465e-05,
"loss": 3.8173,
"step": 1280
},
{
"epoch": 0.0952029520295203,
"grad_norm": 0.7884580492973328,
"learning_rate": 9.52029520295203e-05,
"loss": 3.6719,
"step": 1290
},
{
"epoch": 0.0959409594095941,
"grad_norm": 0.9069279432296753,
"learning_rate": 9.59409594095941e-05,
"loss": 3.6447,
"step": 1300
},
{
"epoch": 0.09667896678966789,
"grad_norm": 0.8386545181274414,
"learning_rate": 9.66789667896679e-05,
"loss": 3.6681,
"step": 1310
},
{
"epoch": 0.0974169741697417,
"grad_norm": 0.8082497119903564,
"learning_rate": 9.74169741697417e-05,
"loss": 3.6526,
"step": 1320
},
{
"epoch": 0.0981549815498155,
"grad_norm": 0.7619675993919373,
"learning_rate": 9.81549815498155e-05,
"loss": 3.6717,
"step": 1330
},
{
"epoch": 0.0988929889298893,
"grad_norm": 0.803425133228302,
"learning_rate": 9.889298892988931e-05,
"loss": 3.5892,
"step": 1340
},
{
"epoch": 0.0996309963099631,
"grad_norm": 0.8372170925140381,
"learning_rate": 9.963099630996311e-05,
"loss": 3.6615,
"step": 1350
},
{
"epoch": 0.1003690036900369,
"grad_norm": 0.8343318700790405,
"learning_rate": 9.999995852216369e-05,
"loss": 3.5785,
"step": 1360
},
{
"epoch": 0.1011070110701107,
"grad_norm": 0.8367707133293152,
"learning_rate": 9.999962669988607e-05,
"loss": 3.625,
"step": 1370
},
{
"epoch": 0.1018450184501845,
"grad_norm": 0.8662716150283813,
"learning_rate": 9.999896305753297e-05,
"loss": 3.6656,
"step": 1380
},
{
"epoch": 0.1025830258302583,
"grad_norm": 0.747052788734436,
"learning_rate": 9.999796759950864e-05,
"loss": 3.5761,
"step": 1390
},
{
"epoch": 0.1033210332103321,
"grad_norm": 0.7763943672180176,
"learning_rate": 9.999664033241933e-05,
"loss": 3.5234,
"step": 1400
},
{
"epoch": 0.10405904059040591,
"grad_norm": 0.7435528039932251,
"learning_rate": 9.99949812650734e-05,
"loss": 3.5132,
"step": 1410
},
{
"epoch": 0.1047970479704797,
"grad_norm": 0.8303211331367493,
"learning_rate": 9.999299040848121e-05,
"loss": 3.5173,
"step": 1420
},
{
"epoch": 0.1055350553505535,
"grad_norm": 0.8359752297401428,
"learning_rate": 9.999066777585495e-05,
"loss": 3.5605,
"step": 1430
},
{
"epoch": 0.10627306273062731,
"grad_norm": 0.909545361995697,
"learning_rate": 9.998801338260865e-05,
"loss": 3.5839,
"step": 1440
},
{
"epoch": 0.1070110701107011,
"grad_norm": 0.845916748046875,
"learning_rate": 9.99850272463581e-05,
"loss": 3.5685,
"step": 1450
},
{
"epoch": 0.1077490774907749,
"grad_norm": 0.834235429763794,
"learning_rate": 9.99817093869206e-05,
"loss": 3.5476,
"step": 1460
},
{
"epoch": 0.10848708487084871,
"grad_norm": 0.7273171544075012,
"learning_rate": 9.997805982631499e-05,
"loss": 3.4777,
"step": 1470
},
{
"epoch": 0.10922509225092251,
"grad_norm": 0.839796245098114,
"learning_rate": 9.99740785887614e-05,
"loss": 3.5084,
"step": 1480
},
{
"epoch": 0.1099630996309963,
"grad_norm": 0.7638348340988159,
"learning_rate": 9.99697657006811e-05,
"loss": 3.5741,
"step": 1490
},
{
"epoch": 0.11070110701107011,
"grad_norm": 0.7195069193840027,
"learning_rate": 9.996512119069636e-05,
"loss": 3.5083,
"step": 1500
},
{
"epoch": 0.11143911439114391,
"grad_norm": 0.7351711392402649,
"learning_rate": 9.996014508963028e-05,
"loss": 3.365,
"step": 1510
},
{
"epoch": 0.1121771217712177,
"grad_norm": 0.7192705869674683,
"learning_rate": 9.995483743050648e-05,
"loss": 3.5233,
"step": 1520
},
{
"epoch": 0.11291512915129151,
"grad_norm": 0.7362285256385803,
"learning_rate": 9.994919824854898e-05,
"loss": 3.5548,
"step": 1530
},
{
"epoch": 0.11365313653136531,
"grad_norm": 0.6908057928085327,
"learning_rate": 9.994322758118196e-05,
"loss": 3.4293,
"step": 1540
},
{
"epoch": 0.11439114391143912,
"grad_norm": 0.7892534136772156,
"learning_rate": 9.993692546802941e-05,
"loss": 3.4583,
"step": 1550
},
{
"epoch": 0.11512915129151291,
"grad_norm": 0.7085639834403992,
"learning_rate": 9.993029195091505e-05,
"loss": 3.4349,
"step": 1560
},
{
"epoch": 0.11586715867158671,
"grad_norm": 0.7825974225997925,
"learning_rate": 9.992332707386188e-05,
"loss": 3.4496,
"step": 1570
},
{
"epoch": 0.11660516605166052,
"grad_norm": 0.7284643054008484,
"learning_rate": 9.991603088309194e-05,
"loss": 3.517,
"step": 1580
},
{
"epoch": 0.11734317343173432,
"grad_norm": 0.7682483792304993,
"learning_rate": 9.990840342702606e-05,
"loss": 3.4505,
"step": 1590
},
{
"epoch": 0.11808118081180811,
"grad_norm": 0.8391796350479126,
"learning_rate": 9.990044475628347e-05,
"loss": 3.5077,
"step": 1600
},
{
"epoch": 0.11881918819188192,
"grad_norm": 0.7043576836585999,
"learning_rate": 9.989215492368151e-05,
"loss": 3.4272,
"step": 1610
},
{
"epoch": 0.11955719557195572,
"grad_norm": 0.72553551197052,
"learning_rate": 9.988353398423527e-05,
"loss": 3.3559,
"step": 1620
},
{
"epoch": 0.12029520295202951,
"grad_norm": 0.7156850099563599,
"learning_rate": 9.987458199515713e-05,
"loss": 3.4108,
"step": 1630
},
{
"epoch": 0.12103321033210332,
"grad_norm": 0.6410751342773438,
"learning_rate": 9.98652990158566e-05,
"loss": 3.4688,
"step": 1640
},
{
"epoch": 0.12177121771217712,
"grad_norm": 0.8124927282333374,
"learning_rate": 9.985568510793967e-05,
"loss": 3.4611,
"step": 1650
},
{
"epoch": 0.12250922509225093,
"grad_norm": 0.7403334379196167,
"learning_rate": 9.984574033520857e-05,
"loss": 3.4669,
"step": 1660
},
{
"epoch": 0.12324723247232472,
"grad_norm": 0.662948727607727,
"learning_rate": 9.983546476366132e-05,
"loss": 3.4798,
"step": 1670
},
{
"epoch": 0.12398523985239852,
"grad_norm": 0.6987183690071106,
"learning_rate": 9.982485846149125e-05,
"loss": 3.3932,
"step": 1680
},
{
"epoch": 0.12472324723247233,
"grad_norm": 0.650486171245575,
"learning_rate": 9.981392149908652e-05,
"loss": 3.3856,
"step": 1690
},
{
"epoch": 0.12546125461254612,
"grad_norm": 0.6416191458702087,
"learning_rate": 9.98026539490298e-05,
"loss": 3.455,
"step": 1700
},
{
"epoch": 0.12619926199261994,
"grad_norm": 0.6319407820701599,
"learning_rate": 9.979105588609762e-05,
"loss": 3.4001,
"step": 1710
},
{
"epoch": 0.12693726937269373,
"grad_norm": 0.6667493581771851,
"learning_rate": 9.977912738725994e-05,
"loss": 3.4277,
"step": 1720
},
{
"epoch": 0.12767527675276752,
"grad_norm": 0.6686265468597412,
"learning_rate": 9.976686853167967e-05,
"loss": 3.4075,
"step": 1730
},
{
"epoch": 0.12841328413284134,
"grad_norm": 0.731555700302124,
"learning_rate": 9.975427940071211e-05,
"loss": 3.4226,
"step": 1740
},
{
"epoch": 0.12915129151291513,
"grad_norm": 0.6553905606269836,
"learning_rate": 9.97413600779044e-05,
"loss": 3.4306,
"step": 1750
},
{
"epoch": 0.12988929889298892,
"grad_norm": 0.7509811520576477,
"learning_rate": 9.9728110648995e-05,
"loss": 3.3937,
"step": 1760
},
{
"epoch": 0.13062730627306274,
"grad_norm": 0.7052728533744812,
"learning_rate": 9.971453120191309e-05,
"loss": 3.3822,
"step": 1770
},
{
"epoch": 0.13136531365313653,
"grad_norm": 0.6742541790008545,
"learning_rate": 9.970062182677801e-05,
"loss": 3.3824,
"step": 1780
},
{
"epoch": 0.13210332103321032,
"grad_norm": 0.6257262825965881,
"learning_rate": 9.968638261589866e-05,
"loss": 3.4047,
"step": 1790
},
{
"epoch": 0.13284132841328414,
"grad_norm": 0.6546107530593872,
"learning_rate": 9.967181366377285e-05,
"loss": 3.3903,
"step": 1800
},
{
"epoch": 0.13357933579335793,
"grad_norm": 0.8019782304763794,
"learning_rate": 9.965691506708672e-05,
"loss": 3.3911,
"step": 1810
},
{
"epoch": 0.13431734317343175,
"grad_norm": 0.6207643151283264,
"learning_rate": 9.964168692471408e-05,
"loss": 3.3861,
"step": 1820
},
{
"epoch": 0.13505535055350554,
"grad_norm": 0.6750718355178833,
"learning_rate": 9.962612933771576e-05,
"loss": 3.4424,
"step": 1830
},
{
"epoch": 0.13579335793357933,
"grad_norm": 0.9330940246582031,
"learning_rate": 9.961024240933892e-05,
"loss": 3.3459,
"step": 1840
},
{
"epoch": 0.13653136531365315,
"grad_norm": 0.7058202028274536,
"learning_rate": 9.959402624501636e-05,
"loss": 3.3327,
"step": 1850
},
{
"epoch": 0.13726937269372694,
"grad_norm": 0.779712438583374,
"learning_rate": 9.957748095236589e-05,
"loss": 3.4398,
"step": 1860
},
{
"epoch": 0.13800738007380073,
"grad_norm": 0.663960337638855,
"learning_rate": 9.956060664118951e-05,
"loss": 3.3513,
"step": 1870
},
{
"epoch": 0.13874538745387455,
"grad_norm": 0.756618082523346,
"learning_rate": 9.954340342347279e-05,
"loss": 3.304,
"step": 1880
},
{
"epoch": 0.13948339483394834,
"grad_norm": 0.7523687481880188,
"learning_rate": 9.952587141338403e-05,
"loss": 3.3155,
"step": 1890
},
{
"epoch": 0.14022140221402213,
"grad_norm": 0.6524930596351624,
"learning_rate": 9.950801072727356e-05,
"loss": 3.3803,
"step": 1900
},
{
"epoch": 0.14095940959409595,
"grad_norm": 0.7161090970039368,
"learning_rate": 9.948982148367292e-05,
"loss": 3.4219,
"step": 1910
},
{
"epoch": 0.14169741697416974,
"grad_norm": 0.7181054949760437,
"learning_rate": 9.947130380329418e-05,
"loss": 3.301,
"step": 1920
},
{
"epoch": 0.14243542435424356,
"grad_norm": 0.6185216903686523,
"learning_rate": 9.945245780902899e-05,
"loss": 3.3666,
"step": 1930
},
{
"epoch": 0.14317343173431735,
"grad_norm": 0.6279731392860413,
"learning_rate": 9.943328362594788e-05,
"loss": 3.2862,
"step": 1940
},
{
"epoch": 0.14391143911439114,
"grad_norm": 0.6401661038398743,
"learning_rate": 9.941378138129938e-05,
"loss": 3.3112,
"step": 1950
},
{
"epoch": 0.14464944649446496,
"grad_norm": 0.6105781197547913,
"learning_rate": 9.939395120450916e-05,
"loss": 3.3539,
"step": 1960
},
{
"epoch": 0.14538745387453875,
"grad_norm": 0.6660001873970032,
"learning_rate": 9.937379322717924e-05,
"loss": 3.3722,
"step": 1970
},
{
"epoch": 0.14612546125461254,
"grad_norm": 0.6415931582450867,
"learning_rate": 9.935330758308705e-05,
"loss": 3.3329,
"step": 1980
},
{
"epoch": 0.14686346863468636,
"grad_norm": 0.6147580742835999,
"learning_rate": 9.933249440818455e-05,
"loss": 3.2807,
"step": 1990
},
{
"epoch": 0.14760147601476015,
"grad_norm": 0.694519579410553,
"learning_rate": 9.931135384059736e-05,
"loss": 3.2662,
"step": 2000
},
{
"epoch": 0.14833948339483394,
"grad_norm": 0.6452217102050781,
"learning_rate": 9.928988602062384e-05,
"loss": 3.2942,
"step": 2010
},
{
"epoch": 0.14907749077490776,
"grad_norm": 0.6983804106712341,
"learning_rate": 9.926809109073412e-05,
"loss": 3.2639,
"step": 2020
},
{
"epoch": 0.14981549815498155,
"grad_norm": 0.6302483677864075,
"learning_rate": 9.924596919556917e-05,
"loss": 3.3648,
"step": 2030
},
{
"epoch": 0.15055350553505534,
"grad_norm": 0.6506009697914124,
"learning_rate": 9.922352048193986e-05,
"loss": 3.3417,
"step": 2040
},
{
"epoch": 0.15129151291512916,
"grad_norm": 0.6232055425643921,
"learning_rate": 9.920074509882602e-05,
"loss": 3.3304,
"step": 2050
},
{
"epoch": 0.15202952029520295,
"grad_norm": 0.6454508900642395,
"learning_rate": 9.917764319737533e-05,
"loss": 3.2585,
"step": 2060
},
{
"epoch": 0.15276752767527677,
"grad_norm": 0.6281662583351135,
"learning_rate": 9.915421493090243e-05,
"loss": 3.2753,
"step": 2070
},
{
"epoch": 0.15350553505535056,
"grad_norm": 0.7222394943237305,
"learning_rate": 9.913046045488786e-05,
"loss": 3.2683,
"step": 2080
},
{
"epoch": 0.15424354243542435,
"grad_norm": 0.6333222389221191,
"learning_rate": 9.910637992697707e-05,
"loss": 3.2676,
"step": 2090
},
{
"epoch": 0.15498154981549817,
"grad_norm": 0.6758008003234863,
"learning_rate": 9.908197350697926e-05,
"loss": 3.2941,
"step": 2100
},
{
"epoch": 0.15571955719557196,
"grad_norm": 0.5930529832839966,
"learning_rate": 9.905724135686648e-05,
"loss": 3.3365,
"step": 2110
},
{
"epoch": 0.15645756457564575,
"grad_norm": 0.7024756669998169,
"learning_rate": 9.903218364077243e-05,
"loss": 3.2594,
"step": 2120
},
{
"epoch": 0.15719557195571957,
"grad_norm": 0.6018502712249756,
"learning_rate": 9.900680052499138e-05,
"loss": 3.3316,
"step": 2130
},
{
"epoch": 0.15793357933579336,
"grad_norm": 0.6856579184532166,
"learning_rate": 9.898109217797717e-05,
"loss": 3.3196,
"step": 2140
},
{
"epoch": 0.15867158671586715,
"grad_norm": 0.6864190101623535,
"learning_rate": 9.895505877034198e-05,
"loss": 3.3116,
"step": 2150
},
{
"epoch": 0.15940959409594097,
"grad_norm": 0.57015061378479,
"learning_rate": 9.892870047485526e-05,
"loss": 3.3119,
"step": 2160
},
{
"epoch": 0.16014760147601476,
"grad_norm": 0.5812332630157471,
"learning_rate": 9.89020174664425e-05,
"loss": 3.2727,
"step": 2170
},
{
"epoch": 0.16088560885608857,
"grad_norm": 0.6356363296508789,
"learning_rate": 9.887500992218421e-05,
"loss": 3.3661,
"step": 2180
},
{
"epoch": 0.16162361623616237,
"grad_norm": 0.672024130821228,
"learning_rate": 9.884767802131465e-05,
"loss": 3.3215,
"step": 2190
},
{
"epoch": 0.16236162361623616,
"grad_norm": 0.6531562805175781,
"learning_rate": 9.882002194522064e-05,
"loss": 3.2374,
"step": 2200
},
{
"epoch": 0.16309963099630997,
"grad_norm": 0.6039624214172363,
"learning_rate": 9.879204187744036e-05,
"loss": 3.2342,
"step": 2210
},
{
"epoch": 0.16383763837638377,
"grad_norm": 0.5702035427093506,
"learning_rate": 9.876373800366215e-05,
"loss": 3.3181,
"step": 2220
},
{
"epoch": 0.16457564575645756,
"grad_norm": 0.6860033273696899,
"learning_rate": 9.87351105117233e-05,
"loss": 3.3758,
"step": 2230
},
{
"epoch": 0.16531365313653137,
"grad_norm": 0.6462620496749878,
"learning_rate": 9.870615959160875e-05,
"loss": 3.3542,
"step": 2240
},
{
"epoch": 0.16605166051660517,
"grad_norm": 0.6575970649719238,
"learning_rate": 9.867688543544988e-05,
"loss": 3.2135,
"step": 2250
},
{
"epoch": 0.16678966789667896,
"grad_norm": 0.6185761094093323,
"learning_rate": 9.86472882375232e-05,
"loss": 3.294,
"step": 2260
},
{
"epoch": 0.16752767527675277,
"grad_norm": 0.6141475439071655,
"learning_rate": 9.861736819424902e-05,
"loss": 3.1992,
"step": 2270
},
{
"epoch": 0.16826568265682657,
"grad_norm": 0.6172120571136475,
"learning_rate": 9.85871255041903e-05,
"loss": 3.2167,
"step": 2280
},
{
"epoch": 0.16900369003690036,
"grad_norm": 0.5904815196990967,
"learning_rate": 9.855656036805114e-05,
"loss": 3.2945,
"step": 2290
},
{
"epoch": 0.16974169741697417,
"grad_norm": 0.6383630633354187,
"learning_rate": 9.852567298867557e-05,
"loss": 3.2865,
"step": 2300
},
{
"epoch": 0.17047970479704797,
"grad_norm": 0.60262531042099,
"learning_rate": 9.84944635710462e-05,
"loss": 3.2188,
"step": 2310
},
{
"epoch": 0.17121771217712178,
"grad_norm": 0.5909958481788635,
"learning_rate": 9.846293232228274e-05,
"loss": 3.2896,
"step": 2320
},
{
"epoch": 0.17195571955719557,
"grad_norm": 0.5554500818252563,
"learning_rate": 9.843107945164086e-05,
"loss": 3.1705,
"step": 2330
},
{
"epoch": 0.17269372693726937,
"grad_norm": 0.620606005191803,
"learning_rate": 9.83989051705105e-05,
"loss": 3.2288,
"step": 2340
},
{
"epoch": 0.17343173431734318,
"grad_norm": 0.6841108202934265,
"learning_rate": 9.836640969241475e-05,
"loss": 3.2441,
"step": 2350
},
{
"epoch": 0.17416974169741697,
"grad_norm": 0.6839698553085327,
"learning_rate": 9.833359323300826e-05,
"loss": 3.2246,
"step": 2360
},
{
"epoch": 0.17490774907749077,
"grad_norm": 0.7128744721412659,
"learning_rate": 9.830045601007584e-05,
"loss": 3.2008,
"step": 2370
},
{
"epoch": 0.17564575645756458,
"grad_norm": 0.65251624584198,
"learning_rate": 9.826699824353106e-05,
"loss": 3.3275,
"step": 2380
},
{
"epoch": 0.17638376383763837,
"grad_norm": 0.5380867123603821,
"learning_rate": 9.823322015541474e-05,
"loss": 3.2064,
"step": 2390
},
{
"epoch": 0.17712177121771217,
"grad_norm": 0.5963719487190247,
"learning_rate": 9.819912196989351e-05,
"loss": 3.1643,
"step": 2400
},
{
"epoch": 0.17785977859778598,
"grad_norm": 0.8703069090843201,
"learning_rate": 9.816470391325832e-05,
"loss": 3.1848,
"step": 2410
},
{
"epoch": 0.17859778597785977,
"grad_norm": 0.608935534954071,
"learning_rate": 9.81299662139229e-05,
"loss": 3.2719,
"step": 2420
},
{
"epoch": 0.1793357933579336,
"grad_norm": 0.6425730586051941,
"learning_rate": 9.809490910242229e-05,
"loss": 3.2619,
"step": 2430
},
{
"epoch": 0.18007380073800738,
"grad_norm": 0.5790001749992371,
"learning_rate": 9.805953281141131e-05,
"loss": 3.243,
"step": 2440
},
{
"epoch": 0.18081180811808117,
"grad_norm": 0.6436141133308411,
"learning_rate": 9.802383757566301e-05,
"loss": 3.2284,
"step": 2450
},
{
"epoch": 0.181549815498155,
"grad_norm": 0.5458927154541016,
"learning_rate": 9.798782363206702e-05,
"loss": 3.2043,
"step": 2460
},
{
"epoch": 0.18228782287822878,
"grad_norm": 0.6296219229698181,
"learning_rate": 9.795149121962815e-05,
"loss": 3.2683,
"step": 2470
},
{
"epoch": 0.18302583025830257,
"grad_norm": 0.6964813470840454,
"learning_rate": 9.791484057946465e-05,
"loss": 3.1977,
"step": 2480
},
{
"epoch": 0.1837638376383764,
"grad_norm": 0.5911018252372742,
"learning_rate": 9.787787195480672e-05,
"loss": 3.2263,
"step": 2490
},
{
"epoch": 0.18450184501845018,
"grad_norm": 0.5431626439094543,
"learning_rate": 9.784058559099483e-05,
"loss": 3.1628,
"step": 2500
},
{
"epoch": 0.18523985239852397,
"grad_norm": 0.6068975329399109,
"learning_rate": 9.78029817354781e-05,
"loss": 3.1828,
"step": 2510
},
{
"epoch": 0.1859778597785978,
"grad_norm": 0.580287516117096,
"learning_rate": 9.776506063781269e-05,
"loss": 3.2248,
"step": 2520
},
{
"epoch": 0.18671586715867158,
"grad_norm": 0.6136944890022278,
"learning_rate": 9.772682254966008e-05,
"loss": 3.2495,
"step": 2530
},
{
"epoch": 0.18745387453874537,
"grad_norm": 0.6076098680496216,
"learning_rate": 9.76882677247855e-05,
"loss": 3.1979,
"step": 2540
},
{
"epoch": 0.1881918819188192,
"grad_norm": 0.5682818293571472,
"learning_rate": 9.764939641905615e-05,
"loss": 3.1714,
"step": 2550
},
{
"epoch": 0.18892988929889298,
"grad_norm": 0.5991480350494385,
"learning_rate": 9.761020889043954e-05,
"loss": 3.154,
"step": 2560
},
{
"epoch": 0.1896678966789668,
"grad_norm": 0.6232896447181702,
"learning_rate": 9.75707053990018e-05,
"loss": 3.2036,
"step": 2570
},
{
"epoch": 0.1904059040590406,
"grad_norm": 0.5560643672943115,
"learning_rate": 9.75308862069059e-05,
"loss": 3.2392,
"step": 2580
},
{
"epoch": 0.19114391143911438,
"grad_norm": 0.5718569755554199,
"learning_rate": 9.749075157840996e-05,
"loss": 3.2528,
"step": 2590
},
{
"epoch": 0.1918819188191882,
"grad_norm": 0.5662999749183655,
"learning_rate": 9.74503017798655e-05,
"loss": 3.2256,
"step": 2600
},
{
"epoch": 0.192619926199262,
"grad_norm": 0.6026265621185303,
"learning_rate": 9.74095370797156e-05,
"loss": 3.2183,
"step": 2610
},
{
"epoch": 0.19335793357933578,
"grad_norm": 0.6032066941261292,
"learning_rate": 9.736845774849321e-05,
"loss": 3.2418,
"step": 2620
},
{
"epoch": 0.1940959409594096,
"grad_norm": 0.5830618143081665,
"learning_rate": 9.732706405881931e-05,
"loss": 3.191,
"step": 2630
},
{
"epoch": 0.1948339483394834,
"grad_norm": 0.5695509314537048,
"learning_rate": 9.728535628540109e-05,
"loss": 3.1968,
"step": 2640
},
{
"epoch": 0.19557195571955718,
"grad_norm": 0.5905478000640869,
"learning_rate": 9.724333470503013e-05,
"loss": 3.2596,
"step": 2650
},
{
"epoch": 0.196309963099631,
"grad_norm": 0.5251249670982361,
"learning_rate": 9.720099959658062e-05,
"loss": 3.1729,
"step": 2660
},
{
"epoch": 0.1970479704797048,
"grad_norm": 0.6502349972724915,
"learning_rate": 9.715835124100742e-05,
"loss": 3.2604,
"step": 2670
},
{
"epoch": 0.1977859778597786,
"grad_norm": 0.6250560283660889,
"learning_rate": 9.711538992134426e-05,
"loss": 3.2194,
"step": 2680
},
{
"epoch": 0.1985239852398524,
"grad_norm": 0.5793785452842712,
"learning_rate": 9.707211592270183e-05,
"loss": 3.1994,
"step": 2690
},
{
"epoch": 0.1992619926199262,
"grad_norm": 0.6495150327682495,
"learning_rate": 9.70285295322659e-05,
"loss": 3.1919,
"step": 2700
},
{
"epoch": 0.2,
"grad_norm": 0.5875915288925171,
"learning_rate": 9.698463103929542e-05,
"loss": 3.2464,
"step": 2710
},
{
"epoch": 0.2007380073800738,
"grad_norm": 0.5518725514411926,
"learning_rate": 9.69404207351206e-05,
"loss": 3.2042,
"step": 2720
},
{
"epoch": 0.2014760147601476,
"grad_norm": 0.5390283465385437,
"learning_rate": 9.689589891314094e-05,
"loss": 3.2012,
"step": 2730
},
{
"epoch": 0.2022140221402214,
"grad_norm": 0.5596645474433899,
"learning_rate": 9.685106586882336e-05,
"loss": 3.2053,
"step": 2740
},
{
"epoch": 0.2029520295202952,
"grad_norm": 0.5377479195594788,
"learning_rate": 9.680592189970015e-05,
"loss": 3.177,
"step": 2750
},
{
"epoch": 0.203690036900369,
"grad_norm": 0.5858853459358215,
"learning_rate": 9.676046730536704e-05,
"loss": 3.2039,
"step": 2760
},
{
"epoch": 0.2044280442804428,
"grad_norm": 0.5771840810775757,
"learning_rate": 9.671470238748124e-05,
"loss": 3.1654,
"step": 2770
},
{
"epoch": 0.2051660516605166,
"grad_norm": 0.5626157522201538,
"learning_rate": 9.666862744975938e-05,
"loss": 3.1978,
"step": 2780
},
{
"epoch": 0.2059040590405904,
"grad_norm": 0.5536968111991882,
"learning_rate": 9.662224279797552e-05,
"loss": 3.2152,
"step": 2790
},
{
"epoch": 0.2066420664206642,
"grad_norm": 0.5982388854026794,
"learning_rate": 9.657554873995913e-05,
"loss": 3.1699,
"step": 2800
},
{
"epoch": 0.207380073800738,
"grad_norm": 0.5761833190917969,
"learning_rate": 9.652854558559308e-05,
"loss": 3.1766,
"step": 2810
},
{
"epoch": 0.20811808118081182,
"grad_norm": 0.5907506346702576,
"learning_rate": 9.648123364681145e-05,
"loss": 3.0935,
"step": 2820
},
{
"epoch": 0.2088560885608856,
"grad_norm": 0.5584788918495178,
"learning_rate": 9.643361323759763e-05,
"loss": 3.1111,
"step": 2830
},
{
"epoch": 0.2095940959409594,
"grad_norm": 0.5568063855171204,
"learning_rate": 9.638568467398215e-05,
"loss": 3.1739,
"step": 2840
},
{
"epoch": 0.21033210332103322,
"grad_norm": 0.5453604459762573,
"learning_rate": 9.633744827404055e-05,
"loss": 3.2064,
"step": 2850
},
{
"epoch": 0.211070110701107,
"grad_norm": 0.6171849966049194,
"learning_rate": 9.628890435789135e-05,
"loss": 3.2281,
"step": 2860
},
{
"epoch": 0.2118081180811808,
"grad_norm": 0.5285280346870422,
"learning_rate": 9.624005324769388e-05,
"loss": 3.113,
"step": 2870
},
{
"epoch": 0.21254612546125462,
"grad_norm": 0.5632630586624146,
"learning_rate": 9.619089526764614e-05,
"loss": 3.1592,
"step": 2880
},
{
"epoch": 0.2132841328413284,
"grad_norm": 0.6024160385131836,
"learning_rate": 9.614143074398264e-05,
"loss": 3.1904,
"step": 2890
},
{
"epoch": 0.2140221402214022,
"grad_norm": 0.5437342524528503,
"learning_rate": 9.609166000497229e-05,
"loss": 3.1156,
"step": 2900
},
{
"epoch": 0.21476014760147602,
"grad_norm": 0.5884766578674316,
"learning_rate": 9.604158338091615e-05,
"loss": 3.1888,
"step": 2910
},
{
"epoch": 0.2154981549815498,
"grad_norm": 0.547242283821106,
"learning_rate": 9.599120120414531e-05,
"loss": 3.1079,
"step": 2920
},
{
"epoch": 0.21623616236162363,
"grad_norm": 0.5443885326385498,
"learning_rate": 9.594051380901859e-05,
"loss": 3.1147,
"step": 2930
},
{
"epoch": 0.21697416974169742,
"grad_norm": 0.5350677371025085,
"learning_rate": 9.588952153192041e-05,
"loss": 3.1061,
"step": 2940
},
{
"epoch": 0.2177121771217712,
"grad_norm": 0.5434796214103699,
"learning_rate": 9.583822471125854e-05,
"loss": 3.1172,
"step": 2950
},
{
"epoch": 0.21845018450184503,
"grad_norm": 0.5185326933860779,
"learning_rate": 9.578662368746182e-05,
"loss": 3.2186,
"step": 2960
},
{
"epoch": 0.21918819188191882,
"grad_norm": 0.5394032001495361,
"learning_rate": 9.57347188029779e-05,
"loss": 3.1628,
"step": 2970
},
{
"epoch": 0.2199261992619926,
"grad_norm": 0.5857832431793213,
"learning_rate": 9.568251040227101e-05,
"loss": 3.1291,
"step": 2980
},
{
"epoch": 0.22066420664206643,
"grad_norm": 0.6189760565757751,
"learning_rate": 9.562999883181967e-05,
"loss": 3.1305,
"step": 2990
},
{
"epoch": 0.22140221402214022,
"grad_norm": 0.5518510937690735,
"learning_rate": 9.557718444011431e-05,
"loss": 3.2148,
"step": 3000
},
{
"epoch": 0.222140221402214,
"grad_norm": 0.5947515964508057,
"learning_rate": 9.552406757765509e-05,
"loss": 3.1322,
"step": 3010
},
{
"epoch": 0.22287822878228783,
"grad_norm": 0.5554746985435486,
"learning_rate": 9.547064859694943e-05,
"loss": 3.1822,
"step": 3020
},
{
"epoch": 0.22361623616236162,
"grad_norm": 0.5308244824409485,
"learning_rate": 9.541692785250981e-05,
"loss": 3.1371,
"step": 3030
},
{
"epoch": 0.2243542435424354,
"grad_norm": 0.5285702347755432,
"learning_rate": 9.536290570085131e-05,
"loss": 3.1329,
"step": 3040
},
{
"epoch": 0.22509225092250923,
"grad_norm": 0.5468854904174805,
"learning_rate": 9.530858250048932e-05,
"loss": 3.2538,
"step": 3050
},
{
"epoch": 0.22583025830258302,
"grad_norm": 0.5449059009552002,
"learning_rate": 9.525395861193707e-05,
"loss": 3.2139,
"step": 3060
},
{
"epoch": 0.22656826568265684,
"grad_norm": 0.5692685842514038,
"learning_rate": 9.519903439770332e-05,
"loss": 3.1138,
"step": 3070
},
{
"epoch": 0.22730627306273063,
"grad_norm": 0.5263866782188416,
"learning_rate": 9.514381022228997e-05,
"loss": 3.0872,
"step": 3080
},
{
"epoch": 0.22804428044280442,
"grad_norm": 0.5696788430213928,
"learning_rate": 9.50882864521895e-05,
"loss": 3.167,
"step": 3090
},
{
"epoch": 0.22878228782287824,
"grad_norm": 0.5760169625282288,
"learning_rate": 9.503246345588274e-05,
"loss": 3.15,
"step": 3100
},
{
"epoch": 0.22952029520295203,
"grad_norm": 0.5390339493751526,
"learning_rate": 9.497634160383626e-05,
"loss": 3.1367,
"step": 3110
},
{
"epoch": 0.23025830258302582,
"grad_norm": 0.5490269660949707,
"learning_rate": 9.491992126849997e-05,
"loss": 3.1779,
"step": 3120
},
{
"epoch": 0.23099630996309964,
"grad_norm": 0.5177121758460999,
"learning_rate": 9.486320282430468e-05,
"loss": 3.0789,
"step": 3130
},
{
"epoch": 0.23173431734317343,
"grad_norm": 0.5448027849197388,
"learning_rate": 9.480618664765955e-05,
"loss": 3.1866,
"step": 3140
},
{
"epoch": 0.23247232472324722,
"grad_norm": 0.5371176600456238,
"learning_rate": 9.474887311694968e-05,
"loss": 3.2089,
"step": 3150
},
{
"epoch": 0.23321033210332104,
"grad_norm": 0.6013469099998474,
"learning_rate": 9.469126261253348e-05,
"loss": 3.1159,
"step": 3160
},
{
"epoch": 0.23394833948339483,
"grad_norm": 0.5597007274627686,
"learning_rate": 9.463335551674025e-05,
"loss": 3.124,
"step": 3170
},
{
"epoch": 0.23468634686346865,
"grad_norm": 0.5460641384124756,
"learning_rate": 9.45751522138676e-05,
"loss": 3.103,
"step": 3180
},
{
"epoch": 0.23542435424354244,
"grad_norm": 0.5389031767845154,
"learning_rate": 9.45166530901789e-05,
"loss": 3.1502,
"step": 3190
},
{
"epoch": 0.23616236162361623,
"grad_norm": 0.5293789505958557,
"learning_rate": 9.445785853390073e-05,
"loss": 3.0856,
"step": 3200
},
{
"epoch": 0.23690036900369005,
"grad_norm": 0.677259087562561,
"learning_rate": 9.439876893522028e-05,
"loss": 3.1143,
"step": 3210
},
{
"epoch": 0.23763837638376384,
"grad_norm": 0.5259451866149902,
"learning_rate": 9.433938468628277e-05,
"loss": 3.1628,
"step": 3220
},
{
"epoch": 0.23837638376383763,
"grad_norm": 0.5321341156959534,
"learning_rate": 9.427970618118888e-05,
"loss": 3.1164,
"step": 3230
},
{
"epoch": 0.23911439114391145,
"grad_norm": 0.5752614140510559,
"learning_rate": 9.421973381599208e-05,
"loss": 3.0361,
"step": 3240
},
{
"epoch": 0.23985239852398524,
"grad_norm": 0.5552977323532104,
"learning_rate": 9.415946798869602e-05,
"loss": 3.1452,
"step": 3250
},
{
"epoch": 0.24059040590405903,
"grad_norm": 0.5862517952919006,
"learning_rate": 9.409890909925193e-05,
"loss": 3.1493,
"step": 3260
},
{
"epoch": 0.24132841328413285,
"grad_norm": 0.5374996066093445,
"learning_rate": 9.40380575495559e-05,
"loss": 3.1315,
"step": 3270
},
{
"epoch": 0.24206642066420664,
"grad_norm": 0.5315213203430176,
"learning_rate": 9.39769137434463e-05,
"loss": 3.1218,
"step": 3280
},
{
"epoch": 0.24280442804428043,
"grad_norm": 0.5306174159049988,
"learning_rate": 9.391547808670096e-05,
"loss": 3.0916,
"step": 3290
},
{
"epoch": 0.24354243542435425,
"grad_norm": 0.5105913281440735,
"learning_rate": 9.385375098703465e-05,
"loss": 3.0469,
"step": 3300
},
{
"epoch": 0.24428044280442804,
"grad_norm": 0.5171898603439331,
"learning_rate": 9.379173285409621e-05,
"loss": 3.068,
"step": 3310
},
{
"epoch": 0.24501845018450186,
"grad_norm": 0.5028154253959656,
"learning_rate": 9.372942409946596e-05,
"loss": 3.1542,
"step": 3320
},
{
"epoch": 0.24575645756457565,
"grad_norm": 0.5281797647476196,
"learning_rate": 9.366682513665293e-05,
"loss": 3.1484,
"step": 3330
},
{
"epoch": 0.24649446494464944,
"grad_norm": 0.5240592956542969,
"learning_rate": 9.360393638109201e-05,
"loss": 3.103,
"step": 3340
},
{
"epoch": 0.24723247232472326,
"grad_norm": 0.5516790747642517,
"learning_rate": 9.354075825014139e-05,
"loss": 3.0701,
"step": 3350
},
{
"epoch": 0.24797047970479705,
"grad_norm": 0.6081251502037048,
"learning_rate": 9.347729116307964e-05,
"loss": 3.1434,
"step": 3360
},
{
"epoch": 0.24870848708487084,
"grad_norm": 0.5216418504714966,
"learning_rate": 9.341353554110297e-05,
"loss": 3.1567,
"step": 3370
},
{
"epoch": 0.24944649446494466,
"grad_norm": 0.5264909863471985,
"learning_rate": 9.334949180732245e-05,
"loss": 3.162,
"step": 3380
},
{
"epoch": 0.25018450184501845,
"grad_norm": 0.4942391812801361,
"learning_rate": 9.328516038676119e-05,
"loss": 3.1532,
"step": 3390
},
{
"epoch": 0.25092250922509224,
"grad_norm": 0.5401615500450134,
"learning_rate": 9.322054170635149e-05,
"loss": 3.1,
"step": 3400
},
{
"epoch": 0.25166051660516603,
"grad_norm": 0.5021462440490723,
"learning_rate": 9.315563619493209e-05,
"loss": 3.0438,
"step": 3410
},
{
"epoch": 0.2523985239852399,
"grad_norm": 0.5627569556236267,
"learning_rate": 9.309044428324522e-05,
"loss": 3.2005,
"step": 3420
},
{
"epoch": 0.25313653136531367,
"grad_norm": 0.514385461807251,
"learning_rate": 9.302496640393382e-05,
"loss": 3.1035,
"step": 3430
},
{
"epoch": 0.25387453874538746,
"grad_norm": 0.5261507630348206,
"learning_rate": 9.295920299153863e-05,
"loss": 3.1706,
"step": 3440
},
{
"epoch": 0.25461254612546125,
"grad_norm": 0.5069513916969299,
"learning_rate": 9.289315448249531e-05,
"loss": 3.1218,
"step": 3450
},
{
"epoch": 0.25535055350553504,
"grad_norm": 0.49072757363319397,
"learning_rate": 9.282682131513157e-05,
"loss": 3.1231,
"step": 3460
},
{
"epoch": 0.25608856088560883,
"grad_norm": 0.6358250379562378,
"learning_rate": 9.276020392966422e-05,
"loss": 3.1082,
"step": 3470
},
{
"epoch": 0.2568265682656827,
"grad_norm": 0.5456467270851135,
"learning_rate": 9.26933027681963e-05,
"loss": 3.1454,
"step": 3480
},
{
"epoch": 0.25756457564575647,
"grad_norm": 0.5754953026771545,
"learning_rate": 9.262611827471406e-05,
"loss": 3.1334,
"step": 3490
},
{
"epoch": 0.25830258302583026,
"grad_norm": 0.5355437397956848,
"learning_rate": 9.25586508950841e-05,
"loss": 3.0149,
"step": 3500
},
{
"epoch": 0.25904059040590405,
"grad_norm": 0.5386449694633484,
"learning_rate": 9.249090107705044e-05,
"loss": 3.1859,
"step": 3510
},
{
"epoch": 0.25977859778597784,
"grad_norm": 0.5665399432182312,
"learning_rate": 9.242286927023136e-05,
"loss": 3.171,
"step": 3520
},
{
"epoch": 0.2605166051660517,
"grad_norm": 0.5453583002090454,
"learning_rate": 9.235455592611665e-05,
"loss": 3.1198,
"step": 3530
},
{
"epoch": 0.2612546125461255,
"grad_norm": 0.5409013032913208,
"learning_rate": 9.22859614980645e-05,
"loss": 3.0841,
"step": 3540
},
{
"epoch": 0.26199261992619927,
"grad_norm": 0.5243815779685974,
"learning_rate": 9.221708644129843e-05,
"loss": 3.13,
"step": 3550
},
{
"epoch": 0.26273062730627306,
"grad_norm": 0.562589168548584,
"learning_rate": 9.214793121290442e-05,
"loss": 3.0718,
"step": 3560
},
{
"epoch": 0.26346863468634685,
"grad_norm": 0.5075133442878723,
"learning_rate": 9.207849627182772e-05,
"loss": 3.1159,
"step": 3570
},
{
"epoch": 0.26420664206642064,
"grad_norm": 0.5348154902458191,
"learning_rate": 9.200878207886993e-05,
"loss": 3.1932,
"step": 3580
},
{
"epoch": 0.2649446494464945,
"grad_norm": 0.5550357103347778,
"learning_rate": 9.19387890966859e-05,
"loss": 3.0973,
"step": 3590
},
{
"epoch": 0.2656826568265683,
"grad_norm": 0.534482479095459,
"learning_rate": 9.186851778978062e-05,
"loss": 3.1466,
"step": 3600
},
{
"epoch": 0.26642066420664207,
"grad_norm": 0.521537184715271,
"learning_rate": 9.179796862450618e-05,
"loss": 3.0424,
"step": 3610
},
{
"epoch": 0.26715867158671586,
"grad_norm": 0.5350748896598816,
"learning_rate": 9.172714206905866e-05,
"loss": 3.0505,
"step": 3620
},
{
"epoch": 0.26789667896678965,
"grad_norm": 0.5348935127258301,
"learning_rate": 9.165603859347502e-05,
"loss": 3.1561,
"step": 3630
},
{
"epoch": 0.2686346863468635,
"grad_norm": 0.5182725191116333,
"learning_rate": 9.158465866963002e-05,
"loss": 3.0778,
"step": 3640
},
{
"epoch": 0.2693726937269373,
"grad_norm": 0.5188565850257874,
"learning_rate": 9.151300277123301e-05,
"loss": 3.0517,
"step": 3650
},
{
"epoch": 0.2701107011070111,
"grad_norm": 0.5163888931274414,
"learning_rate": 9.144107137382484e-05,
"loss": 2.979,
"step": 3660
},
{
"epoch": 0.27084870848708487,
"grad_norm": 0.5174587965011597,
"learning_rate": 9.136886495477475e-05,
"loss": 3.0661,
"step": 3670
},
{
"epoch": 0.27158671586715866,
"grad_norm": 0.5590752363204956,
"learning_rate": 9.129638399327706e-05,
"loss": 3.0624,
"step": 3680
},
{
"epoch": 0.27232472324723245,
"grad_norm": 0.48960742354393005,
"learning_rate": 9.122362897034817e-05,
"loss": 3.0344,
"step": 3690
},
{
"epoch": 0.2730627306273063,
"grad_norm": 0.5071660876274109,
"learning_rate": 9.115060036882318e-05,
"loss": 3.0374,
"step": 3700
},
{
"epoch": 0.2738007380073801,
"grad_norm": 0.5058993697166443,
"learning_rate": 9.107729867335288e-05,
"loss": 3.0823,
"step": 3710
},
{
"epoch": 0.2745387453874539,
"grad_norm": 0.5252380967140198,
"learning_rate": 9.100372437040034e-05,
"loss": 3.0558,
"step": 3720
},
{
"epoch": 0.27527675276752767,
"grad_norm": 0.49785932898521423,
"learning_rate": 9.092987794823786e-05,
"loss": 3.0836,
"step": 3730
},
{
"epoch": 0.27601476014760146,
"grad_norm": 0.5140420794487,
"learning_rate": 9.085575989694357e-05,
"loss": 3.1079,
"step": 3740
},
{
"epoch": 0.2767527675276753,
"grad_norm": 0.5329453945159912,
"learning_rate": 9.078137070839832e-05,
"loss": 3.0775,
"step": 3750
},
{
"epoch": 0.2774907749077491,
"grad_norm": 0.4971647560596466,
"learning_rate": 9.070671087628229e-05,
"loss": 3.0756,
"step": 3760
},
{
"epoch": 0.2782287822878229,
"grad_norm": 0.5552874803543091,
"learning_rate": 9.063178089607183e-05,
"loss": 3.0615,
"step": 3770
},
{
"epoch": 0.2789667896678967,
"grad_norm": 0.525969922542572,
"learning_rate": 9.055658126503605e-05,
"loss": 3.0594,
"step": 3780
},
{
"epoch": 0.27970479704797047,
"grad_norm": 0.5235247611999512,
"learning_rate": 9.048111248223368e-05,
"loss": 3.097,
"step": 3790
},
{
"epoch": 0.28044280442804426,
"grad_norm": 0.5573784112930298,
"learning_rate": 9.040537504850954e-05,
"loss": 3.0303,
"step": 3800
},
{
"epoch": 0.2811808118081181,
"grad_norm": 0.5464443564414978,
"learning_rate": 9.032936946649144e-05,
"loss": 3.063,
"step": 3810
},
{
"epoch": 0.2819188191881919,
"grad_norm": 0.5378391146659851,
"learning_rate": 9.02530962405867e-05,
"loss": 3.0853,
"step": 3820
},
{
"epoch": 0.2826568265682657,
"grad_norm": 0.5274621844291687,
"learning_rate": 9.017655587697885e-05,
"loss": 3.1374,
"step": 3830
},
{
"epoch": 0.2833948339483395,
"grad_norm": 0.5044965744018555,
"learning_rate": 9.009974888362424e-05,
"loss": 3.064,
"step": 3840
},
{
"epoch": 0.28413284132841327,
"grad_norm": 0.5318046808242798,
"learning_rate": 9.002267577024876e-05,
"loss": 3.0662,
"step": 3850
},
{
"epoch": 0.2848708487084871,
"grad_norm": 0.5438222289085388,
"learning_rate": 8.994533704834435e-05,
"loss": 3.0999,
"step": 3860
},
{
"epoch": 0.2856088560885609,
"grad_norm": 0.5226894021034241,
"learning_rate": 8.986773323116563e-05,
"loss": 3.0496,
"step": 3870
},
{
"epoch": 0.2863468634686347,
"grad_norm": 1.9248789548873901,
"learning_rate": 8.978986483372655e-05,
"loss": 3.0549,
"step": 3880
},
{
"epoch": 0.2870848708487085,
"grad_norm": 0.49465620517730713,
"learning_rate": 8.971173237279692e-05,
"loss": 3.085,
"step": 3890
},
{
"epoch": 0.2878228782287823,
"grad_norm": 0.5317748785018921,
"learning_rate": 8.963333636689898e-05,
"loss": 3.0659,
"step": 3900
},
{
"epoch": 0.28856088560885607,
"grad_norm": 0.5400087833404541,
"learning_rate": 8.9554677336304e-05,
"loss": 3.0963,
"step": 3910
},
{
"epoch": 0.2892988929889299,
"grad_norm": 0.5060845613479614,
"learning_rate": 8.947575580302878e-05,
"loss": 3.0503,
"step": 3920
},
{
"epoch": 0.2900369003690037,
"grad_norm": 0.5168414115905762,
"learning_rate": 8.939657229083222e-05,
"loss": 3.1322,
"step": 3930
},
{
"epoch": 0.2907749077490775,
"grad_norm": 0.5268558263778687,
"learning_rate": 8.931712732521183e-05,
"loss": 3.0947,
"step": 3940
},
{
"epoch": 0.2915129151291513,
"grad_norm": 0.5113683938980103,
"learning_rate": 8.92374214334002e-05,
"loss": 3.0379,
"step": 3950
},
{
"epoch": 0.2922509225092251,
"grad_norm": 0.5602664947509766,
"learning_rate": 8.915745514436161e-05,
"loss": 3.0636,
"step": 3960
},
{
"epoch": 0.29298892988929887,
"grad_norm": 0.507926344871521,
"learning_rate": 8.907722898878844e-05,
"loss": 3.0737,
"step": 3970
},
{
"epoch": 0.2937269372693727,
"grad_norm": 0.5805441737174988,
"learning_rate": 8.899674349909759e-05,
"loss": 3.0743,
"step": 3980
},
{
"epoch": 0.2944649446494465,
"grad_norm": 0.5141892433166504,
"learning_rate": 8.891599920942713e-05,
"loss": 3.0711,
"step": 3990
},
{
"epoch": 0.2952029520295203,
"grad_norm": 0.5769287347793579,
"learning_rate": 8.883499665563253e-05,
"loss": 3.0302,
"step": 4000
},
{
"epoch": 0.2959409594095941,
"grad_norm": 0.5248669981956482,
"learning_rate": 8.875373637528335e-05,
"loss": 3.0871,
"step": 4010
},
{
"epoch": 0.2966789667896679,
"grad_norm": 0.5001204609870911,
"learning_rate": 8.867221890765938e-05,
"loss": 3.0342,
"step": 4020
},
{
"epoch": 0.2974169741697417,
"grad_norm": 0.5176003575325012,
"learning_rate": 8.859044479374736e-05,
"loss": 3.1404,
"step": 4030
},
{
"epoch": 0.2981549815498155,
"grad_norm": 0.5125160217285156,
"learning_rate": 8.850841457623719e-05,
"loss": 3.0399,
"step": 4040
},
{
"epoch": 0.2988929889298893,
"grad_norm": 0.49271440505981445,
"learning_rate": 8.842612879951837e-05,
"loss": 3.0082,
"step": 4050
},
{
"epoch": 0.2996309963099631,
"grad_norm": 0.5456764698028564,
"learning_rate": 8.834358800967645e-05,
"loss": 3.0537,
"step": 4060
},
{
"epoch": 0.3003690036900369,
"grad_norm": 0.5039022564888,
"learning_rate": 8.826079275448933e-05,
"loss": 3.0508,
"step": 4070
},
{
"epoch": 0.3011070110701107,
"grad_norm": 0.48597994446754456,
"learning_rate": 8.817774358342367e-05,
"loss": 3.0806,
"step": 4080
},
{
"epoch": 0.3018450184501845,
"grad_norm": 0.5243167877197266,
"learning_rate": 8.809444104763122e-05,
"loss": 3.1176,
"step": 4090
},
{
"epoch": 0.3025830258302583,
"grad_norm": 0.5244473218917847,
"learning_rate": 8.801088569994522e-05,
"loss": 3.0985,
"step": 4100
},
{
"epoch": 0.3033210332103321,
"grad_norm": 0.4856514632701874,
"learning_rate": 8.792707809487661e-05,
"loss": 3.0546,
"step": 4110
},
{
"epoch": 0.3040590405904059,
"grad_norm": 0.48701879382133484,
"learning_rate": 8.784301878861047e-05,
"loss": 3.083,
"step": 4120
},
{
"epoch": 0.3047970479704797,
"grad_norm": 0.5364317297935486,
"learning_rate": 8.775870833900226e-05,
"loss": 3.0672,
"step": 4130
},
{
"epoch": 0.30553505535055353,
"grad_norm": 0.5016632676124573,
"learning_rate": 8.767414730557418e-05,
"loss": 2.9692,
"step": 4140
},
{
"epoch": 0.3062730627306273,
"grad_norm": 0.5020787715911865,
"learning_rate": 8.758933624951135e-05,
"loss": 3.0618,
"step": 4150
},
{
"epoch": 0.3070110701107011,
"grad_norm": 0.5041311383247375,
"learning_rate": 8.750427573365824e-05,
"loss": 3.0193,
"step": 4160
},
{
"epoch": 0.3077490774907749,
"grad_norm": 0.5102233290672302,
"learning_rate": 8.741896632251476e-05,
"loss": 3.0837,
"step": 4170
},
{
"epoch": 0.3084870848708487,
"grad_norm": 0.5173757672309875,
"learning_rate": 8.733340858223268e-05,
"loss": 2.9969,
"step": 4180
},
{
"epoch": 0.3092250922509225,
"grad_norm": 0.47782695293426514,
"learning_rate": 8.724760308061172e-05,
"loss": 2.9934,
"step": 4190
},
{
"epoch": 0.30996309963099633,
"grad_norm": 0.4984055161476135,
"learning_rate": 8.71615503870959e-05,
"loss": 3.0055,
"step": 4200
},
{
"epoch": 0.3107011070110701,
"grad_norm": 0.535744845867157,
"learning_rate": 8.707525107276971e-05,
"loss": 3.1124,
"step": 4210
},
{
"epoch": 0.3114391143911439,
"grad_norm": 0.5163019895553589,
"learning_rate": 8.698870571035435e-05,
"loss": 3.0904,
"step": 4220
},
{
"epoch": 0.3121771217712177,
"grad_norm": 0.5297439694404602,
"learning_rate": 8.690191487420385e-05,
"loss": 3.039,
"step": 4230
},
{
"epoch": 0.3129151291512915,
"grad_norm": 0.5315809845924377,
"learning_rate": 8.681487914030137e-05,
"loss": 3.1418,
"step": 4240
},
{
"epoch": 0.31365313653136534,
"grad_norm": 0.5038068890571594,
"learning_rate": 8.672759908625528e-05,
"loss": 3.105,
"step": 4250
},
{
"epoch": 0.31439114391143913,
"grad_norm": 0.5104600787162781,
"learning_rate": 8.664007529129539e-05,
"loss": 3.0253,
"step": 4260
},
{
"epoch": 0.3151291512915129,
"grad_norm": 0.5337395668029785,
"learning_rate": 8.655230833626908e-05,
"loss": 3.0637,
"step": 4270
},
{
"epoch": 0.3158671586715867,
"grad_norm": 0.5203779935836792,
"learning_rate": 8.646429880363746e-05,
"loss": 3.0862,
"step": 4280
},
{
"epoch": 0.3166051660516605,
"grad_norm": 0.510831356048584,
"learning_rate": 8.637604727747149e-05,
"loss": 2.9944,
"step": 4290
},
{
"epoch": 0.3173431734317343,
"grad_norm": 0.5363606214523315,
"learning_rate": 8.62875543434481e-05,
"loss": 3.1227,
"step": 4300
},
{
"epoch": 0.31808118081180814,
"grad_norm": 0.5156981945037842,
"learning_rate": 8.61988205888463e-05,
"loss": 3.046,
"step": 4310
},
{
"epoch": 0.31881918819188193,
"grad_norm": 0.530002772808075,
"learning_rate": 8.610984660254333e-05,
"loss": 3.037,
"step": 4320
},
{
"epoch": 0.3195571955719557,
"grad_norm": 0.5514121651649475,
"learning_rate": 8.602063297501068e-05,
"loss": 3.0828,
"step": 4330
},
{
"epoch": 0.3202952029520295,
"grad_norm": 0.49961575865745544,
"learning_rate": 8.593118029831025e-05,
"loss": 3.0404,
"step": 4340
},
{
"epoch": 0.3210332103321033,
"grad_norm": 0.4883437752723694,
"learning_rate": 8.584148916609032e-05,
"loss": 3.0681,
"step": 4350
},
{
"epoch": 0.32177121771217715,
"grad_norm": 0.5226607918739319,
"learning_rate": 8.575156017358171e-05,
"loss": 3.0631,
"step": 4360
},
{
"epoch": 0.32250922509225094,
"grad_norm": 0.5821093320846558,
"learning_rate": 8.566139391759378e-05,
"loss": 3.0793,
"step": 4370
},
{
"epoch": 0.32324723247232473,
"grad_norm": 0.5188676118850708,
"learning_rate": 8.557099099651047e-05,
"loss": 3.086,
"step": 4380
},
{
"epoch": 0.3239852398523985,
"grad_norm": 0.5117591023445129,
"learning_rate": 8.548035201028636e-05,
"loss": 3.1174,
"step": 4390
},
{
"epoch": 0.3247232472324723,
"grad_norm": 0.48335784673690796,
"learning_rate": 8.538947756044261e-05,
"loss": 2.9864,
"step": 4400
},
{
"epoch": 0.3254612546125461,
"grad_norm": 0.5281744599342346,
"learning_rate": 8.52983682500631e-05,
"loss": 3.0942,
"step": 4410
},
{
"epoch": 0.32619926199261995,
"grad_norm": 0.4935998022556305,
"learning_rate": 8.520702468379028e-05,
"loss": 3.0716,
"step": 4420
},
{
"epoch": 0.32693726937269374,
"grad_norm": 0.4817652404308319,
"learning_rate": 8.511544746782125e-05,
"loss": 3.0314,
"step": 4430
},
{
"epoch": 0.32767527675276753,
"grad_norm": 0.49610570073127747,
"learning_rate": 8.502363720990374e-05,
"loss": 2.9699,
"step": 4440
},
{
"epoch": 0.3284132841328413,
"grad_norm": 0.5101500749588013,
"learning_rate": 8.493159451933203e-05,
"loss": 2.9248,
"step": 4450
},
{
"epoch": 0.3291512915129151,
"grad_norm": 0.48433801531791687,
"learning_rate": 8.483932000694295e-05,
"loss": 3.0812,
"step": 4460
},
{
"epoch": 0.3298892988929889,
"grad_norm": 0.4775218665599823,
"learning_rate": 8.474681428511177e-05,
"loss": 2.986,
"step": 4470
},
{
"epoch": 0.33062730627306275,
"grad_norm": 0.49710339307785034,
"learning_rate": 8.465407796774816e-05,
"loss": 3.0331,
"step": 4480
},
{
"epoch": 0.33136531365313654,
"grad_norm": 0.5008261799812317,
"learning_rate": 8.456111167029219e-05,
"loss": 3.0763,
"step": 4490
},
{
"epoch": 0.33210332103321033,
"grad_norm": 0.5350390672683716,
"learning_rate": 8.446791600971012e-05,
"loss": 3.0238,
"step": 4500
},
{
"epoch": 0.3328413284132841,
"grad_norm": 0.5100720524787903,
"learning_rate": 8.43744916044904e-05,
"loss": 3.1137,
"step": 4510
},
{
"epoch": 0.3335793357933579,
"grad_norm": 0.5103323459625244,
"learning_rate": 8.428083907463951e-05,
"loss": 3.0862,
"step": 4520
},
{
"epoch": 0.33431734317343176,
"grad_norm": 0.563750147819519,
"learning_rate": 8.418695904167788e-05,
"loss": 3.0551,
"step": 4530
},
{
"epoch": 0.33505535055350555,
"grad_norm": 0.4909681975841522,
"learning_rate": 8.40928521286358e-05,
"loss": 2.9769,
"step": 4540
},
{
"epoch": 0.33579335793357934,
"grad_norm": 0.5330002903938293,
"learning_rate": 8.399851896004913e-05,
"loss": 3.046,
"step": 4550
},
{
"epoch": 0.33653136531365313,
"grad_norm": 0.49845483899116516,
"learning_rate": 8.390396016195537e-05,
"loss": 3.0318,
"step": 4560
},
{
"epoch": 0.3372693726937269,
"grad_norm": 0.4647519290447235,
"learning_rate": 8.380917636188934e-05,
"loss": 3.0097,
"step": 4570
},
{
"epoch": 0.3380073800738007,
"grad_norm": 0.4947097599506378,
"learning_rate": 8.371416818887908e-05,
"loss": 3.0244,
"step": 4580
},
{
"epoch": 0.33874538745387456,
"grad_norm": 0.514033854007721,
"learning_rate": 8.361893627344168e-05,
"loss": 3.0259,
"step": 4590
},
{
"epoch": 0.33948339483394835,
"grad_norm": 0.5403528213500977,
"learning_rate": 8.35234812475791e-05,
"loss": 3.0071,
"step": 4600
},
{
"epoch": 0.34022140221402214,
"grad_norm": 0.495109498500824,
"learning_rate": 8.342780374477396e-05,
"loss": 3.058,
"step": 4610
},
{
"epoch": 0.34095940959409593,
"grad_norm": 0.48301902413368225,
"learning_rate": 8.33319043999853e-05,
"loss": 3.0686,
"step": 4620
},
{
"epoch": 0.3416974169741697,
"grad_norm": 0.4977583885192871,
"learning_rate": 8.323578384964444e-05,
"loss": 2.9218,
"step": 4630
},
{
"epoch": 0.34243542435424357,
"grad_norm": 0.4929274022579193,
"learning_rate": 8.313944273165069e-05,
"loss": 3.0489,
"step": 4640
},
{
"epoch": 0.34317343173431736,
"grad_norm": 0.5092618465423584,
"learning_rate": 8.304288168536718e-05,
"loss": 2.9915,
"step": 4650
},
{
"epoch": 0.34391143911439115,
"grad_norm": 0.48645535111427307,
"learning_rate": 8.294610135161658e-05,
"loss": 2.9596,
"step": 4660
},
{
"epoch": 0.34464944649446494,
"grad_norm": 0.5053686499595642,
"learning_rate": 8.284910237267682e-05,
"loss": 3.0022,
"step": 4670
},
{
"epoch": 0.34538745387453873,
"grad_norm": 0.5074572563171387,
"learning_rate": 8.275188539227686e-05,
"loss": 3.0701,
"step": 4680
},
{
"epoch": 0.3461254612546125,
"grad_norm": 0.5153145790100098,
"learning_rate": 8.265445105559247e-05,
"loss": 2.9951,
"step": 4690
},
{
"epoch": 0.34686346863468637,
"grad_norm": 0.5247951745986938,
"learning_rate": 8.255680000924184e-05,
"loss": 3.0631,
"step": 4700
},
{
"epoch": 0.34760147601476016,
"grad_norm": 0.4750431180000305,
"learning_rate": 8.245893290128136e-05,
"loss": 3.0917,
"step": 4710
},
{
"epoch": 0.34833948339483395,
"grad_norm": 0.4787590503692627,
"learning_rate": 8.236085038120129e-05,
"loss": 3.0494,
"step": 4720
},
{
"epoch": 0.34907749077490774,
"grad_norm": 0.49496400356292725,
"learning_rate": 8.22625530999215e-05,
"loss": 3.0276,
"step": 4730
},
{
"epoch": 0.34981549815498153,
"grad_norm": 0.517461359500885,
"learning_rate": 8.216404170978707e-05,
"loss": 2.9682,
"step": 4740
},
{
"epoch": 0.3505535055350554,
"grad_norm": 0.4839133024215698,
"learning_rate": 8.206531686456403e-05,
"loss": 3.0396,
"step": 4750
},
{
"epoch": 0.35129151291512917,
"grad_norm": 0.5224480628967285,
"learning_rate": 8.196637921943496e-05,
"loss": 3.048,
"step": 4760
},
{
"epoch": 0.35202952029520296,
"grad_norm": 0.5209102034568787,
"learning_rate": 8.186722943099472e-05,
"loss": 3.0128,
"step": 4770
},
{
"epoch": 0.35276752767527675,
"grad_norm": 0.480421781539917,
"learning_rate": 8.176786815724601e-05,
"loss": 3.0139,
"step": 4780
},
{
"epoch": 0.35350553505535054,
"grad_norm": 0.4676721692085266,
"learning_rate": 8.166829605759507e-05,
"loss": 2.8988,
"step": 4790
},
{
"epoch": 0.35424354243542433,
"grad_norm": 0.5178680419921875,
"learning_rate": 8.156851379284729e-05,
"loss": 3.0074,
"step": 4800
},
{
"epoch": 0.3549815498154982,
"grad_norm": 0.5426033735275269,
"learning_rate": 8.146852202520277e-05,
"loss": 2.9998,
"step": 4810
},
{
"epoch": 0.35571955719557197,
"grad_norm": 0.4766799807548523,
"learning_rate": 8.136832141825196e-05,
"loss": 3.0129,
"step": 4820
},
{
"epoch": 0.35645756457564576,
"grad_norm": 0.49461451172828674,
"learning_rate": 8.12679126369713e-05,
"loss": 3.0726,
"step": 4830
},
{
"epoch": 0.35719557195571955,
"grad_norm": 0.4843361973762512,
"learning_rate": 8.116729634771876e-05,
"loss": 2.9953,
"step": 4840
},
{
"epoch": 0.35793357933579334,
"grad_norm": 0.5127764344215393,
"learning_rate": 8.106647321822943e-05,
"loss": 3.0525,
"step": 4850
},
{
"epoch": 0.3586715867158672,
"grad_norm": 0.4938580393791199,
"learning_rate": 8.096544391761103e-05,
"loss": 2.975,
"step": 4860
},
{
"epoch": 0.359409594095941,
"grad_norm": 0.4944118559360504,
"learning_rate": 8.08642091163396e-05,
"loss": 3.0102,
"step": 4870
},
{
"epoch": 0.36014760147601477,
"grad_norm": 0.4949988126754761,
"learning_rate": 8.076276948625494e-05,
"loss": 2.9756,
"step": 4880
},
{
"epoch": 0.36088560885608856,
"grad_norm": 0.5549206733703613,
"learning_rate": 8.066112570055621e-05,
"loss": 3.0896,
"step": 4890
},
{
"epoch": 0.36162361623616235,
"grad_norm": 0.4933255910873413,
"learning_rate": 8.055927843379738e-05,
"loss": 3.036,
"step": 4900
},
{
"epoch": 0.36236162361623614,
"grad_norm": 0.5120234489440918,
"learning_rate": 8.04572283618829e-05,
"loss": 3.0661,
"step": 4910
},
{
"epoch": 0.36309963099631,
"grad_norm": 0.47579410672187805,
"learning_rate": 8.035497616206302e-05,
"loss": 2.9517,
"step": 4920
},
{
"epoch": 0.3638376383763838,
"grad_norm": 0.47006312012672424,
"learning_rate": 8.025252251292949e-05,
"loss": 2.9931,
"step": 4930
},
{
"epoch": 0.36457564575645757,
"grad_norm": 0.498418927192688,
"learning_rate": 8.014986809441094e-05,
"loss": 2.9749,
"step": 4940
},
{
"epoch": 0.36531365313653136,
"grad_norm": 0.4772182106971741,
"learning_rate": 8.00470135877684e-05,
"loss": 2.9708,
"step": 4950
},
{
"epoch": 0.36605166051660515,
"grad_norm": 0.47467556595802307,
"learning_rate": 7.994395967559076e-05,
"loss": 2.9898,
"step": 4960
},
{
"epoch": 0.36678966789667894,
"grad_norm": 0.509661078453064,
"learning_rate": 7.984070704179026e-05,
"loss": 3.0238,
"step": 4970
},
{
"epoch": 0.3675276752767528,
"grad_norm": 0.47225892543792725,
"learning_rate": 7.973725637159794e-05,
"loss": 3.0066,
"step": 4980
},
{
"epoch": 0.3682656826568266,
"grad_norm": 0.5211546421051025,
"learning_rate": 7.963360835155915e-05,
"loss": 3.0896,
"step": 4990
},
{
"epoch": 0.36900369003690037,
"grad_norm": 0.4817075729370117,
"learning_rate": 7.952976366952888e-05,
"loss": 3.0348,
"step": 5000
},
{
"epoch": 0.36974169741697416,
"grad_norm": 0.4747537672519684,
"learning_rate": 7.942572301466727e-05,
"loss": 3.0146,
"step": 5010
},
{
"epoch": 0.37047970479704795,
"grad_norm": 0.5026445984840393,
"learning_rate": 7.932148707743503e-05,
"loss": 2.9681,
"step": 5020
},
{
"epoch": 0.3712177121771218,
"grad_norm": 0.47187340259552,
"learning_rate": 7.921705654958886e-05,
"loss": 3.0161,
"step": 5030
},
{
"epoch": 0.3719557195571956,
"grad_norm": 0.5039234161376953,
"learning_rate": 7.911243212417687e-05,
"loss": 3.0002,
"step": 5040
},
{
"epoch": 0.3726937269372694,
"grad_norm": 0.481448233127594,
"learning_rate": 7.900761449553394e-05,
"loss": 2.9907,
"step": 5050
},
{
"epoch": 0.37343173431734317,
"grad_norm": 0.4844491481781006,
"learning_rate": 7.890260435927708e-05,
"loss": 3.0501,
"step": 5060
},
{
"epoch": 0.37416974169741696,
"grad_norm": 0.502325177192688,
"learning_rate": 7.879740241230098e-05,
"loss": 2.9843,
"step": 5070
},
{
"epoch": 0.37490774907749075,
"grad_norm": 0.49289822578430176,
"learning_rate": 7.869200935277317e-05,
"loss": 2.9808,
"step": 5080
},
{
"epoch": 0.3756457564575646,
"grad_norm": 0.4960924983024597,
"learning_rate": 7.858642588012957e-05,
"loss": 3.0367,
"step": 5090
},
{
"epoch": 0.3763837638376384,
"grad_norm": 0.4961390495300293,
"learning_rate": 7.848065269506968e-05,
"loss": 3.0371,
"step": 5100
},
{
"epoch": 0.3771217712177122,
"grad_norm": 0.5095449090003967,
"learning_rate": 7.837469049955211e-05,
"loss": 2.9584,
"step": 5110
},
{
"epoch": 0.37785977859778597,
"grad_norm": 0.5364798307418823,
"learning_rate": 7.826853999678979e-05,
"loss": 3.0194,
"step": 5120
},
{
"epoch": 0.37859778597785976,
"grad_norm": 0.47735193371772766,
"learning_rate": 7.816220189124526e-05,
"loss": 2.9603,
"step": 5130
},
{
"epoch": 0.3793357933579336,
"grad_norm": 0.47760894894599915,
"learning_rate": 7.805567688862626e-05,
"loss": 3.0335,
"step": 5140
},
{
"epoch": 0.3800738007380074,
"grad_norm": 0.4874935448169708,
"learning_rate": 7.794896569588066e-05,
"loss": 3.0274,
"step": 5150
},
{
"epoch": 0.3808118081180812,
"grad_norm": 0.48565617203712463,
"learning_rate": 7.784206902119213e-05,
"loss": 3.0081,
"step": 5160
},
{
"epoch": 0.381549815498155,
"grad_norm": 0.513862133026123,
"learning_rate": 7.773498757397522e-05,
"loss": 2.9605,
"step": 5170
},
{
"epoch": 0.38228782287822877,
"grad_norm": 0.4750123918056488,
"learning_rate": 7.762772206487066e-05,
"loss": 3.0109,
"step": 5180
},
{
"epoch": 0.38302583025830256,
"grad_norm": 0.4761565327644348,
"learning_rate": 7.75202732057408e-05,
"loss": 3.0137,
"step": 5190
},
{
"epoch": 0.3837638376383764,
"grad_norm": 0.5001286864280701,
"learning_rate": 7.741264170966472e-05,
"loss": 3.0493,
"step": 5200
},
{
"epoch": 0.3845018450184502,
"grad_norm": 0.48891499638557434,
"learning_rate": 7.730482829093358e-05,
"loss": 3.0333,
"step": 5210
},
{
"epoch": 0.385239852398524,
"grad_norm": 0.4714498221874237,
"learning_rate": 7.719683366504586e-05,
"loss": 2.9868,
"step": 5220
},
{
"epoch": 0.3859778597785978,
"grad_norm": 0.4761471748352051,
"learning_rate": 7.708865854870258e-05,
"loss": 3.0351,
"step": 5230
},
{
"epoch": 0.38671586715867157,
"grad_norm": 0.47278621792793274,
"learning_rate": 7.698030365980265e-05,
"loss": 3.0056,
"step": 5240
},
{
"epoch": 0.3874538745387454,
"grad_norm": 0.502041220664978,
"learning_rate": 7.687176971743796e-05,
"loss": 3.013,
"step": 5250
},
{
"epoch": 0.3881918819188192,
"grad_norm": 0.4808847904205322,
"learning_rate": 7.676305744188871e-05,
"loss": 3.0363,
"step": 5260
},
{
"epoch": 0.388929889298893,
"grad_norm": 0.4782809615135193,
"learning_rate": 7.665416755461859e-05,
"loss": 2.9693,
"step": 5270
},
{
"epoch": 0.3896678966789668,
"grad_norm": 0.4984862804412842,
"learning_rate": 7.654510077827003e-05,
"loss": 2.9882,
"step": 5280
},
{
"epoch": 0.3904059040590406,
"grad_norm": 0.48033297061920166,
"learning_rate": 7.643585783665931e-05,
"loss": 2.9822,
"step": 5290
},
{
"epoch": 0.39114391143911437,
"grad_norm": 0.5328406691551208,
"learning_rate": 7.632643945477193e-05,
"loss": 2.9835,
"step": 5300
},
{
"epoch": 0.3918819188191882,
"grad_norm": 0.4741387963294983,
"learning_rate": 7.621684635875756e-05,
"loss": 3.0095,
"step": 5310
},
{
"epoch": 0.392619926199262,
"grad_norm": 0.8941669464111328,
"learning_rate": 7.610707927592549e-05,
"loss": 2.9642,
"step": 5320
},
{
"epoch": 0.3933579335793358,
"grad_norm": 0.501148521900177,
"learning_rate": 7.59971389347395e-05,
"loss": 2.9973,
"step": 5330
},
{
"epoch": 0.3940959409594096,
"grad_norm": 0.4852311611175537,
"learning_rate": 7.588702606481337e-05,
"loss": 3.019,
"step": 5340
},
{
"epoch": 0.3948339483394834,
"grad_norm": 0.44878798723220825,
"learning_rate": 7.577674139690572e-05,
"loss": 2.9582,
"step": 5350
},
{
"epoch": 0.3955719557195572,
"grad_norm": 0.4837028384208679,
"learning_rate": 7.566628566291536e-05,
"loss": 2.9865,
"step": 5360
},
{
"epoch": 0.396309963099631,
"grad_norm": 0.5781135559082031,
"learning_rate": 7.555565959587638e-05,
"loss": 2.9709,
"step": 5370
},
{
"epoch": 0.3970479704797048,
"grad_norm": 0.4646313786506653,
"learning_rate": 7.544486392995324e-05,
"loss": 3.0123,
"step": 5380
},
{
"epoch": 0.3977859778597786,
"grad_norm": 0.45897990465164185,
"learning_rate": 7.533389940043598e-05,
"loss": 2.9744,
"step": 5390
},
{
"epoch": 0.3985239852398524,
"grad_norm": 0.47609013319015503,
"learning_rate": 7.522276674373525e-05,
"loss": 2.9654,
"step": 5400
},
{
"epoch": 0.3992619926199262,
"grad_norm": 0.48847806453704834,
"learning_rate": 7.51114666973775e-05,
"loss": 3.0279,
"step": 5410
},
{
"epoch": 0.4,
"grad_norm": 0.5017388463020325,
"learning_rate": 7.500000000000001e-05,
"loss": 2.9632,
"step": 5420
},
{
"epoch": 0.4007380073800738,
"grad_norm": 0.49840694665908813,
"learning_rate": 7.488836739134608e-05,
"loss": 3.0054,
"step": 5430
},
{
"epoch": 0.4014760147601476,
"grad_norm": 0.48498594760894775,
"learning_rate": 7.477656961226007e-05,
"loss": 2.9744,
"step": 5440
},
{
"epoch": 0.4022140221402214,
"grad_norm": 0.49641212821006775,
"learning_rate": 7.466460740468245e-05,
"loss": 3.0054,
"step": 5450
},
{
"epoch": 0.4029520295202952,
"grad_norm": 0.47951868176460266,
"learning_rate": 7.455248151164493e-05,
"loss": 2.9506,
"step": 5460
},
{
"epoch": 0.40369003690036903,
"grad_norm": 0.5073153972625732,
"learning_rate": 7.444019267726553e-05,
"loss": 2.9172,
"step": 5470
},
{
"epoch": 0.4044280442804428,
"grad_norm": 0.48473188281059265,
"learning_rate": 7.432774164674359e-05,
"loss": 2.9388,
"step": 5480
},
{
"epoch": 0.4051660516605166,
"grad_norm": 0.4775610566139221,
"learning_rate": 7.421512916635485e-05,
"loss": 3.0088,
"step": 5490
},
{
"epoch": 0.4059040590405904,
"grad_norm": 0.5261042714118958,
"learning_rate": 7.410235598344657e-05,
"loss": 2.9721,
"step": 5500
},
{
"epoch": 0.4066420664206642,
"grad_norm": 0.45107316970825195,
"learning_rate": 7.398942284643241e-05,
"loss": 2.9521,
"step": 5510
},
{
"epoch": 0.407380073800738,
"grad_norm": 0.46772444248199463,
"learning_rate": 7.387633050478766e-05,
"loss": 2.9259,
"step": 5520
},
{
"epoch": 0.40811808118081183,
"grad_norm": 0.4604153633117676,
"learning_rate": 7.376307970904408e-05,
"loss": 3.082,
"step": 5530
},
{
"epoch": 0.4088560885608856,
"grad_norm": 0.47096291184425354,
"learning_rate": 7.364967121078502e-05,
"loss": 2.9186,
"step": 5540
},
{
"epoch": 0.4095940959409594,
"grad_norm": 0.4761073589324951,
"learning_rate": 7.353610576264045e-05,
"loss": 3.028,
"step": 5550
},
{
"epoch": 0.4103321033210332,
"grad_norm": 0.5043940544128418,
"learning_rate": 7.34223841182819e-05,
"loss": 2.9259,
"step": 5560
},
{
"epoch": 0.411070110701107,
"grad_norm": 0.48511525988578796,
"learning_rate": 7.33085070324175e-05,
"loss": 2.9453,
"step": 5570
},
{
"epoch": 0.4118081180811808,
"grad_norm": 0.4717444181442261,
"learning_rate": 7.319447526078696e-05,
"loss": 3.0091,
"step": 5580
},
{
"epoch": 0.41254612546125463,
"grad_norm": 0.44939619302749634,
"learning_rate": 7.308028956015653e-05,
"loss": 2.9809,
"step": 5590
},
{
"epoch": 0.4132841328413284,
"grad_norm": 0.46631982922554016,
"learning_rate": 7.296595068831406e-05,
"loss": 2.9969,
"step": 5600
},
{
"epoch": 0.4140221402214022,
"grad_norm": 0.4884931743144989,
"learning_rate": 7.285145940406386e-05,
"loss": 2.9521,
"step": 5610
},
{
"epoch": 0.414760147601476,
"grad_norm": 0.4892655611038208,
"learning_rate": 7.273681646722173e-05,
"loss": 2.9666,
"step": 5620
},
{
"epoch": 0.4154981549815498,
"grad_norm": 0.4869326651096344,
"learning_rate": 7.262202263860988e-05,
"loss": 2.9618,
"step": 5630
},
{
"epoch": 0.41623616236162364,
"grad_norm": 0.48076122999191284,
"learning_rate": 7.2507078680052e-05,
"loss": 2.9113,
"step": 5640
},
{
"epoch": 0.41697416974169743,
"grad_norm": 0.46369293332099915,
"learning_rate": 7.239198535436801e-05,
"loss": 2.9309,
"step": 5650
},
{
"epoch": 0.4177121771217712,
"grad_norm": 0.49062806367874146,
"learning_rate": 7.227674342536913e-05,
"loss": 3.0057,
"step": 5660
},
{
"epoch": 0.418450184501845,
"grad_norm": 0.4727836847305298,
"learning_rate": 7.216135365785279e-05,
"loss": 3.0034,
"step": 5670
},
{
"epoch": 0.4191881918819188,
"grad_norm": 0.5185651779174805,
"learning_rate": 7.20458168175975e-05,
"loss": 2.9296,
"step": 5680
},
{
"epoch": 0.4199261992619926,
"grad_norm": 0.4758572280406952,
"learning_rate": 7.193013367135792e-05,
"loss": 2.9805,
"step": 5690
},
{
"epoch": 0.42066420664206644,
"grad_norm": 0.507834255695343,
"learning_rate": 7.181430498685954e-05,
"loss": 2.9829,
"step": 5700
},
{
"epoch": 0.42140221402214023,
"grad_norm": 0.48527729511260986,
"learning_rate": 7.169833153279375e-05,
"loss": 2.9951,
"step": 5710
},
{
"epoch": 0.422140221402214,
"grad_norm": 0.5018925070762634,
"learning_rate": 7.158221407881272e-05,
"loss": 3.0251,
"step": 5720
},
{
"epoch": 0.4228782287822878,
"grad_norm": 0.5182327032089233,
"learning_rate": 7.146595339552422e-05,
"loss": 2.9954,
"step": 5730
},
{
"epoch": 0.4236162361623616,
"grad_norm": 0.5015000104904175,
"learning_rate": 7.134955025448663e-05,
"loss": 2.9285,
"step": 5740
},
{
"epoch": 0.42435424354243545,
"grad_norm": 0.47007137537002563,
"learning_rate": 7.123300542820366e-05,
"loss": 2.923,
"step": 5750
},
{
"epoch": 0.42509225092250924,
"grad_norm": 0.4987011253833771,
"learning_rate": 7.111631969011938e-05,
"loss": 2.9555,
"step": 5760
},
{
"epoch": 0.42583025830258303,
"grad_norm": 0.4811478853225708,
"learning_rate": 7.099949381461296e-05,
"loss": 2.9797,
"step": 5770
},
{
"epoch": 0.4265682656826568,
"grad_norm": 0.4753568470478058,
"learning_rate": 7.08825285769936e-05,
"loss": 2.9137,
"step": 5780
},
{
"epoch": 0.4273062730627306,
"grad_norm": 0.46175628900527954,
"learning_rate": 7.076542475349537e-05,
"loss": 2.9291,
"step": 5790
},
{
"epoch": 0.4280442804428044,
"grad_norm": 0.5033062696456909,
"learning_rate": 7.06481831212721e-05,
"loss": 2.9927,
"step": 5800
},
{
"epoch": 0.42878228782287825,
"grad_norm": 0.4942483603954315,
"learning_rate": 7.05308044583921e-05,
"loss": 2.8999,
"step": 5810
},
{
"epoch": 0.42952029520295204,
"grad_norm": 0.46212270855903625,
"learning_rate": 7.041328954383316e-05,
"loss": 2.9618,
"step": 5820
},
{
"epoch": 0.43025830258302583,
"grad_norm": 0.4895878732204437,
"learning_rate": 7.029563915747722e-05,
"loss": 3.0415,
"step": 5830
},
{
"epoch": 0.4309963099630996,
"grad_norm": 0.48732495307922363,
"learning_rate": 7.017785408010533e-05,
"loss": 2.9275,
"step": 5840
},
{
"epoch": 0.4317343173431734,
"grad_norm": 0.49087876081466675,
"learning_rate": 7.005993509339241e-05,
"loss": 2.981,
"step": 5850
},
{
"epoch": 0.43247232472324726,
"grad_norm": 0.5266060829162598,
"learning_rate": 6.9941882979902e-05,
"loss": 2.8859,
"step": 5860
},
{
"epoch": 0.43321033210332105,
"grad_norm": 0.45862722396850586,
"learning_rate": 6.982369852308124e-05,
"loss": 2.9225,
"step": 5870
},
{
"epoch": 0.43394833948339484,
"grad_norm": 0.5097654461860657,
"learning_rate": 6.97053825072554e-05,
"loss": 2.9179,
"step": 5880
},
{
"epoch": 0.43468634686346863,
"grad_norm": 0.5156700611114502,
"learning_rate": 6.958693571762301e-05,
"loss": 3.0092,
"step": 5890
},
{
"epoch": 0.4354243542435424,
"grad_norm": 0.4698309898376465,
"learning_rate": 6.946835894025037e-05,
"loss": 2.8776,
"step": 5900
},
{
"epoch": 0.4361623616236162,
"grad_norm": 0.4787076711654663,
"learning_rate": 6.934965296206645e-05,
"loss": 2.9759,
"step": 5910
},
{
"epoch": 0.43690036900369006,
"grad_norm": 0.4753543734550476,
"learning_rate": 6.923081857085766e-05,
"loss": 3.0012,
"step": 5920
},
{
"epoch": 0.43763837638376385,
"grad_norm": 0.4781608283519745,
"learning_rate": 6.911185655526263e-05,
"loss": 2.9636,
"step": 5930
},
{
"epoch": 0.43837638376383764,
"grad_norm": 0.46679866313934326,
"learning_rate": 6.899276770476695e-05,
"loss": 2.9666,
"step": 5940
},
{
"epoch": 0.43911439114391143,
"grad_norm": 0.4817095100879669,
"learning_rate": 6.887355280969796e-05,
"loss": 2.9268,
"step": 5950
},
{
"epoch": 0.4398523985239852,
"grad_norm": 0.46391561627388,
"learning_rate": 6.875421266121946e-05,
"loss": 2.9796,
"step": 5960
},
{
"epoch": 0.44059040590405907,
"grad_norm": 0.4704035222530365,
"learning_rate": 6.86347480513265e-05,
"loss": 2.93,
"step": 5970
},
{
"epoch": 0.44132841328413286,
"grad_norm": 0.5005739331245422,
"learning_rate": 6.851515977284013e-05,
"loss": 2.9329,
"step": 5980
},
{
"epoch": 0.44206642066420665,
"grad_norm": 0.5069407224655151,
"learning_rate": 6.839544861940214e-05,
"loss": 3.0269,
"step": 5990
},
{
"epoch": 0.44280442804428044,
"grad_norm": 0.4672479033470154,
"learning_rate": 6.827561538546967e-05,
"loss": 2.9522,
"step": 6000
},
{
"epoch": 0.44354243542435423,
"grad_norm": 0.4877452850341797,
"learning_rate": 6.815566086631016e-05,
"loss": 2.9381,
"step": 6010
},
{
"epoch": 0.444280442804428,
"grad_norm": 0.4852764308452606,
"learning_rate": 6.80355858579959e-05,
"loss": 2.9431,
"step": 6020
},
{
"epoch": 0.44501845018450187,
"grad_norm": 0.4775632321834564,
"learning_rate": 6.791539115739879e-05,
"loss": 2.9923,
"step": 6030
},
{
"epoch": 0.44575645756457566,
"grad_norm": 0.48804882168769836,
"learning_rate": 6.779507756218509e-05,
"loss": 3.0321,
"step": 6040
},
{
"epoch": 0.44649446494464945,
"grad_norm": 0.4770827293395996,
"learning_rate": 6.76746458708101e-05,
"loss": 3.0004,
"step": 6050
},
{
"epoch": 0.44723247232472324,
"grad_norm": 0.47312870621681213,
"learning_rate": 6.75540968825128e-05,
"loss": 2.9975,
"step": 6060
},
{
"epoch": 0.44797047970479703,
"grad_norm": 0.48013314604759216,
"learning_rate": 6.74334313973107e-05,
"loss": 2.9666,
"step": 6070
},
{
"epoch": 0.4487084870848708,
"grad_norm": 0.4521431624889374,
"learning_rate": 6.731265021599436e-05,
"loss": 2.8592,
"step": 6080
},
{
"epoch": 0.44944649446494467,
"grad_norm": 0.4653100073337555,
"learning_rate": 6.719175414012219e-05,
"loss": 2.9367,
"step": 6090
},
{
"epoch": 0.45018450184501846,
"grad_norm": 0.5198903679847717,
"learning_rate": 6.707074397201508e-05,
"loss": 3.014,
"step": 6100
},
{
"epoch": 0.45092250922509225,
"grad_norm": 0.4655381441116333,
"learning_rate": 6.694962051475107e-05,
"loss": 2.9422,
"step": 6110
},
{
"epoch": 0.45166051660516604,
"grad_norm": 0.4614551067352295,
"learning_rate": 6.682838457216009e-05,
"loss": 2.9474,
"step": 6120
},
{
"epoch": 0.45239852398523983,
"grad_norm": 0.4937768876552582,
"learning_rate": 6.67070369488185e-05,
"loss": 2.8953,
"step": 6130
},
{
"epoch": 0.4531365313653137,
"grad_norm": 0.4759802222251892,
"learning_rate": 6.65855784500439e-05,
"loss": 2.9553,
"step": 6140
},
{
"epoch": 0.45387453874538747,
"grad_norm": 0.519924521446228,
"learning_rate": 6.646400988188964e-05,
"loss": 2.8839,
"step": 6150
},
{
"epoch": 0.45461254612546126,
"grad_norm": 0.46175694465637207,
"learning_rate": 6.63423320511396e-05,
"loss": 2.9878,
"step": 6160
},
{
"epoch": 0.45535055350553505,
"grad_norm": 0.48847445845603943,
"learning_rate": 6.622054576530274e-05,
"loss": 2.9601,
"step": 6170
},
{
"epoch": 0.45608856088560884,
"grad_norm": 0.46752119064331055,
"learning_rate": 6.609865183260778e-05,
"loss": 2.9375,
"step": 6180
},
{
"epoch": 0.45682656826568263,
"grad_norm": 0.48789575695991516,
"learning_rate": 6.597665106199783e-05,
"loss": 2.9675,
"step": 6190
},
{
"epoch": 0.4575645756457565,
"grad_norm": 0.46002650260925293,
"learning_rate": 6.585454426312506e-05,
"loss": 2.9194,
"step": 6200
},
{
"epoch": 0.45830258302583027,
"grad_norm": 0.4882054924964905,
"learning_rate": 6.573233224634524e-05,
"loss": 2.931,
"step": 6210
},
{
"epoch": 0.45904059040590406,
"grad_norm": 0.4962427318096161,
"learning_rate": 6.561001582271245e-05,
"loss": 2.9639,
"step": 6220
},
{
"epoch": 0.45977859778597785,
"grad_norm": 0.47860512137413025,
"learning_rate": 6.548759580397363e-05,
"loss": 2.9726,
"step": 6230
},
{
"epoch": 0.46051660516605164,
"grad_norm": 0.4823954701423645,
"learning_rate": 6.536507300256327e-05,
"loss": 2.9363,
"step": 6240
},
{
"epoch": 0.4612546125461255,
"grad_norm": 0.46530622243881226,
"learning_rate": 6.524244823159794e-05,
"loss": 2.9696,
"step": 6250
},
{
"epoch": 0.4619926199261993,
"grad_norm": 0.4861395061016083,
"learning_rate": 6.511972230487091e-05,
"loss": 2.9816,
"step": 6260
},
{
"epoch": 0.46273062730627307,
"grad_norm": 0.47099757194519043,
"learning_rate": 6.499689603684682e-05,
"loss": 2.8812,
"step": 6270
},
{
"epoch": 0.46346863468634686,
"grad_norm": 0.47105422616004944,
"learning_rate": 6.487397024265616e-05,
"loss": 2.8715,
"step": 6280
},
{
"epoch": 0.46420664206642065,
"grad_norm": 0.4647127091884613,
"learning_rate": 6.475094573808993e-05,
"loss": 2.972,
"step": 6290
},
{
"epoch": 0.46494464944649444,
"grad_norm": 0.4713263213634491,
"learning_rate": 6.462782333959429e-05,
"loss": 2.9297,
"step": 6300
},
{
"epoch": 0.4656826568265683,
"grad_norm": 0.4704754650592804,
"learning_rate": 6.450460386426495e-05,
"loss": 2.9489,
"step": 6310
},
{
"epoch": 0.4664206642066421,
"grad_norm": 0.49764499068260193,
"learning_rate": 6.438128812984199e-05,
"loss": 2.8814,
"step": 6320
},
{
"epoch": 0.46715867158671587,
"grad_norm": 0.46612176299095154,
"learning_rate": 6.425787695470419e-05,
"loss": 2.9663,
"step": 6330
},
{
"epoch": 0.46789667896678966,
"grad_norm": 0.46676209568977356,
"learning_rate": 6.41343711578638e-05,
"loss": 2.9843,
"step": 6340
},
{
"epoch": 0.46863468634686345,
"grad_norm": 0.45879995822906494,
"learning_rate": 6.401077155896099e-05,
"loss": 2.8991,
"step": 6350
},
{
"epoch": 0.4693726937269373,
"grad_norm": 0.4595896303653717,
"learning_rate": 6.388707897825846e-05,
"loss": 2.9603,
"step": 6360
},
{
"epoch": 0.4701107011070111,
"grad_norm": 0.47197359800338745,
"learning_rate": 6.376329423663596e-05,
"loss": 3.0058,
"step": 6370
},
{
"epoch": 0.4708487084870849,
"grad_norm": 0.4487576186656952,
"learning_rate": 6.363941815558484e-05,
"loss": 2.9126,
"step": 6380
},
{
"epoch": 0.47158671586715867,
"grad_norm": 0.45560458302497864,
"learning_rate": 6.35154515572027e-05,
"loss": 2.9979,
"step": 6390
},
{
"epoch": 0.47232472324723246,
"grad_norm": 0.4601997435092926,
"learning_rate": 6.339139526418778e-05,
"loss": 2.8166,
"step": 6400
},
{
"epoch": 0.47306273062730625,
"grad_norm": 0.48877766728401184,
"learning_rate": 6.32672500998336e-05,
"loss": 2.8798,
"step": 6410
},
{
"epoch": 0.4738007380073801,
"grad_norm": 0.4835923910140991,
"learning_rate": 6.314301688802347e-05,
"loss": 2.9273,
"step": 6420
},
{
"epoch": 0.4745387453874539,
"grad_norm": 0.465264230966568,
"learning_rate": 6.301869645322498e-05,
"loss": 2.9399,
"step": 6430
},
{
"epoch": 0.4752767527675277,
"grad_norm": 0.49252355098724365,
"learning_rate": 6.289428962048467e-05,
"loss": 2.9608,
"step": 6440
},
{
"epoch": 0.47601476014760147,
"grad_norm": 0.48788875341415405,
"learning_rate": 6.276979721542239e-05,
"loss": 2.9896,
"step": 6450
},
{
"epoch": 0.47675276752767526,
"grad_norm": 0.4745902121067047,
"learning_rate": 6.264522006422586e-05,
"loss": 2.9076,
"step": 6460
},
{
"epoch": 0.4774907749077491,
"grad_norm": 0.47580885887145996,
"learning_rate": 6.252055899364525e-05,
"loss": 2.899,
"step": 6470
},
{
"epoch": 0.4782287822878229,
"grad_norm": 0.47672221064567566,
"learning_rate": 6.239581483098766e-05,
"loss": 2.9338,
"step": 6480
},
{
"epoch": 0.4789667896678967,
"grad_norm": 0.46901679039001465,
"learning_rate": 6.227098840411166e-05,
"loss": 2.9081,
"step": 6490
},
{
"epoch": 0.4797047970479705,
"grad_norm": 0.45821747183799744,
"learning_rate": 6.214608054142167e-05,
"loss": 2.9717,
"step": 6500
},
{
"epoch": 0.48044280442804427,
"grad_norm": 0.457815945148468,
"learning_rate": 6.202109207186263e-05,
"loss": 2.9594,
"step": 6510
},
{
"epoch": 0.48118081180811806,
"grad_norm": 0.45802658796310425,
"learning_rate": 6.189602382491439e-05,
"loss": 2.958,
"step": 6520
},
{
"epoch": 0.4819188191881919,
"grad_norm": 0.47702470421791077,
"learning_rate": 6.177087663058626e-05,
"loss": 2.9481,
"step": 6530
},
{
"epoch": 0.4826568265682657,
"grad_norm": 0.4765585660934448,
"learning_rate": 6.164565131941147e-05,
"loss": 2.9139,
"step": 6540
},
{
"epoch": 0.4833948339483395,
"grad_norm": 0.49875739216804504,
"learning_rate": 6.152034872244166e-05,
"loss": 2.9726,
"step": 6550
},
{
"epoch": 0.4841328413284133,
"grad_norm": 0.46083393692970276,
"learning_rate": 6.13949696712414e-05,
"loss": 2.9462,
"step": 6560
},
{
"epoch": 0.48487084870848707,
"grad_norm": 0.4647446274757385,
"learning_rate": 6.126951499788261e-05,
"loss": 2.9349,
"step": 6570
},
{
"epoch": 0.48560885608856086,
"grad_norm": 0.4930126667022705,
"learning_rate": 6.114398553493908e-05,
"loss": 2.9763,
"step": 6580
},
{
"epoch": 0.4863468634686347,
"grad_norm": 0.4873722791671753,
"learning_rate": 6.1018382115480985e-05,
"loss": 2.9322,
"step": 6590
},
{
"epoch": 0.4870848708487085,
"grad_norm": 0.4486652910709381,
"learning_rate": 6.089270557306923e-05,
"loss": 2.8796,
"step": 6600
},
{
"epoch": 0.4878228782287823,
"grad_norm": 0.482166588306427,
"learning_rate": 6.076695674175007e-05,
"loss": 2.9542,
"step": 6610
},
{
"epoch": 0.4885608856088561,
"grad_norm": 0.4913167953491211,
"learning_rate": 6.0641136456049454e-05,
"loss": 3.0476,
"step": 6620
},
{
"epoch": 0.48929889298892987,
"grad_norm": 0.4978322982788086,
"learning_rate": 6.051524555096754e-05,
"loss": 2.8936,
"step": 6630
},
{
"epoch": 0.4900369003690037,
"grad_norm": 0.4421325922012329,
"learning_rate": 6.038928486197316e-05,
"loss": 2.9131,
"step": 6640
},
{
"epoch": 0.4907749077490775,
"grad_norm": 0.4662306308746338,
"learning_rate": 6.02632552249983e-05,
"loss": 2.8394,
"step": 6650
},
{
"epoch": 0.4915129151291513,
"grad_norm": 0.5267830491065979,
"learning_rate": 6.0137157476432424e-05,
"loss": 2.8703,
"step": 6660
},
{
"epoch": 0.4922509225092251,
"grad_norm": 0.509088397026062,
"learning_rate": 6.001099245311711e-05,
"loss": 2.9691,
"step": 6670
},
{
"epoch": 0.4929889298892989,
"grad_norm": 0.46723711490631104,
"learning_rate": 5.988476099234033e-05,
"loss": 2.9496,
"step": 6680
},
{
"epoch": 0.49372693726937267,
"grad_norm": 0.4566686153411865,
"learning_rate": 5.975846393183101e-05,
"loss": 2.8571,
"step": 6690
},
{
"epoch": 0.4944649446494465,
"grad_norm": 0.4769027829170227,
"learning_rate": 5.963210210975343e-05,
"loss": 2.898,
"step": 6700
},
{
"epoch": 0.4952029520295203,
"grad_norm": 0.4787648320198059,
"learning_rate": 5.95056763647016e-05,
"loss": 2.9649,
"step": 6710
},
{
"epoch": 0.4959409594095941,
"grad_norm": 0.45179930329322815,
"learning_rate": 5.9379187535693804e-05,
"loss": 2.9201,
"step": 6720
},
{
"epoch": 0.4966789667896679,
"grad_norm": 0.4381027817726135,
"learning_rate": 5.925263646216697e-05,
"loss": 2.9402,
"step": 6730
},
{
"epoch": 0.4974169741697417,
"grad_norm": 0.49445804953575134,
"learning_rate": 5.912602398397111e-05,
"loss": 2.9305,
"step": 6740
},
{
"epoch": 0.4981549815498155,
"grad_norm": 0.4826495349407196,
"learning_rate": 5.8999350941363726e-05,
"loss": 2.9346,
"step": 6750
},
{
"epoch": 0.4988929889298893,
"grad_norm": 0.4974125921726227,
"learning_rate": 5.887261817500427e-05,
"loss": 2.9743,
"step": 6760
},
{
"epoch": 0.4996309963099631,
"grad_norm": 0.47447288036346436,
"learning_rate": 5.874582652594854e-05,
"loss": 2.9399,
"step": 6770
},
{
"epoch": 0.5003690036900369,
"grad_norm": 0.48605871200561523,
"learning_rate": 5.861897683564312e-05,
"loss": 2.9667,
"step": 6780
},
{
"epoch": 0.5011070110701107,
"grad_norm": 0.4562762379646301,
"learning_rate": 5.849206994591976e-05,
"loss": 2.9355,
"step": 6790
},
{
"epoch": 0.5018450184501845,
"grad_norm": 0.4724028706550598,
"learning_rate": 5.8365106698989834e-05,
"loss": 2.8938,
"step": 6800
},
{
"epoch": 0.5025830258302583,
"grad_norm": 0.4404136538505554,
"learning_rate": 5.82380879374387e-05,
"loss": 2.8332,
"step": 6810
},
{
"epoch": 0.5033210332103321,
"grad_norm": 0.4685560464859009,
"learning_rate": 5.8111014504220165e-05,
"loss": 2.9792,
"step": 6820
},
{
"epoch": 0.5040590405904058,
"grad_norm": 0.47112590074539185,
"learning_rate": 5.7983887242650846e-05,
"loss": 2.9933,
"step": 6830
},
{
"epoch": 0.5047970479704798,
"grad_norm": 0.46272197365760803,
"learning_rate": 5.78567069964046e-05,
"loss": 2.9916,
"step": 6840
},
{
"epoch": 0.5055350553505535,
"grad_norm": 0.47110989689826965,
"learning_rate": 5.772947460950688e-05,
"loss": 2.8869,
"step": 6850
},
{
"epoch": 0.5062730627306273,
"grad_norm": 0.47916916012763977,
"learning_rate": 5.760219092632924e-05,
"loss": 2.9576,
"step": 6860
},
{
"epoch": 0.5070110701107011,
"grad_norm": 0.47247427701950073,
"learning_rate": 5.7474856791583576e-05,
"loss": 2.9433,
"step": 6870
},
{
"epoch": 0.5077490774907749,
"grad_norm": 0.4856591820716858,
"learning_rate": 5.7347473050316636e-05,
"loss": 2.983,
"step": 6880
},
{
"epoch": 0.5084870848708487,
"grad_norm": 0.4498710036277771,
"learning_rate": 5.722004054790442e-05,
"loss": 2.95,
"step": 6890
},
{
"epoch": 0.5092250922509225,
"grad_norm": 0.4407157003879547,
"learning_rate": 5.7092560130046466e-05,
"loss": 2.9004,
"step": 6900
},
{
"epoch": 0.5099630996309963,
"grad_norm": 0.4676019847393036,
"learning_rate": 5.696503264276035e-05,
"loss": 2.8584,
"step": 6910
},
{
"epoch": 0.5107011070110701,
"grad_norm": 0.44521570205688477,
"learning_rate": 5.683745893237597e-05,
"loss": 2.9214,
"step": 6920
},
{
"epoch": 0.5114391143911439,
"grad_norm": 0.4693831503391266,
"learning_rate": 5.670983984553003e-05,
"loss": 2.9721,
"step": 6930
},
{
"epoch": 0.5121771217712177,
"grad_norm": 0.43683314323425293,
"learning_rate": 5.6582176229160355e-05,
"loss": 2.8837,
"step": 6940
},
{
"epoch": 0.5129151291512916,
"grad_norm": 0.4462457299232483,
"learning_rate": 5.645446893050029e-05,
"loss": 2.8014,
"step": 6950
},
{
"epoch": 0.5136531365313654,
"grad_norm": 0.46673473715782166,
"learning_rate": 5.632671879707307e-05,
"loss": 2.8542,
"step": 6960
},
{
"epoch": 0.5143911439114391,
"grad_norm": 0.5018209218978882,
"learning_rate": 5.619892667668618e-05,
"loss": 2.9344,
"step": 6970
},
{
"epoch": 0.5151291512915129,
"grad_norm": 0.4942212700843811,
"learning_rate": 5.607109341742579e-05,
"loss": 2.9002,
"step": 6980
},
{
"epoch": 0.5158671586715867,
"grad_norm": 0.4789501428604126,
"learning_rate": 5.5943219867651086e-05,
"loss": 2.8955,
"step": 6990
},
{
"epoch": 0.5166051660516605,
"grad_norm": 0.44573846459388733,
"learning_rate": 5.58153068759886e-05,
"loss": 2.9184,
"step": 7000
},
{
"epoch": 0.5173431734317343,
"grad_norm": 0.4906388819217682,
"learning_rate": 5.568735529132665e-05,
"loss": 2.9369,
"step": 7010
},
{
"epoch": 0.5180811808118081,
"grad_norm": 0.44844797253608704,
"learning_rate": 5.555936596280966e-05,
"loss": 2.9435,
"step": 7020
},
{
"epoch": 0.5188191881918819,
"grad_norm": 0.46517252922058105,
"learning_rate": 5.5431339739832545e-05,
"loss": 2.9933,
"step": 7030
},
{
"epoch": 0.5195571955719557,
"grad_norm": 0.4549432396888733,
"learning_rate": 5.530327747203506e-05,
"loss": 2.8739,
"step": 7040
},
{
"epoch": 0.5202952029520295,
"grad_norm": 0.47701096534729004,
"learning_rate": 5.51751800092962e-05,
"loss": 2.9088,
"step": 7050
},
{
"epoch": 0.5210332103321034,
"grad_norm": 0.489654541015625,
"learning_rate": 5.50470482017285e-05,
"loss": 2.9574,
"step": 7060
},
{
"epoch": 0.5217712177121772,
"grad_norm": 0.4661862850189209,
"learning_rate": 5.491888289967241e-05,
"loss": 2.9482,
"step": 7070
},
{
"epoch": 0.522509225092251,
"grad_norm": 0.446463406085968,
"learning_rate": 5.4790684953690706e-05,
"loss": 2.9176,
"step": 7080
},
{
"epoch": 0.5232472324723247,
"grad_norm": 0.4751204550266266,
"learning_rate": 5.466245521456278e-05,
"loss": 2.924,
"step": 7090
},
{
"epoch": 0.5239852398523985,
"grad_norm": 0.5041395425796509,
"learning_rate": 5.4534194533279e-05,
"loss": 2.8624,
"step": 7100
},
{
"epoch": 0.5247232472324723,
"grad_norm": 0.4631516635417938,
"learning_rate": 5.4405903761035124e-05,
"loss": 2.9072,
"step": 7110
},
{
"epoch": 0.5254612546125461,
"grad_norm": 0.45753976702690125,
"learning_rate": 5.427758374922658e-05,
"loss": 2.9332,
"step": 7120
},
{
"epoch": 0.5261992619926199,
"grad_norm": 0.4684479236602783,
"learning_rate": 5.414923534944283e-05,
"loss": 2.9017,
"step": 7130
},
{
"epoch": 0.5269372693726937,
"grad_norm": 0.46777448058128357,
"learning_rate": 5.4020859413461756e-05,
"loss": 2.9231,
"step": 7140
},
{
"epoch": 0.5276752767527675,
"grad_norm": 0.47089943289756775,
"learning_rate": 5.389245679324398e-05,
"loss": 2.9215,
"step": 7150
},
{
"epoch": 0.5284132841328413,
"grad_norm": 0.44447311758995056,
"learning_rate": 5.376402834092721e-05,
"loss": 2.9281,
"step": 7160
},
{
"epoch": 0.5291512915129152,
"grad_norm": 0.47463953495025635,
"learning_rate": 5.363557490882057e-05,
"loss": 2.947,
"step": 7170
},
{
"epoch": 0.529889298892989,
"grad_norm": 0.47504737973213196,
"learning_rate": 5.350709734939897e-05,
"loss": 3.0103,
"step": 7180
},
{
"epoch": 0.5306273062730628,
"grad_norm": 0.472151517868042,
"learning_rate": 5.337859651529746e-05,
"loss": 2.966,
"step": 7190
},
{
"epoch": 0.5313653136531366,
"grad_norm": 0.44552987813949585,
"learning_rate": 5.325007325930554e-05,
"loss": 2.8962,
"step": 7200
},
{
"epoch": 0.5321033210332103,
"grad_norm": 0.487582266330719,
"learning_rate": 5.3121528434361524e-05,
"loss": 2.9548,
"step": 7210
},
{
"epoch": 0.5328413284132841,
"grad_norm": 0.47288230061531067,
"learning_rate": 5.299296289354681e-05,
"loss": 2.8969,
"step": 7220
},
{
"epoch": 0.5335793357933579,
"grad_norm": 0.4963250756263733,
"learning_rate": 5.2864377490080306e-05,
"loss": 2.9785,
"step": 7230
},
{
"epoch": 0.5343173431734317,
"grad_norm": 0.4519381821155548,
"learning_rate": 5.2735773077312814e-05,
"loss": 2.9112,
"step": 7240
},
{
"epoch": 0.5350553505535055,
"grad_norm": 0.47766226530075073,
"learning_rate": 5.2607150508721195e-05,
"loss": 2.8749,
"step": 7250
},
{
"epoch": 0.5357933579335793,
"grad_norm": 0.4712168872356415,
"learning_rate": 5.24785106379028e-05,
"loss": 2.9148,
"step": 7260
},
{
"epoch": 0.5365313653136531,
"grad_norm": 0.44543230533599854,
"learning_rate": 5.234985431856988e-05,
"loss": 2.9281,
"step": 7270
},
{
"epoch": 0.537269372693727,
"grad_norm": 0.46235865354537964,
"learning_rate": 5.2221182404543754e-05,
"loss": 2.9294,
"step": 7280
},
{
"epoch": 0.5380073800738008,
"grad_norm": 0.4579477608203888,
"learning_rate": 5.2092495749749346e-05,
"loss": 2.9286,
"step": 7290
},
{
"epoch": 0.5387453874538746,
"grad_norm": 0.4533149302005768,
"learning_rate": 5.196379520820929e-05,
"loss": 2.9063,
"step": 7300
},
{
"epoch": 0.5394833948339484,
"grad_norm": 0.48128604888916016,
"learning_rate": 5.183508163403845e-05,
"loss": 2.8985,
"step": 7310
},
{
"epoch": 0.5402214022140222,
"grad_norm": 0.46598076820373535,
"learning_rate": 5.170635588143816e-05,
"loss": 2.9074,
"step": 7320
},
{
"epoch": 0.5409594095940959,
"grad_norm": 0.4706079363822937,
"learning_rate": 5.157761880469058e-05,
"loss": 2.9216,
"step": 7330
},
{
"epoch": 0.5416974169741697,
"grad_norm": 0.45854324102401733,
"learning_rate": 5.144887125815301e-05,
"loss": 2.9771,
"step": 7340
},
{
"epoch": 0.5424354243542435,
"grad_norm": 0.4575222134590149,
"learning_rate": 5.132011409625224e-05,
"loss": 2.878,
"step": 7350
},
{
"epoch": 0.5431734317343173,
"grad_norm": 0.45603683590888977,
"learning_rate": 5.1191348173478884e-05,
"loss": 2.9328,
"step": 7360
},
{
"epoch": 0.5439114391143911,
"grad_norm": 0.47662872076034546,
"learning_rate": 5.1062574344381686e-05,
"loss": 2.9483,
"step": 7370
},
{
"epoch": 0.5446494464944649,
"grad_norm": 0.4564341604709625,
"learning_rate": 5.093379346356185e-05,
"loss": 2.8084,
"step": 7380
},
{
"epoch": 0.5453874538745388,
"grad_norm": 0.4610985219478607,
"learning_rate": 5.080500638566741e-05,
"loss": 2.9255,
"step": 7390
},
{
"epoch": 0.5461254612546126,
"grad_norm": 0.46059536933898926,
"learning_rate": 5.0676213965387475e-05,
"loss": 2.851,
"step": 7400
},
{
"epoch": 0.5468634686346864,
"grad_norm": 0.482048362493515,
"learning_rate": 5.0547417057446665e-05,
"loss": 2.9626,
"step": 7410
},
{
"epoch": 0.5476014760147602,
"grad_norm": 0.4469466209411621,
"learning_rate": 5.0418616516599346e-05,
"loss": 2.8261,
"step": 7420
},
{
"epoch": 0.548339483394834,
"grad_norm": 0.4489482343196869,
"learning_rate": 5.028981319762399e-05,
"loss": 2.9388,
"step": 7430
},
{
"epoch": 0.5490774907749078,
"grad_norm": 0.4895458221435547,
"learning_rate": 5.016100795531754e-05,
"loss": 2.9598,
"step": 7440
},
{
"epoch": 0.5498154981549815,
"grad_norm": 0.45136043429374695,
"learning_rate": 5.003220164448967e-05,
"loss": 2.8466,
"step": 7450
},
{
"epoch": 0.5505535055350553,
"grad_norm": 0.4319990873336792,
"learning_rate": 4.990339511995718e-05,
"loss": 2.8589,
"step": 7460
},
{
"epoch": 0.5512915129151291,
"grad_norm": 0.4822845458984375,
"learning_rate": 4.977458923653823e-05,
"loss": 2.8766,
"step": 7470
},
{
"epoch": 0.5520295202952029,
"grad_norm": 0.4683190882205963,
"learning_rate": 4.9645784849046786e-05,
"loss": 2.9471,
"step": 7480
},
{
"epoch": 0.5527675276752767,
"grad_norm": 0.4755018353462219,
"learning_rate": 4.9516982812286854e-05,
"loss": 2.9336,
"step": 7490
},
{
"epoch": 0.5535055350553506,
"grad_norm": 0.4847009778022766,
"learning_rate": 4.938818398104685e-05,
"loss": 2.8928,
"step": 7500
},
{
"epoch": 0.5542435424354244,
"grad_norm": 0.49205484986305237,
"learning_rate": 4.92593892100939e-05,
"loss": 2.9413,
"step": 7510
},
{
"epoch": 0.5549815498154982,
"grad_norm": 0.4603287875652313,
"learning_rate": 4.913059935416822e-05,
"loss": 2.8814,
"step": 7520
},
{
"epoch": 0.555719557195572,
"grad_norm": 0.4724648594856262,
"learning_rate": 4.900181526797737e-05,
"loss": 2.9493,
"step": 7530
},
{
"epoch": 0.5564575645756458,
"grad_norm": 0.6270569562911987,
"learning_rate": 4.887303780619066e-05,
"loss": 2.9201,
"step": 7540
},
{
"epoch": 0.5571955719557196,
"grad_norm": 0.4619079828262329,
"learning_rate": 4.874426782343338e-05,
"loss": 2.915,
"step": 7550
},
{
"epoch": 0.5579335793357934,
"grad_norm": 0.45699045062065125,
"learning_rate": 4.861550617428122e-05,
"loss": 2.914,
"step": 7560
},
{
"epoch": 0.5586715867158671,
"grad_norm": 0.46511203050613403,
"learning_rate": 4.8486753713254586e-05,
"loss": 2.8837,
"step": 7570
},
{
"epoch": 0.5594095940959409,
"grad_norm": 0.4465058147907257,
"learning_rate": 4.835801129481287e-05,
"loss": 2.9087,
"step": 7580
},
{
"epoch": 0.5601476014760147,
"grad_norm": 0.4666641652584076,
"learning_rate": 4.8229279773348845e-05,
"loss": 2.9486,
"step": 7590
},
{
"epoch": 0.5608856088560885,
"grad_norm": 0.4582604765892029,
"learning_rate": 4.810056000318293e-05,
"loss": 2.9275,
"step": 7600
},
{
"epoch": 0.5616236162361624,
"grad_norm": 0.4589548408985138,
"learning_rate": 4.7971852838557565e-05,
"loss": 2.8683,
"step": 7610
},
{
"epoch": 0.5623616236162362,
"grad_norm": 0.4380606412887573,
"learning_rate": 4.78431591336316e-05,
"loss": 2.8368,
"step": 7620
},
{
"epoch": 0.56309963099631,
"grad_norm": 0.44517070055007935,
"learning_rate": 4.771447974247449e-05,
"loss": 2.8804,
"step": 7630
},
{
"epoch": 0.5638376383763838,
"grad_norm": 0.46472036838531494,
"learning_rate": 4.7585815519060694e-05,
"loss": 2.8983,
"step": 7640
},
{
"epoch": 0.5645756457564576,
"grad_norm": 0.47114098072052,
"learning_rate": 4.7457167317264064e-05,
"loss": 2.9284,
"step": 7650
},
{
"epoch": 0.5653136531365314,
"grad_norm": 0.4522678256034851,
"learning_rate": 4.732853599085207e-05,
"loss": 2.8971,
"step": 7660
},
{
"epoch": 0.5660516605166052,
"grad_norm": 0.46045982837677,
"learning_rate": 4.719992239348024e-05,
"loss": 2.844,
"step": 7670
},
{
"epoch": 0.566789667896679,
"grad_norm": 0.4543171525001526,
"learning_rate": 4.7071327378686386e-05,
"loss": 2.9121,
"step": 7680
},
{
"epoch": 0.5675276752767527,
"grad_norm": 0.48567166924476624,
"learning_rate": 4.6942751799885054e-05,
"loss": 2.9274,
"step": 7690
},
{
"epoch": 0.5682656826568265,
"grad_norm": 0.4700704514980316,
"learning_rate": 4.681419651036177e-05,
"loss": 2.9872,
"step": 7700
},
{
"epoch": 0.5690036900369003,
"grad_norm": 0.44953039288520813,
"learning_rate": 4.6685662363267415e-05,
"loss": 2.873,
"step": 7710
},
{
"epoch": 0.5697416974169742,
"grad_norm": 0.46205776929855347,
"learning_rate": 4.655715021161258e-05,
"loss": 2.8282,
"step": 7720
},
{
"epoch": 0.570479704797048,
"grad_norm": 0.4394710063934326,
"learning_rate": 4.6428660908261864e-05,
"loss": 2.8753,
"step": 7730
},
{
"epoch": 0.5712177121771218,
"grad_norm": 0.43995216488838196,
"learning_rate": 4.6300195305928243e-05,
"loss": 2.7643,
"step": 7740
},
{
"epoch": 0.5719557195571956,
"grad_norm": 0.4612707495689392,
"learning_rate": 4.617175425716741e-05,
"loss": 2.8683,
"step": 7750
},
{
"epoch": 0.5726937269372694,
"grad_norm": 0.4660702347755432,
"learning_rate": 4.604333861437207e-05,
"loss": 2.9493,
"step": 7760
},
{
"epoch": 0.5734317343173432,
"grad_norm": 0.47154900431632996,
"learning_rate": 4.591494922976637e-05,
"loss": 2.9493,
"step": 7770
},
{
"epoch": 0.574169741697417,
"grad_norm": 0.4602459967136383,
"learning_rate": 4.578658695540018e-05,
"loss": 2.9144,
"step": 7780
},
{
"epoch": 0.5749077490774908,
"grad_norm": 0.4484480917453766,
"learning_rate": 4.5658252643143435e-05,
"loss": 2.9145,
"step": 7790
},
{
"epoch": 0.5756457564575646,
"grad_norm": 0.469936341047287,
"learning_rate": 4.552994714468055e-05,
"loss": 2.8947,
"step": 7800
},
{
"epoch": 0.5763837638376383,
"grad_norm": 0.48601603507995605,
"learning_rate": 4.5401671311504616e-05,
"loss": 2.9164,
"step": 7810
},
{
"epoch": 0.5771217712177121,
"grad_norm": 0.46561533212661743,
"learning_rate": 4.5273425994912e-05,
"loss": 2.8656,
"step": 7820
},
{
"epoch": 0.5778597785977859,
"grad_norm": 0.48168033361434937,
"learning_rate": 4.5145212045996446e-05,
"loss": 2.8667,
"step": 7830
},
{
"epoch": 0.5785977859778598,
"grad_norm": 0.45122450590133667,
"learning_rate": 4.5017030315643536e-05,
"loss": 2.9668,
"step": 7840
},
{
"epoch": 0.5793357933579336,
"grad_norm": 0.4591752290725708,
"learning_rate": 4.4888881654525057e-05,
"loss": 2.8924,
"step": 7850
},
{
"epoch": 0.5800738007380074,
"grad_norm": 0.4341951906681061,
"learning_rate": 4.4760766913093325e-05,
"loss": 2.8232,
"step": 7860
},
{
"epoch": 0.5808118081180812,
"grad_norm": 0.46191418170928955,
"learning_rate": 4.463268694157556e-05,
"loss": 2.9198,
"step": 7870
},
{
"epoch": 0.581549815498155,
"grad_norm": 0.43734246492385864,
"learning_rate": 4.450464258996822e-05,
"loss": 2.8755,
"step": 7880
},
{
"epoch": 0.5822878228782288,
"grad_norm": 0.4456181228160858,
"learning_rate": 4.437663470803137e-05,
"loss": 2.8545,
"step": 7890
},
{
"epoch": 0.5830258302583026,
"grad_norm": 0.46855318546295166,
"learning_rate": 4.4248664145283054e-05,
"loss": 2.8658,
"step": 7900
},
{
"epoch": 0.5837638376383764,
"grad_norm": 0.4666096568107605,
"learning_rate": 4.4120731750993645e-05,
"loss": 2.9317,
"step": 7910
},
{
"epoch": 0.5845018450184502,
"grad_norm": 0.46038341522216797,
"learning_rate": 4.3992838374180234e-05,
"loss": 2.9288,
"step": 7920
},
{
"epoch": 0.5852398523985239,
"grad_norm": 0.47123417258262634,
"learning_rate": 4.386498486360094e-05,
"loss": 2.9348,
"step": 7930
},
{
"epoch": 0.5859778597785977,
"grad_norm": 0.43836262822151184,
"learning_rate": 4.373717206774935e-05,
"loss": 2.8594,
"step": 7940
},
{
"epoch": 0.5867158671586716,
"grad_norm": 0.46412384510040283,
"learning_rate": 4.360940083484881e-05,
"loss": 2.9131,
"step": 7950
},
{
"epoch": 0.5874538745387454,
"grad_norm": 0.43723878264427185,
"learning_rate": 4.3481672012846865e-05,
"loss": 2.9116,
"step": 7960
},
{
"epoch": 0.5881918819188192,
"grad_norm": 0.46796315908432007,
"learning_rate": 4.335398644940957e-05,
"loss": 2.9236,
"step": 7970
},
{
"epoch": 0.588929889298893,
"grad_norm": 0.4761864244937897,
"learning_rate": 4.322634499191594e-05,
"loss": 2.8988,
"step": 7980
},
{
"epoch": 0.5896678966789668,
"grad_norm": 0.4379028081893921,
"learning_rate": 4.309874848745225e-05,
"loss": 2.851,
"step": 7990
},
{
"epoch": 0.5904059040590406,
"grad_norm": 0.4515070617198944,
"learning_rate": 4.297119778280645e-05,
"loss": 2.8823,
"step": 8000
},
{
"epoch": 0.5911439114391144,
"grad_norm": 0.456480473279953,
"learning_rate": 4.2843693724462555e-05,
"loss": 2.9163,
"step": 8010
},
{
"epoch": 0.5918819188191882,
"grad_norm": 0.4556421935558319,
"learning_rate": 4.271623715859501e-05,
"loss": 2.8997,
"step": 8020
},
{
"epoch": 0.592619926199262,
"grad_norm": 0.4618515372276306,
"learning_rate": 4.2588828931063086e-05,
"loss": 2.9223,
"step": 8030
},
{
"epoch": 0.5933579335793358,
"grad_norm": 0.4617830812931061,
"learning_rate": 4.246146988740525e-05,
"loss": 2.8476,
"step": 8040
},
{
"epoch": 0.5940959409594095,
"grad_norm": 0.43721622228622437,
"learning_rate": 4.233416087283354e-05,
"loss": 2.9253,
"step": 8050
},
{
"epoch": 0.5948339483394834,
"grad_norm": 0.43407517671585083,
"learning_rate": 4.2206902732228015e-05,
"loss": 2.9307,
"step": 8060
},
{
"epoch": 0.5955719557195572,
"grad_norm": 0.4590218663215637,
"learning_rate": 4.207969631013109e-05,
"loss": 2.9194,
"step": 8070
},
{
"epoch": 0.596309963099631,
"grad_norm": 0.45232662558555603,
"learning_rate": 4.195254245074196e-05,
"loss": 2.814,
"step": 8080
},
{
"epoch": 0.5970479704797048,
"grad_norm": 0.47659075260162354,
"learning_rate": 4.1825441997911016e-05,
"loss": 2.8991,
"step": 8090
},
{
"epoch": 0.5977859778597786,
"grad_norm": 0.4390777349472046,
"learning_rate": 4.169839579513415e-05,
"loss": 2.8377,
"step": 8100
},
{
"epoch": 0.5985239852398524,
"grad_norm": 0.44624418020248413,
"learning_rate": 4.1571404685547265e-05,
"loss": 2.9126,
"step": 8110
},
{
"epoch": 0.5992619926199262,
"grad_norm": 0.4411090314388275,
"learning_rate": 4.14444695119207e-05,
"loss": 2.8661,
"step": 8120
},
{
"epoch": 0.6,
"grad_norm": 0.45906946063041687,
"learning_rate": 4.131759111665349e-05,
"loss": 2.8862,
"step": 8130
},
{
"epoch": 0.6007380073800738,
"grad_norm": 0.450738787651062,
"learning_rate": 4.1190770341767884e-05,
"loss": 2.8788,
"step": 8140
},
{
"epoch": 0.6014760147601476,
"grad_norm": 0.4635327458381653,
"learning_rate": 4.1064008028903766e-05,
"loss": 2.8856,
"step": 8150
},
{
"epoch": 0.6022140221402214,
"grad_norm": 0.46390798687934875,
"learning_rate": 4.093730501931301e-05,
"loss": 2.8435,
"step": 8160
},
{
"epoch": 0.6029520295202953,
"grad_norm": 0.46583694219589233,
"learning_rate": 4.0810662153853955e-05,
"loss": 2.9068,
"step": 8170
},
{
"epoch": 0.603690036900369,
"grad_norm": 0.441485732793808,
"learning_rate": 4.068408027298576e-05,
"loss": 2.9141,
"step": 8180
},
{
"epoch": 0.6044280442804428,
"grad_norm": 0.43635720014572144,
"learning_rate": 4.0557560216762884e-05,
"loss": 2.8165,
"step": 8190
},
{
"epoch": 0.6051660516605166,
"grad_norm": 0.45056867599487305,
"learning_rate": 4.0431102824829495e-05,
"loss": 2.8923,
"step": 8200
},
{
"epoch": 0.6059040590405904,
"grad_norm": 0.47618359327316284,
"learning_rate": 4.030470893641387e-05,
"loss": 2.8337,
"step": 8210
},
{
"epoch": 0.6066420664206642,
"grad_norm": 0.46678489446640015,
"learning_rate": 4.0178379390322896e-05,
"loss": 2.9041,
"step": 8220
},
{
"epoch": 0.607380073800738,
"grad_norm": 0.45858731865882874,
"learning_rate": 4.0052115024936396e-05,
"loss": 2.8919,
"step": 8230
},
{
"epoch": 0.6081180811808118,
"grad_norm": 0.46500325202941895,
"learning_rate": 3.9925916678201656e-05,
"loss": 2.7873,
"step": 8240
},
{
"epoch": 0.6088560885608856,
"grad_norm": 0.4576093256473541,
"learning_rate": 3.9799785187627844e-05,
"loss": 2.9581,
"step": 8250
},
{
"epoch": 0.6095940959409594,
"grad_norm": 0.4603584408760071,
"learning_rate": 3.96737213902804e-05,
"loss": 2.932,
"step": 8260
},
{
"epoch": 0.6103321033210332,
"grad_norm": 0.4474504888057709,
"learning_rate": 3.954772612277556e-05,
"loss": 2.8907,
"step": 8270
},
{
"epoch": 0.6110701107011071,
"grad_norm": 0.4676888585090637,
"learning_rate": 3.942180022127475e-05,
"loss": 2.9279,
"step": 8280
},
{
"epoch": 0.6118081180811809,
"grad_norm": 0.4762161374092102,
"learning_rate": 3.929594452147903e-05,
"loss": 2.8668,
"step": 8290
},
{
"epoch": 0.6125461254612546,
"grad_norm": 0.45031213760375977,
"learning_rate": 3.917015985862364e-05,
"loss": 3.0203,
"step": 8300
},
{
"epoch": 0.6132841328413284,
"grad_norm": 0.4627397656440735,
"learning_rate": 3.904444706747227e-05,
"loss": 2.8669,
"step": 8310
},
{
"epoch": 0.6140221402214022,
"grad_norm": 0.4964381456375122,
"learning_rate": 3.891880698231176e-05,
"loss": 2.8888,
"step": 8320
},
{
"epoch": 0.614760147601476,
"grad_norm": 0.4690164029598236,
"learning_rate": 3.879324043694639e-05,
"loss": 2.8772,
"step": 8330
},
{
"epoch": 0.6154981549815498,
"grad_norm": 0.46316999197006226,
"learning_rate": 3.8667748264692355e-05,
"loss": 2.9203,
"step": 8340
},
{
"epoch": 0.6162361623616236,
"grad_norm": 0.46457648277282715,
"learning_rate": 3.854233129837233e-05,
"loss": 2.8959,
"step": 8350
},
{
"epoch": 0.6169741697416974,
"grad_norm": 0.46210619807243347,
"learning_rate": 3.841699037030989e-05,
"loss": 2.9754,
"step": 8360
},
{
"epoch": 0.6177121771217712,
"grad_norm": 0.4708150029182434,
"learning_rate": 3.829172631232395e-05,
"loss": 2.8779,
"step": 8370
},
{
"epoch": 0.618450184501845,
"grad_norm": 0.4539421498775482,
"learning_rate": 3.8166539955723315e-05,
"loss": 2.7857,
"step": 8380
},
{
"epoch": 0.6191881918819189,
"grad_norm": 0.4383450150489807,
"learning_rate": 3.80414321313011e-05,
"loss": 2.9466,
"step": 8390
},
{
"epoch": 0.6199261992619927,
"grad_norm": 0.47667232155799866,
"learning_rate": 3.791640366932926e-05,
"loss": 2.8896,
"step": 8400
},
{
"epoch": 0.6206642066420665,
"grad_norm": 0.47078999876976013,
"learning_rate": 3.7791455399553054e-05,
"loss": 2.8787,
"step": 8410
},
{
"epoch": 0.6214022140221402,
"grad_norm": 0.4621264934539795,
"learning_rate": 3.7666588151185586e-05,
"loss": 2.9516,
"step": 8420
},
{
"epoch": 0.622140221402214,
"grad_norm": 0.4561121165752411,
"learning_rate": 3.754180275290222e-05,
"loss": 2.8712,
"step": 8430
},
{
"epoch": 0.6228782287822878,
"grad_norm": 0.4745158851146698,
"learning_rate": 3.741710003283515e-05,
"loss": 2.9942,
"step": 8440
},
{
"epoch": 0.6236162361623616,
"grad_norm": 0.4506776034832001,
"learning_rate": 3.729248081856788e-05,
"loss": 2.8662,
"step": 8450
},
{
"epoch": 0.6243542435424354,
"grad_norm": 0.4925256073474884,
"learning_rate": 3.716794593712973e-05,
"loss": 2.9148,
"step": 8460
},
{
"epoch": 0.6250922509225092,
"grad_norm": 0.4477274715900421,
"learning_rate": 3.704349621499032e-05,
"loss": 2.8946,
"step": 8470
},
{
"epoch": 0.625830258302583,
"grad_norm": 0.45974335074424744,
"learning_rate": 3.691913247805415e-05,
"loss": 2.8444,
"step": 8480
},
{
"epoch": 0.6265682656826568,
"grad_norm": 0.4468931555747986,
"learning_rate": 3.6794855551655095e-05,
"loss": 2.8183,
"step": 8490
},
{
"epoch": 0.6273062730627307,
"grad_norm": 0.45352327823638916,
"learning_rate": 3.6670666260550866e-05,
"loss": 2.8385,
"step": 8500
},
{
"epoch": 0.6280442804428045,
"grad_norm": 0.48543328046798706,
"learning_rate": 3.654656542891762e-05,
"loss": 2.8982,
"step": 8510
},
{
"epoch": 0.6287822878228783,
"grad_norm": 0.47315549850463867,
"learning_rate": 3.642255388034448e-05,
"loss": 2.8477,
"step": 8520
},
{
"epoch": 0.629520295202952,
"grad_norm": 0.4466278851032257,
"learning_rate": 3.629863243782799e-05,
"loss": 2.9499,
"step": 8530
},
{
"epoch": 0.6302583025830258,
"grad_norm": 0.4634998142719269,
"learning_rate": 3.617480192376676e-05,
"loss": 2.9209,
"step": 8540
},
{
"epoch": 0.6309963099630996,
"grad_norm": 0.4444449841976166,
"learning_rate": 3.6051063159955914e-05,
"loss": 2.8547,
"step": 8550
},
{
"epoch": 0.6317343173431734,
"grad_norm": 0.4805346727371216,
"learning_rate": 3.592741696758171e-05,
"loss": 2.9504,
"step": 8560
},
{
"epoch": 0.6324723247232472,
"grad_norm": 0.4576335549354553,
"learning_rate": 3.580386416721605e-05,
"loss": 2.8166,
"step": 8570
},
{
"epoch": 0.633210332103321,
"grad_norm": 0.48051634430885315,
"learning_rate": 3.568040557881106e-05,
"loss": 2.8457,
"step": 8580
},
{
"epoch": 0.6339483394833948,
"grad_norm": 0.45053961873054504,
"learning_rate": 3.55570420216936e-05,
"loss": 2.8554,
"step": 8590
},
{
"epoch": 0.6346863468634686,
"grad_norm": 0.4763762652873993,
"learning_rate": 3.543377431455991e-05,
"loss": 2.9245,
"step": 8600
},
{
"epoch": 0.6354243542435425,
"grad_norm": 0.466516375541687,
"learning_rate": 3.531060327547003e-05,
"loss": 2.8784,
"step": 8610
},
{
"epoch": 0.6361623616236163,
"grad_norm": 0.4508006274700165,
"learning_rate": 3.51875297218426e-05,
"loss": 2.8572,
"step": 8620
},
{
"epoch": 0.6369003690036901,
"grad_norm": 0.43419796228408813,
"learning_rate": 3.506455447044923e-05,
"loss": 2.9553,
"step": 8630
},
{
"epoch": 0.6376383763837639,
"grad_norm": 0.4657207131385803,
"learning_rate": 3.494167833740912e-05,
"loss": 2.9388,
"step": 8640
},
{
"epoch": 0.6383763837638377,
"grad_norm": 0.47769656777381897,
"learning_rate": 3.481890213818374e-05,
"loss": 2.889,
"step": 8650
},
{
"epoch": 0.6391143911439114,
"grad_norm": 0.452332466840744,
"learning_rate": 3.469622668757132e-05,
"loss": 2.8618,
"step": 8660
},
{
"epoch": 0.6398523985239852,
"grad_norm": 0.44228044152259827,
"learning_rate": 3.457365279970147e-05,
"loss": 2.858,
"step": 8670
},
{
"epoch": 0.640590405904059,
"grad_norm": 0.45381829142570496,
"learning_rate": 3.4451181288029835e-05,
"loss": 2.9324,
"step": 8680
},
{
"epoch": 0.6413284132841328,
"grad_norm": 0.45243462920188904,
"learning_rate": 3.4328812965332566e-05,
"loss": 2.8569,
"step": 8690
},
{
"epoch": 0.6420664206642066,
"grad_norm": 0.44624003767967224,
"learning_rate": 3.420654864370107e-05,
"loss": 2.8305,
"step": 8700
},
{
"epoch": 0.6428044280442804,
"grad_norm": 0.45331937074661255,
"learning_rate": 3.408438913453652e-05,
"loss": 2.9233,
"step": 8710
},
{
"epoch": 0.6435424354243543,
"grad_norm": 0.46031826734542847,
"learning_rate": 3.396233524854453e-05,
"loss": 2.8136,
"step": 8720
},
{
"epoch": 0.6442804428044281,
"grad_norm": 0.4405251443386078,
"learning_rate": 3.384038779572975e-05,
"loss": 2.8196,
"step": 8730
},
{
"epoch": 0.6450184501845019,
"grad_norm": 0.433918297290802,
"learning_rate": 3.371854758539047e-05,
"loss": 2.828,
"step": 8740
},
{
"epoch": 0.6457564575645757,
"grad_norm": 0.437752366065979,
"learning_rate": 3.3596815426113285e-05,
"loss": 2.9084,
"step": 8750
},
{
"epoch": 0.6464944649446495,
"grad_norm": 0.4461667537689209,
"learning_rate": 3.3475192125767715e-05,
"loss": 2.9163,
"step": 8760
},
{
"epoch": 0.6472324723247233,
"grad_norm": 0.44983482360839844,
"learning_rate": 3.335367849150084e-05,
"loss": 2.8624,
"step": 8770
},
{
"epoch": 0.647970479704797,
"grad_norm": 0.444402813911438,
"learning_rate": 3.323227532973193e-05,
"loss": 2.8645,
"step": 8780
},
{
"epoch": 0.6487084870848708,
"grad_norm": 0.47475096583366394,
"learning_rate": 3.311098344614715e-05,
"loss": 2.8599,
"step": 8790
},
{
"epoch": 0.6494464944649446,
"grad_norm": 0.42691770195961,
"learning_rate": 3.298980364569413e-05,
"loss": 2.9367,
"step": 8800
},
{
"epoch": 0.6501845018450184,
"grad_norm": 0.43761834502220154,
"learning_rate": 3.2868736732576696e-05,
"loss": 2.8071,
"step": 8810
},
{
"epoch": 0.6509225092250922,
"grad_norm": 0.4337967336177826,
"learning_rate": 3.274778351024949e-05,
"loss": 2.7961,
"step": 8820
},
{
"epoch": 0.6516605166051661,
"grad_norm": 0.4518975615501404,
"learning_rate": 3.262694478141265e-05,
"loss": 2.8445,
"step": 8830
},
{
"epoch": 0.6523985239852399,
"grad_norm": 0.44520917534828186,
"learning_rate": 3.250622134800651e-05,
"loss": 2.8298,
"step": 8840
},
{
"epoch": 0.6531365313653137,
"grad_norm": 0.47246819734573364,
"learning_rate": 3.238561401120619e-05,
"loss": 2.8721,
"step": 8850
},
{
"epoch": 0.6538745387453875,
"grad_norm": 0.46341249346733093,
"learning_rate": 3.226512357141639e-05,
"loss": 2.8465,
"step": 8860
},
{
"epoch": 0.6546125461254613,
"grad_norm": 0.4418579339981079,
"learning_rate": 3.214475082826602e-05,
"loss": 2.7495,
"step": 8870
},
{
"epoch": 0.6553505535055351,
"grad_norm": 0.4572698771953583,
"learning_rate": 3.2024496580602895e-05,
"loss": 2.8405,
"step": 8880
},
{
"epoch": 0.6560885608856089,
"grad_norm": 0.4518590569496155,
"learning_rate": 3.1904361626488464e-05,
"loss": 2.8698,
"step": 8890
},
{
"epoch": 0.6568265682656826,
"grad_norm": 0.49694785475730896,
"learning_rate": 3.178434676319243e-05,
"loss": 2.9178,
"step": 8900
},
{
"epoch": 0.6575645756457564,
"grad_norm": 0.44036176800727844,
"learning_rate": 3.166445278718758e-05,
"loss": 2.9042,
"step": 8910
},
{
"epoch": 0.6583025830258302,
"grad_norm": 0.4740366041660309,
"learning_rate": 3.154468049414444e-05,
"loss": 2.791,
"step": 8920
},
{
"epoch": 0.659040590405904,
"grad_norm": 0.44894149899482727,
"learning_rate": 3.1425030678925944e-05,
"loss": 2.8882,
"step": 8930
},
{
"epoch": 0.6597785977859778,
"grad_norm": 0.45504188537597656,
"learning_rate": 3.1305504135582244e-05,
"loss": 2.82,
"step": 8940
},
{
"epoch": 0.6605166051660517,
"grad_norm": 0.45306116342544556,
"learning_rate": 3.118610165734539e-05,
"loss": 2.8076,
"step": 8950
},
{
"epoch": 0.6612546125461255,
"grad_norm": 0.4355803430080414,
"learning_rate": 3.106682403662409e-05,
"loss": 2.8458,
"step": 8960
},
{
"epoch": 0.6619926199261993,
"grad_norm": 0.45864707231521606,
"learning_rate": 3.094767206499844e-05,
"loss": 2.7888,
"step": 8970
},
{
"epoch": 0.6627306273062731,
"grad_norm": 0.4467925727367401,
"learning_rate": 3.082864653321466e-05,
"loss": 2.8862,
"step": 8980
},
{
"epoch": 0.6634686346863469,
"grad_norm": 0.4361802935600281,
"learning_rate": 3.0709748231179855e-05,
"loss": 2.8405,
"step": 8990
},
{
"epoch": 0.6642066420664207,
"grad_norm": 0.4502997398376465,
"learning_rate": 3.059097794795681e-05,
"loss": 2.8651,
"step": 9000
},
{
"epoch": 0.6649446494464945,
"grad_norm": 0.446232408285141,
"learning_rate": 3.0472336471758678e-05,
"loss": 2.9009,
"step": 9010
},
{
"epoch": 0.6656826568265682,
"grad_norm": 0.4600978493690491,
"learning_rate": 3.0353824589943834e-05,
"loss": 2.8842,
"step": 9020
},
{
"epoch": 0.666420664206642,
"grad_norm": 0.45147082209587097,
"learning_rate": 3.0235443089010562e-05,
"loss": 2.842,
"step": 9030
},
{
"epoch": 0.6671586715867158,
"grad_norm": 0.470324844121933,
"learning_rate": 3.0117192754591893e-05,
"loss": 2.9098,
"step": 9040
},
{
"epoch": 0.6678966789667896,
"grad_norm": 0.4519864320755005,
"learning_rate": 2.999907437145042e-05,
"loss": 2.917,
"step": 9050
},
{
"epoch": 0.6686346863468635,
"grad_norm": 0.44655749201774597,
"learning_rate": 2.9881088723472966e-05,
"loss": 2.9205,
"step": 9060
},
{
"epoch": 0.6693726937269373,
"grad_norm": 0.45969992876052856,
"learning_rate": 2.9763236593665533e-05,
"loss": 2.8726,
"step": 9070
},
{
"epoch": 0.6701107011070111,
"grad_norm": 0.45693284273147583,
"learning_rate": 2.9645518764148007e-05,
"loss": 2.8753,
"step": 9080
},
{
"epoch": 0.6708487084870849,
"grad_norm": 0.442354291677475,
"learning_rate": 2.9527936016149006e-05,
"loss": 2.8377,
"step": 9090
},
{
"epoch": 0.6715867158671587,
"grad_norm": 0.4796278476715088,
"learning_rate": 2.9410489130000684e-05,
"loss": 2.8303,
"step": 9100
},
{
"epoch": 0.6723247232472325,
"grad_norm": 0.4597807824611664,
"learning_rate": 2.9293178885133525e-05,
"loss": 2.8325,
"step": 9110
},
{
"epoch": 0.6730627306273063,
"grad_norm": 0.47112002968788147,
"learning_rate": 2.917600606007127e-05,
"loss": 2.8479,
"step": 9120
},
{
"epoch": 0.67380073800738,
"grad_norm": 0.4425598978996277,
"learning_rate": 2.905897143242562e-05,
"loss": 2.8416,
"step": 9130
},
{
"epoch": 0.6745387453874538,
"grad_norm": 0.4444707930088043,
"learning_rate": 2.8942075778891153e-05,
"loss": 2.9409,
"step": 9140
},
{
"epoch": 0.6752767527675276,
"grad_norm": 0.4575837254524231,
"learning_rate": 2.882531987524017e-05,
"loss": 2.8615,
"step": 9150
},
{
"epoch": 0.6760147601476014,
"grad_norm": 0.4663306176662445,
"learning_rate": 2.8708704496317474e-05,
"loss": 2.8184,
"step": 9160
},
{
"epoch": 0.6767527675276753,
"grad_norm": 0.441550076007843,
"learning_rate": 2.8592230416035335e-05,
"loss": 2.8981,
"step": 9170
},
{
"epoch": 0.6774907749077491,
"grad_norm": 0.47013741731643677,
"learning_rate": 2.8475898407368296e-05,
"loss": 2.9034,
"step": 9180
},
{
"epoch": 0.6782287822878229,
"grad_norm": 0.47934868931770325,
"learning_rate": 2.8359709242348032e-05,
"loss": 2.9483,
"step": 9190
},
{
"epoch": 0.6789667896678967,
"grad_norm": 0.44904670119285583,
"learning_rate": 2.824366369205825e-05,
"loss": 2.9038,
"step": 9200
},
{
"epoch": 0.6797047970479705,
"grad_norm": 0.4706343710422516,
"learning_rate": 2.8127762526629553e-05,
"loss": 2.8976,
"step": 9210
},
{
"epoch": 0.6804428044280443,
"grad_norm": 0.4544294774532318,
"learning_rate": 2.801200651523438e-05,
"loss": 2.8875,
"step": 9220
},
{
"epoch": 0.6811808118081181,
"grad_norm": 0.4476546347141266,
"learning_rate": 2.7896396426081844e-05,
"loss": 2.8378,
"step": 9230
},
{
"epoch": 0.6819188191881919,
"grad_norm": 0.4503355920314789,
"learning_rate": 2.7780933026412602e-05,
"loss": 2.8917,
"step": 9240
},
{
"epoch": 0.6826568265682657,
"grad_norm": 0.4393197298049927,
"learning_rate": 2.766561708249387e-05,
"loss": 2.7785,
"step": 9250
},
{
"epoch": 0.6833948339483394,
"grad_norm": 0.45384228229522705,
"learning_rate": 2.7550449359614272e-05,
"loss": 2.8712,
"step": 9260
},
{
"epoch": 0.6841328413284132,
"grad_norm": 0.462931752204895,
"learning_rate": 2.743543062207876e-05,
"loss": 2.9299,
"step": 9270
},
{
"epoch": 0.6848708487084871,
"grad_norm": 0.4446216821670532,
"learning_rate": 2.7320561633203566e-05,
"loss": 2.93,
"step": 9280
},
{
"epoch": 0.6856088560885609,
"grad_norm": 0.4498085677623749,
"learning_rate": 2.7205843155311094e-05,
"loss": 2.8614,
"step": 9290
},
{
"epoch": 0.6863468634686347,
"grad_norm": 0.44905975461006165,
"learning_rate": 2.7091275949724926e-05,
"loss": 2.8681,
"step": 9300
},
{
"epoch": 0.6870848708487085,
"grad_norm": 0.4424300491809845,
"learning_rate": 2.6976860776764713e-05,
"loss": 2.8048,
"step": 9310
},
{
"epoch": 0.6878228782287823,
"grad_norm": 0.46064937114715576,
"learning_rate": 2.6862598395741136e-05,
"loss": 2.8376,
"step": 9320
},
{
"epoch": 0.6885608856088561,
"grad_norm": 0.45401063561439514,
"learning_rate": 2.6748489564950908e-05,
"loss": 2.8168,
"step": 9330
},
{
"epoch": 0.6892988929889299,
"grad_norm": 0.4572742283344269,
"learning_rate": 2.6634535041671693e-05,
"loss": 2.8182,
"step": 9340
},
{
"epoch": 0.6900369003690037,
"grad_norm": 0.4515658915042877,
"learning_rate": 2.652073558215711e-05,
"loss": 2.8569,
"step": 9350
},
{
"epoch": 0.6907749077490775,
"grad_norm": 0.44633907079696655,
"learning_rate": 2.64070919416317e-05,
"loss": 2.8684,
"step": 9360
},
{
"epoch": 0.6915129151291513,
"grad_norm": 0.4616515636444092,
"learning_rate": 2.6293604874285927e-05,
"loss": 2.8791,
"step": 9370
},
{
"epoch": 0.692250922509225,
"grad_norm": 0.4603336751461029,
"learning_rate": 2.618027513327116e-05,
"loss": 2.8685,
"step": 9380
},
{
"epoch": 0.6929889298892989,
"grad_norm": 0.4635460376739502,
"learning_rate": 2.6067103470694672e-05,
"loss": 2.8819,
"step": 9390
},
{
"epoch": 0.6937269372693727,
"grad_norm": 0.446821004152298,
"learning_rate": 2.5954090637614658e-05,
"loss": 2.8775,
"step": 9400
},
{
"epoch": 0.6944649446494465,
"grad_norm": 0.45208224654197693,
"learning_rate": 2.5841237384035265e-05,
"loss": 2.9185,
"step": 9410
},
{
"epoch": 0.6952029520295203,
"grad_norm": 0.43966442346572876,
"learning_rate": 2.5728544458901593e-05,
"loss": 2.844,
"step": 9420
},
{
"epoch": 0.6959409594095941,
"grad_norm": 0.4660171866416931,
"learning_rate": 2.5616012610094704e-05,
"loss": 2.8533,
"step": 9430
},
{
"epoch": 0.6966789667896679,
"grad_norm": 0.4844834804534912,
"learning_rate": 2.5503642584426712e-05,
"loss": 2.9139,
"step": 9440
},
{
"epoch": 0.6974169741697417,
"grad_norm": 0.4675824046134949,
"learning_rate": 2.5391435127635805e-05,
"loss": 2.857,
"step": 9450
},
{
"epoch": 0.6981549815498155,
"grad_norm": 0.4488329291343689,
"learning_rate": 2.5279390984381264e-05,
"loss": 2.8484,
"step": 9460
},
{
"epoch": 0.6988929889298893,
"grad_norm": 0.4558933675289154,
"learning_rate": 2.5167510898238566e-05,
"loss": 2.8784,
"step": 9470
},
{
"epoch": 0.6996309963099631,
"grad_norm": 0.45454517006874084,
"learning_rate": 2.5055795611694433e-05,
"loss": 2.8075,
"step": 9480
},
{
"epoch": 0.7003690036900369,
"grad_norm": 0.4401450455188751,
"learning_rate": 2.4944245866141886e-05,
"loss": 2.8661,
"step": 9490
},
{
"epoch": 0.7011070110701108,
"grad_norm": 0.42718032002449036,
"learning_rate": 2.4832862401875378e-05,
"loss": 2.8306,
"step": 9500
},
{
"epoch": 0.7018450184501845,
"grad_norm": 0.4444067180156708,
"learning_rate": 2.472164595808576e-05,
"loss": 2.887,
"step": 9510
},
{
"epoch": 0.7025830258302583,
"grad_norm": 0.4388265311717987,
"learning_rate": 2.461059727285558e-05,
"loss": 2.9248,
"step": 9520
},
{
"epoch": 0.7033210332103321,
"grad_norm": 0.4537127614021301,
"learning_rate": 2.449971708315397e-05,
"loss": 2.866,
"step": 9530
},
{
"epoch": 0.7040590405904059,
"grad_norm": 0.4571674168109894,
"learning_rate": 2.4389006124831893e-05,
"loss": 2.8524,
"step": 9540
},
{
"epoch": 0.7047970479704797,
"grad_norm": 0.475065678358078,
"learning_rate": 2.4278465132617207e-05,
"loss": 2.9086,
"step": 9550
},
{
"epoch": 0.7055350553505535,
"grad_norm": 0.4491478204727173,
"learning_rate": 2.4168094840109785e-05,
"loss": 2.8496,
"step": 9560
},
{
"epoch": 0.7062730627306273,
"grad_norm": 0.4396122694015503,
"learning_rate": 2.4057895979776683e-05,
"loss": 2.8542,
"step": 9570
},
{
"epoch": 0.7070110701107011,
"grad_norm": 0.45730844140052795,
"learning_rate": 2.394786928294726e-05,
"loss": 2.8448,
"step": 9580
},
{
"epoch": 0.7077490774907749,
"grad_norm": 11.993217468261719,
"learning_rate": 2.3838015479808263e-05,
"loss": 2.8686,
"step": 9590
},
{
"epoch": 0.7084870848708487,
"grad_norm": 0.4676622450351715,
"learning_rate": 2.3728335299399106e-05,
"loss": 2.8195,
"step": 9600
},
{
"epoch": 0.7092250922509226,
"grad_norm": 0.4665907621383667,
"learning_rate": 2.3618829469606912e-05,
"loss": 2.8851,
"step": 9610
},
{
"epoch": 0.7099630996309964,
"grad_norm": 0.4478704631328583,
"learning_rate": 2.3509498717161804e-05,
"loss": 2.8631,
"step": 9620
},
{
"epoch": 0.7107011070110701,
"grad_norm": 0.4518534541130066,
"learning_rate": 2.3400343767631944e-05,
"loss": 2.8542,
"step": 9630
},
{
"epoch": 0.7114391143911439,
"grad_norm": 0.45083850622177124,
"learning_rate": 2.329136534541882e-05,
"loss": 2.8447,
"step": 9640
},
{
"epoch": 0.7121771217712177,
"grad_norm": 0.44704335927963257,
"learning_rate": 2.3182564173752396e-05,
"loss": 2.8001,
"step": 9650
},
{
"epoch": 0.7129151291512915,
"grad_norm": 0.459086149930954,
"learning_rate": 2.3073940974686337e-05,
"loss": 2.8562,
"step": 9660
},
{
"epoch": 0.7136531365313653,
"grad_norm": 0.4504683017730713,
"learning_rate": 2.296549646909315e-05,
"loss": 2.8153,
"step": 9670
},
{
"epoch": 0.7143911439114391,
"grad_norm": 0.4484894275665283,
"learning_rate": 2.2857231376659516e-05,
"loss": 2.8652,
"step": 9680
},
{
"epoch": 0.7151291512915129,
"grad_norm": 0.44552645087242126,
"learning_rate": 2.274914641588141e-05,
"loss": 2.8544,
"step": 9690
},
{
"epoch": 0.7158671586715867,
"grad_norm": 0.44962164759635925,
"learning_rate": 2.2641242304059394e-05,
"loss": 2.809,
"step": 9700
},
{
"epoch": 0.7166051660516605,
"grad_norm": 0.4649772047996521,
"learning_rate": 2.2533519757293803e-05,
"loss": 2.9047,
"step": 9710
},
{
"epoch": 0.7173431734317344,
"grad_norm": 0.44465893507003784,
"learning_rate": 2.242597949048008e-05,
"loss": 2.9289,
"step": 9720
},
{
"epoch": 0.7180811808118082,
"grad_norm": 0.4587944746017456,
"learning_rate": 2.2318622217303935e-05,
"loss": 2.9381,
"step": 9730
},
{
"epoch": 0.718819188191882,
"grad_norm": 0.45747023820877075,
"learning_rate": 2.221144865023666e-05,
"loss": 2.8596,
"step": 9740
},
{
"epoch": 0.7195571955719557,
"grad_norm": 0.4500565528869629,
"learning_rate": 2.2104459500530362e-05,
"loss": 2.8122,
"step": 9750
},
{
"epoch": 0.7202952029520295,
"grad_norm": 0.44870901107788086,
"learning_rate": 2.1997655478213313e-05,
"loss": 2.8318,
"step": 9760
},
{
"epoch": 0.7210332103321033,
"grad_norm": 0.46823742985725403,
"learning_rate": 2.1891037292085175e-05,
"loss": 2.7682,
"step": 9770
},
{
"epoch": 0.7217712177121771,
"grad_norm": 0.4822959899902344,
"learning_rate": 2.1784605649712324e-05,
"loss": 2.8845,
"step": 9780
},
{
"epoch": 0.7225092250922509,
"grad_norm": 0.4569961726665497,
"learning_rate": 2.167836125742315e-05,
"loss": 2.8073,
"step": 9790
},
{
"epoch": 0.7232472324723247,
"grad_norm": 0.5003052949905396,
"learning_rate": 2.1572304820303363e-05,
"loss": 2.966,
"step": 9800
},
{
"epoch": 0.7239852398523985,
"grad_norm": 0.4504786431789398,
"learning_rate": 2.1466437042191297e-05,
"loss": 2.8226,
"step": 9810
},
{
"epoch": 0.7247232472324723,
"grad_norm": 0.4485565423965454,
"learning_rate": 2.1360758625673327e-05,
"loss": 2.8301,
"step": 9820
},
{
"epoch": 0.7254612546125462,
"grad_norm": 0.46124571561813354,
"learning_rate": 2.1255270272079042e-05,
"loss": 2.8485,
"step": 9830
},
{
"epoch": 0.72619926199262,
"grad_norm": 0.4612502455711365,
"learning_rate": 2.1149972681476765e-05,
"loss": 2.8276,
"step": 9840
},
{
"epoch": 0.7269372693726938,
"grad_norm": 0.45740193128585815,
"learning_rate": 2.104486655266879e-05,
"loss": 2.8669,
"step": 9850
},
{
"epoch": 0.7276752767527676,
"grad_norm": 0.47378960251808167,
"learning_rate": 2.0939952583186807e-05,
"loss": 2.8149,
"step": 9860
},
{
"epoch": 0.7284132841328413,
"grad_norm": 0.45929577946662903,
"learning_rate": 2.0835231469287232e-05,
"loss": 2.8346,
"step": 9870
},
{
"epoch": 0.7291512915129151,
"grad_norm": 0.45453017950057983,
"learning_rate": 2.0730703905946612e-05,
"loss": 2.8851,
"step": 9880
},
{
"epoch": 0.7298892988929889,
"grad_norm": 0.4465833604335785,
"learning_rate": 2.0626370586857007e-05,
"loss": 2.8381,
"step": 9890
},
{
"epoch": 0.7306273062730627,
"grad_norm": 0.46699321269989014,
"learning_rate": 2.052223220442139e-05,
"loss": 2.8394,
"step": 9900
},
{
"epoch": 0.7313653136531365,
"grad_norm": 0.4374259412288666,
"learning_rate": 2.0418289449749027e-05,
"loss": 2.8501,
"step": 9910
},
{
"epoch": 0.7321033210332103,
"grad_norm": 0.4604252576828003,
"learning_rate": 2.0314543012650933e-05,
"loss": 2.8711,
"step": 9920
},
{
"epoch": 0.7328413284132841,
"grad_norm": 0.45612022280693054,
"learning_rate": 2.0210993581635256e-05,
"loss": 2.844,
"step": 9930
},
{
"epoch": 0.7335793357933579,
"grad_norm": 0.43427881598472595,
"learning_rate": 2.0107641843902726e-05,
"loss": 2.8084,
"step": 9940
},
{
"epoch": 0.7343173431734318,
"grad_norm": 0.4502193331718445,
"learning_rate": 2.0004488485342088e-05,
"loss": 2.909,
"step": 9950
},
{
"epoch": 0.7350553505535056,
"grad_norm": 0.44448336958885193,
"learning_rate": 1.9901534190525566e-05,
"loss": 2.8662,
"step": 9960
},
{
"epoch": 0.7357933579335794,
"grad_norm": 0.4308652877807617,
"learning_rate": 1.9798779642704297e-05,
"loss": 2.7882,
"step": 9970
},
{
"epoch": 0.7365313653136532,
"grad_norm": 0.4563472867012024,
"learning_rate": 1.96962255238038e-05,
"loss": 2.8956,
"step": 9980
},
{
"epoch": 0.7372693726937269,
"grad_norm": 0.4397279620170593,
"learning_rate": 1.9593872514419476e-05,
"loss": 2.7707,
"step": 9990
},
{
"epoch": 0.7380073800738007,
"grad_norm": 0.47456085681915283,
"learning_rate": 1.9491721293812076e-05,
"loss": 2.9205,
"step": 10000
},
{
"epoch": 0.7387453874538745,
"grad_norm": 0.43729913234710693,
"learning_rate": 1.9389772539903122e-05,
"loss": 2.8423,
"step": 10010
},
{
"epoch": 0.7394833948339483,
"grad_norm": 0.4417737126350403,
"learning_rate": 1.9288026929270587e-05,
"loss": 2.832,
"step": 10020
},
{
"epoch": 0.7402214022140221,
"grad_norm": 0.44813665747642517,
"learning_rate": 1.9186485137144218e-05,
"loss": 2.8494,
"step": 10030
},
{
"epoch": 0.7409594095940959,
"grad_norm": 0.45640864968299866,
"learning_rate": 1.908514783740114e-05,
"loss": 2.8784,
"step": 10040
},
{
"epoch": 0.7416974169741697,
"grad_norm": 0.4336318373680115,
"learning_rate": 1.8984015702561393e-05,
"loss": 2.8372,
"step": 10050
},
{
"epoch": 0.7424354243542436,
"grad_norm": 0.4504336714744568,
"learning_rate": 1.8883089403783434e-05,
"loss": 2.7967,
"step": 10060
},
{
"epoch": 0.7431734317343174,
"grad_norm": 0.46149566769599915,
"learning_rate": 1.8782369610859708e-05,
"loss": 2.8191,
"step": 10070
},
{
"epoch": 0.7439114391143912,
"grad_norm": 0.4522392451763153,
"learning_rate": 1.868185699221221e-05,
"loss": 2.8794,
"step": 10080
},
{
"epoch": 0.744649446494465,
"grad_norm": 0.4411635994911194,
"learning_rate": 1.8581552214887977e-05,
"loss": 2.8404,
"step": 10090
},
{
"epoch": 0.7453874538745388,
"grad_norm": 0.46107056736946106,
"learning_rate": 1.848145594455477e-05,
"loss": 2.846,
"step": 10100
},
{
"epoch": 0.7461254612546125,
"grad_norm": 0.45308247208595276,
"learning_rate": 1.8381568845496578e-05,
"loss": 2.807,
"step": 10110
},
{
"epoch": 0.7468634686346863,
"grad_norm": 0.44377437233924866,
"learning_rate": 1.828189158060927e-05,
"loss": 2.9005,
"step": 10120
},
{
"epoch": 0.7476014760147601,
"grad_norm": 0.45314696431159973,
"learning_rate": 1.8182424811396133e-05,
"loss": 2.8626,
"step": 10130
},
{
"epoch": 0.7483394833948339,
"grad_norm": 0.4458778202533722,
"learning_rate": 1.80831691979635e-05,
"loss": 2.7985,
"step": 10140
},
{
"epoch": 0.7490774907749077,
"grad_norm": 0.464269757270813,
"learning_rate": 1.7984125399016392e-05,
"loss": 2.9386,
"step": 10150
},
{
"epoch": 0.7498154981549815,
"grad_norm": 0.4448395371437073,
"learning_rate": 1.7885294071854157e-05,
"loss": 2.833,
"step": 10160
},
{
"epoch": 0.7505535055350554,
"grad_norm": 0.4455287754535675,
"learning_rate": 1.7786675872366028e-05,
"loss": 2.8184,
"step": 10170
},
{
"epoch": 0.7512915129151292,
"grad_norm": 0.4467598497867584,
"learning_rate": 1.7688271455026867e-05,
"loss": 2.8357,
"step": 10180
},
{
"epoch": 0.752029520295203,
"grad_norm": 0.4642166197299957,
"learning_rate": 1.7590081472892776e-05,
"loss": 2.9219,
"step": 10190
},
{
"epoch": 0.7527675276752768,
"grad_norm": 0.44453924894332886,
"learning_rate": 1.7492106577596772e-05,
"loss": 2.8822,
"step": 10200
},
{
"epoch": 0.7535055350553506,
"grad_norm": 0.4599774479866028,
"learning_rate": 1.7394347419344432e-05,
"loss": 2.8336,
"step": 10210
},
{
"epoch": 0.7542435424354244,
"grad_norm": 0.4477832317352295,
"learning_rate": 1.7296804646909654e-05,
"loss": 2.785,
"step": 10220
},
{
"epoch": 0.7549815498154981,
"grad_norm": 0.45672887563705444,
"learning_rate": 1.7199478907630267e-05,
"loss": 2.8166,
"step": 10230
},
{
"epoch": 0.7557195571955719,
"grad_norm": 0.4571615159511566,
"learning_rate": 1.710237084740378e-05,
"loss": 2.9199,
"step": 10240
},
{
"epoch": 0.7564575645756457,
"grad_norm": 0.4618014991283417,
"learning_rate": 1.7005481110683062e-05,
"loss": 2.907,
"step": 10250
},
{
"epoch": 0.7571955719557195,
"grad_norm": 0.44089266657829285,
"learning_rate": 1.690881034047212e-05,
"loss": 2.854,
"step": 10260
},
{
"epoch": 0.7579335793357933,
"grad_norm": 0.4468059837818146,
"learning_rate": 1.6812359178321784e-05,
"loss": 2.8511,
"step": 10270
},
{
"epoch": 0.7586715867158672,
"grad_norm": 0.4517216682434082,
"learning_rate": 1.6716128264325475e-05,
"loss": 2.8117,
"step": 10280
},
{
"epoch": 0.759409594095941,
"grad_norm": 0.4576111137866974,
"learning_rate": 1.662011823711495e-05,
"loss": 2.838,
"step": 10290
},
{
"epoch": 0.7601476014760148,
"grad_norm": 0.4355645179748535,
"learning_rate": 1.6524329733856047e-05,
"loss": 2.8054,
"step": 10300
},
{
"epoch": 0.7608856088560886,
"grad_norm": 0.4544225037097931,
"learning_rate": 1.642876339024446e-05,
"loss": 2.8703,
"step": 10310
},
{
"epoch": 0.7616236162361624,
"grad_norm": 0.4510670006275177,
"learning_rate": 1.633341984050162e-05,
"loss": 2.8265,
"step": 10320
},
{
"epoch": 0.7623616236162362,
"grad_norm": 0.444296658039093,
"learning_rate": 1.6238299717370252e-05,
"loss": 2.9467,
"step": 10330
},
{
"epoch": 0.76309963099631,
"grad_norm": 0.44352987408638,
"learning_rate": 1.614340365211044e-05,
"loss": 2.8385,
"step": 10340
},
{
"epoch": 0.7638376383763837,
"grad_norm": 0.4408433139324188,
"learning_rate": 1.6048732274495255e-05,
"loss": 2.7828,
"step": 10350
},
{
"epoch": 0.7645756457564575,
"grad_norm": 0.4516165554523468,
"learning_rate": 1.595428621280668e-05,
"loss": 2.8448,
"step": 10360
},
{
"epoch": 0.7653136531365313,
"grad_norm": 0.4665060341358185,
"learning_rate": 1.5860066093831367e-05,
"loss": 2.8067,
"step": 10370
},
{
"epoch": 0.7660516605166051,
"grad_norm": 0.44463926553726196,
"learning_rate": 1.5766072542856526e-05,
"loss": 2.8421,
"step": 10380
},
{
"epoch": 0.766789667896679,
"grad_norm": 0.426488995552063,
"learning_rate": 1.5672306183665764e-05,
"loss": 2.8121,
"step": 10390
},
{
"epoch": 0.7675276752767528,
"grad_norm": 0.44521549344062805,
"learning_rate": 1.557876763853493e-05,
"loss": 2.7992,
"step": 10400
},
{
"epoch": 0.7682656826568266,
"grad_norm": 0.45438680052757263,
"learning_rate": 1.5485457528228003e-05,
"loss": 2.8034,
"step": 10410
},
{
"epoch": 0.7690036900369004,
"grad_norm": 0.4456971287727356,
"learning_rate": 1.5392376471992965e-05,
"loss": 2.8191,
"step": 10420
},
{
"epoch": 0.7697416974169742,
"grad_norm": 0.4459834694862366,
"learning_rate": 1.529952508755768e-05,
"loss": 2.8668,
"step": 10430
},
{
"epoch": 0.770479704797048,
"grad_norm": 0.4495449960231781,
"learning_rate": 1.5206903991125832e-05,
"loss": 2.8433,
"step": 10440
},
{
"epoch": 0.7712177121771218,
"grad_norm": 0.4536389708518982,
"learning_rate": 1.511451379737278e-05,
"loss": 2.8522,
"step": 10450
},
{
"epoch": 0.7719557195571956,
"grad_norm": 0.44112926721572876,
"learning_rate": 1.502235511944154e-05,
"loss": 2.872,
"step": 10460
},
{
"epoch": 0.7726937269372693,
"grad_norm": 0.43305230140686035,
"learning_rate": 1.4930428568938648e-05,
"loss": 2.901,
"step": 10470
},
{
"epoch": 0.7734317343173431,
"grad_norm": 0.4792589247226715,
"learning_rate": 1.4838734755930167e-05,
"loss": 2.7635,
"step": 10480
},
{
"epoch": 0.7741697416974169,
"grad_norm": 0.4358636438846588,
"learning_rate": 1.4747274288937596e-05,
"loss": 2.8276,
"step": 10490
},
{
"epoch": 0.7749077490774908,
"grad_norm": 0.44949012994766235,
"learning_rate": 1.4656047774933874e-05,
"loss": 2.8624,
"step": 10500
},
{
"epoch": 0.7756457564575646,
"grad_norm": 0.4440300762653351,
"learning_rate": 1.4565055819339235e-05,
"loss": 2.8239,
"step": 10510
},
{
"epoch": 0.7763837638376384,
"grad_norm": 0.4554462730884552,
"learning_rate": 1.447429902601739e-05,
"loss": 2.7734,
"step": 10520
},
{
"epoch": 0.7771217712177122,
"grad_norm": 0.4523858428001404,
"learning_rate": 1.4383777997271347e-05,
"loss": 2.8976,
"step": 10530
},
{
"epoch": 0.777859778597786,
"grad_norm": 0.46444228291511536,
"learning_rate": 1.429349333383948e-05,
"loss": 2.8756,
"step": 10540
},
{
"epoch": 0.7785977859778598,
"grad_norm": 0.4419015347957611,
"learning_rate": 1.4203445634891538e-05,
"loss": 2.8626,
"step": 10550
},
{
"epoch": 0.7793357933579336,
"grad_norm": 0.44527843594551086,
"learning_rate": 1.4113635498024664e-05,
"loss": 2.8063,
"step": 10560
},
{
"epoch": 0.7800738007380074,
"grad_norm": 0.4554080665111542,
"learning_rate": 1.4024063519259439e-05,
"loss": 2.7555,
"step": 10570
},
{
"epoch": 0.7808118081180812,
"grad_norm": 0.4289720952510834,
"learning_rate": 1.3934730293035936e-05,
"loss": 2.8304,
"step": 10580
},
{
"epoch": 0.7815498154981549,
"grad_norm": 0.4606097936630249,
"learning_rate": 1.38456364122097e-05,
"loss": 2.8415,
"step": 10590
},
{
"epoch": 0.7822878228782287,
"grad_norm": 0.4606861174106598,
"learning_rate": 1.3756782468047936e-05,
"loss": 2.889,
"step": 10600
},
{
"epoch": 0.7830258302583026,
"grad_norm": 0.425731897354126,
"learning_rate": 1.3668169050225472e-05,
"loss": 2.8573,
"step": 10610
},
{
"epoch": 0.7837638376383764,
"grad_norm": 0.4634413421154022,
"learning_rate": 1.357979674682095e-05,
"loss": 2.8677,
"step": 10620
},
{
"epoch": 0.7845018450184502,
"grad_norm": 0.45793548226356506,
"learning_rate": 1.349166614431282e-05,
"loss": 2.9207,
"step": 10630
},
{
"epoch": 0.785239852398524,
"grad_norm": 0.4642331898212433,
"learning_rate": 1.3403777827575514e-05,
"loss": 2.887,
"step": 10640
},
{
"epoch": 0.7859778597785978,
"grad_norm": 0.4591294825077057,
"learning_rate": 1.3316132379875551e-05,
"loss": 2.8502,
"step": 10650
},
{
"epoch": 0.7867158671586716,
"grad_norm": 0.4461764395236969,
"learning_rate": 1.322873038286766e-05,
"loss": 2.8357,
"step": 10660
},
{
"epoch": 0.7874538745387454,
"grad_norm": 0.4518667757511139,
"learning_rate": 1.3141572416590891e-05,
"loss": 2.9274,
"step": 10670
},
{
"epoch": 0.7881918819188192,
"grad_norm": 0.435041606426239,
"learning_rate": 1.3054659059464835e-05,
"loss": 2.7578,
"step": 10680
},
{
"epoch": 0.788929889298893,
"grad_norm": 0.45000597834587097,
"learning_rate": 1.2967990888285737e-05,
"loss": 2.8792,
"step": 10690
},
{
"epoch": 0.7896678966789668,
"grad_norm": 0.4507540464401245,
"learning_rate": 1.2881568478222672e-05,
"loss": 2.9286,
"step": 10700
},
{
"epoch": 0.7904059040590405,
"grad_norm": 0.44547247886657715,
"learning_rate": 1.2795392402813715e-05,
"loss": 2.7792,
"step": 10710
},
{
"epoch": 0.7911439114391144,
"grad_norm": 0.4526568353176117,
"learning_rate": 1.2709463233962204e-05,
"loss": 2.8923,
"step": 10720
},
{
"epoch": 0.7918819188191882,
"grad_norm": 0.4650912284851074,
"learning_rate": 1.262378154193285e-05,
"loss": 2.7767,
"step": 10730
},
{
"epoch": 0.792619926199262,
"grad_norm": 0.4619973301887512,
"learning_rate": 1.2538347895348013e-05,
"loss": 2.7074,
"step": 10740
},
{
"epoch": 0.7933579335793358,
"grad_norm": 0.4545031487941742,
"learning_rate": 1.2453162861183909e-05,
"loss": 2.832,
"step": 10750
},
{
"epoch": 0.7940959409594096,
"grad_norm": 0.45016980171203613,
"learning_rate": 1.236822700476683e-05,
"loss": 2.8709,
"step": 10760
},
{
"epoch": 0.7948339483394834,
"grad_norm": 0.41397857666015625,
"learning_rate": 1.2283540889769445e-05,
"loss": 2.7864,
"step": 10770
},
{
"epoch": 0.7955719557195572,
"grad_norm": 0.47167348861694336,
"learning_rate": 1.2199105078207001e-05,
"loss": 2.7768,
"step": 10780
},
{
"epoch": 0.796309963099631,
"grad_norm": 0.46366357803344727,
"learning_rate": 1.2114920130433644e-05,
"loss": 2.8994,
"step": 10790
},
{
"epoch": 0.7970479704797048,
"grad_norm": 0.4539276957511902,
"learning_rate": 1.2030986605138644e-05,
"loss": 2.8526,
"step": 10800
},
{
"epoch": 0.7977859778597786,
"grad_norm": 0.430576354265213,
"learning_rate": 1.1947305059342729e-05,
"loss": 2.7993,
"step": 10810
},
{
"epoch": 0.7985239852398524,
"grad_norm": 0.4400356113910675,
"learning_rate": 1.1863876048394407e-05,
"loss": 2.9068,
"step": 10820
},
{
"epoch": 0.7992619926199263,
"grad_norm": 0.44879478216171265,
"learning_rate": 1.1780700125966233e-05,
"loss": 2.8591,
"step": 10830
},
{
"epoch": 0.8,
"grad_norm": 0.44169095158576965,
"learning_rate": 1.1697777844051105e-05,
"loss": 2.793,
"step": 10840
},
{
"epoch": 0.8007380073800738,
"grad_norm": 0.45461106300354004,
"learning_rate": 1.1615109752958713e-05,
"loss": 2.9182,
"step": 10850
},
{
"epoch": 0.8014760147601476,
"grad_norm": 0.4425186812877655,
"learning_rate": 1.1532696401311787e-05,
"loss": 2.8754,
"step": 10860
},
{
"epoch": 0.8022140221402214,
"grad_norm": 0.4334977865219116,
"learning_rate": 1.1450538336042516e-05,
"loss": 2.8037,
"step": 10870
},
{
"epoch": 0.8029520295202952,
"grad_norm": 0.43513453006744385,
"learning_rate": 1.1368636102388868e-05,
"loss": 2.8548,
"step": 10880
},
{
"epoch": 0.803690036900369,
"grad_norm": 0.4428231716156006,
"learning_rate": 1.1286990243891011e-05,
"loss": 2.8673,
"step": 10890
},
{
"epoch": 0.8044280442804428,
"grad_norm": 0.4509079158306122,
"learning_rate": 1.1205601302387692e-05,
"loss": 2.9012,
"step": 10900
},
{
"epoch": 0.8051660516605166,
"grad_norm": 0.44838449358940125,
"learning_rate": 1.1124469818012635e-05,
"loss": 2.8056,
"step": 10910
},
{
"epoch": 0.8059040590405904,
"grad_norm": 0.4536844491958618,
"learning_rate": 1.1043596329190964e-05,
"loss": 2.883,
"step": 10920
},
{
"epoch": 0.8066420664206642,
"grad_norm": 0.44634494185447693,
"learning_rate": 1.0962981372635628e-05,
"loss": 2.8049,
"step": 10930
},
{
"epoch": 0.8073800738007381,
"grad_norm": 0.4615216553211212,
"learning_rate": 1.0882625483343845e-05,
"loss": 2.9058,
"step": 10940
},
{
"epoch": 0.8081180811808119,
"grad_norm": 0.4436852037906647,
"learning_rate": 1.0802529194593547e-05,
"loss": 2.8492,
"step": 10950
},
{
"epoch": 0.8088560885608856,
"grad_norm": 0.4358108341693878,
"learning_rate": 1.0722693037939818e-05,
"loss": 2.8513,
"step": 10960
},
{
"epoch": 0.8095940959409594,
"grad_norm": 0.45849135518074036,
"learning_rate": 1.0643117543211422e-05,
"loss": 2.8141,
"step": 10970
},
{
"epoch": 0.8103321033210332,
"grad_norm": 0.4694216251373291,
"learning_rate": 1.0563803238507219e-05,
"loss": 2.8304,
"step": 10980
},
{
"epoch": 0.811070110701107,
"grad_norm": 0.4531688094139099,
"learning_rate": 1.0484750650192726e-05,
"loss": 2.9128,
"step": 10990
},
{
"epoch": 0.8118081180811808,
"grad_norm": 0.4585440754890442,
"learning_rate": 1.0405960302896562e-05,
"loss": 2.8299,
"step": 11000
},
{
"epoch": 0.8125461254612546,
"grad_norm": 0.4274667799472809,
"learning_rate": 1.0327432719507019e-05,
"loss": 2.7979,
"step": 11010
},
{
"epoch": 0.8132841328413284,
"grad_norm": 0.43614691495895386,
"learning_rate": 1.0249168421168558e-05,
"loss": 2.8119,
"step": 11020
},
{
"epoch": 0.8140221402214022,
"grad_norm": 0.45556968450546265,
"learning_rate": 1.0171167927278368e-05,
"loss": 2.9038,
"step": 11030
},
{
"epoch": 0.814760147601476,
"grad_norm": 0.44112008810043335,
"learning_rate": 1.0093431755482908e-05,
"loss": 2.9019,
"step": 11040
},
{
"epoch": 0.8154981549815498,
"grad_norm": 0.444204181432724,
"learning_rate": 1.001596042167447e-05,
"loss": 2.7909,
"step": 11050
},
{
"epoch": 0.8162361623616237,
"grad_norm": 0.427478551864624,
"learning_rate": 9.93875443998778e-06,
"loss": 2.8195,
"step": 11060
},
{
"epoch": 0.8169741697416975,
"grad_norm": 0.4325047433376312,
"learning_rate": 9.861814322796553e-06,
"loss": 2.8227,
"step": 11070
},
{
"epoch": 0.8177121771217712,
"grad_norm": 0.4463500380516052,
"learning_rate": 9.785140580710107e-06,
"loss": 2.8502,
"step": 11080
},
{
"epoch": 0.818450184501845,
"grad_norm": 0.44314101338386536,
"learning_rate": 9.708733722569996e-06,
"loss": 2.8617,
"step": 11090
},
{
"epoch": 0.8191881918819188,
"grad_norm": 0.43770846724510193,
"learning_rate": 9.632594255446565e-06,
"loss": 2.815,
"step": 11100
},
{
"epoch": 0.8199261992619926,
"grad_norm": 0.48664426803588867,
"learning_rate": 9.556722684635667e-06,
"loss": 2.8386,
"step": 11110
},
{
"epoch": 0.8206642066420664,
"grad_norm": 0.42718470096588135,
"learning_rate": 9.48111951365529e-06,
"loss": 2.7743,
"step": 11120
},
{
"epoch": 0.8214022140221402,
"grad_norm": 0.4534224569797516,
"learning_rate": 9.405785244242165e-06,
"loss": 2.885,
"step": 11130
},
{
"epoch": 0.822140221402214,
"grad_norm": 0.4469706118106842,
"learning_rate": 9.330720376348483e-06,
"loss": 2.7431,
"step": 11140
},
{
"epoch": 0.8228782287822878,
"grad_norm": 0.4499460756778717,
"learning_rate": 9.25592540813857e-06,
"loss": 2.8604,
"step": 11150
},
{
"epoch": 0.8236162361623616,
"grad_norm": 0.4386638104915619,
"learning_rate": 9.18140083598557e-06,
"loss": 2.797,
"step": 11160
},
{
"epoch": 0.8243542435424355,
"grad_norm": 0.4377821683883667,
"learning_rate": 9.10714715446817e-06,
"loss": 2.8071,
"step": 11170
},
{
"epoch": 0.8250922509225093,
"grad_norm": 0.4503236413002014,
"learning_rate": 9.03316485636727e-06,
"loss": 2.8215,
"step": 11180
},
{
"epoch": 0.825830258302583,
"grad_norm": 0.4537326693534851,
"learning_rate": 8.959454432662778e-06,
"loss": 2.7938,
"step": 11190
},
{
"epoch": 0.8265682656826568,
"grad_norm": 0.4477526843547821,
"learning_rate": 8.88601637253032e-06,
"loss": 2.7778,
"step": 11200
},
{
"epoch": 0.8273062730627306,
"grad_norm": 0.45014604926109314,
"learning_rate": 8.812851163337975e-06,
"loss": 2.792,
"step": 11210
},
{
"epoch": 0.8280442804428044,
"grad_norm": 0.44553130865097046,
"learning_rate": 8.739959290643097e-06,
"loss": 2.8268,
"step": 11220
},
{
"epoch": 0.8287822878228782,
"grad_norm": 0.45030757784843445,
"learning_rate": 8.667341238189009e-06,
"loss": 2.8332,
"step": 11230
},
{
"epoch": 0.829520295202952,
"grad_norm": 0.44522371888160706,
"learning_rate": 8.594997487901879e-06,
"loss": 2.8526,
"step": 11240
},
{
"epoch": 0.8302583025830258,
"grad_norm": 0.46951159834861755,
"learning_rate": 8.522928519887463e-06,
"loss": 2.8052,
"step": 11250
},
{
"epoch": 0.8309963099630996,
"grad_norm": 0.45531222224235535,
"learning_rate": 8.451134812427925e-06,
"loss": 2.8108,
"step": 11260
},
{
"epoch": 0.8317343173431734,
"grad_norm": 0.4519606828689575,
"learning_rate": 8.379616841978699e-06,
"loss": 2.8302,
"step": 11270
},
{
"epoch": 0.8324723247232473,
"grad_norm": 0.45735597610473633,
"learning_rate": 8.308375083165298e-06,
"loss": 2.9323,
"step": 11280
},
{
"epoch": 0.8332103321033211,
"grad_norm": 0.4518982172012329,
"learning_rate": 8.237410008780161e-06,
"loss": 2.796,
"step": 11290
},
{
"epoch": 0.8339483394833949,
"grad_norm": 0.4294179379940033,
"learning_rate": 8.166722089779539e-06,
"loss": 2.8383,
"step": 11300
},
{
"epoch": 0.8346863468634687,
"grad_norm": 0.43325817584991455,
"learning_rate": 8.096311795280331e-06,
"loss": 2.7896,
"step": 11310
},
{
"epoch": 0.8354243542435424,
"grad_norm": 0.4492734670639038,
"learning_rate": 8.026179592557037e-06,
"loss": 2.8272,
"step": 11320
},
{
"epoch": 0.8361623616236162,
"grad_norm": 0.4338243007659912,
"learning_rate": 7.956325947038584e-06,
"loss": 2.8173,
"step": 11330
},
{
"epoch": 0.83690036900369,
"grad_norm": 0.4449402987957001,
"learning_rate": 7.886751322305247e-06,
"loss": 2.8244,
"step": 11340
},
{
"epoch": 0.8376383763837638,
"grad_norm": 0.44180235266685486,
"learning_rate": 7.817456180085636e-06,
"loss": 2.8902,
"step": 11350
},
{
"epoch": 0.8383763837638376,
"grad_norm": 0.45504215359687805,
"learning_rate": 7.748440980253562e-06,
"loss": 2.8344,
"step": 11360
},
{
"epoch": 0.8391143911439114,
"grad_norm": 0.4654461443424225,
"learning_rate": 7.67970618082503e-06,
"loss": 2.8335,
"step": 11370
},
{
"epoch": 0.8398523985239852,
"grad_norm": 0.47360721230506897,
"learning_rate": 7.611252237955169e-06,
"loss": 2.8943,
"step": 11380
},
{
"epoch": 0.8405904059040591,
"grad_norm": 0.4570152461528778,
"learning_rate": 7.543079605935221e-06,
"loss": 2.8674,
"step": 11390
},
{
"epoch": 0.8413284132841329,
"grad_norm": 0.41285139322280884,
"learning_rate": 7.47518873718952e-06,
"loss": 2.8292,
"step": 11400
},
{
"epoch": 0.8420664206642067,
"grad_norm": 0.45135176181793213,
"learning_rate": 7.407580082272492e-06,
"loss": 2.7573,
"step": 11410
},
{
"epoch": 0.8428044280442805,
"grad_norm": 0.4763992726802826,
"learning_rate": 7.340254089865672e-06,
"loss": 2.8902,
"step": 11420
},
{
"epoch": 0.8435424354243543,
"grad_norm": 0.480816513299942,
"learning_rate": 7.27321120677471e-06,
"loss": 2.9058,
"step": 11430
},
{
"epoch": 0.844280442804428,
"grad_norm": 0.4476820230484009,
"learning_rate": 7.206451877926418e-06,
"loss": 2.8191,
"step": 11440
},
{
"epoch": 0.8450184501845018,
"grad_norm": 0.4477422833442688,
"learning_rate": 7.139976546365817e-06,
"loss": 2.8023,
"step": 11450
},
{
"epoch": 0.8457564575645756,
"grad_norm": 0.4407312572002411,
"learning_rate": 7.0737856532531895e-06,
"loss": 2.8368,
"step": 11460
},
{
"epoch": 0.8464944649446494,
"grad_norm": 0.45549750328063965,
"learning_rate": 7.007879637861159e-06,
"loss": 2.8561,
"step": 11470
},
{
"epoch": 0.8472324723247232,
"grad_norm": 0.4288015067577362,
"learning_rate": 6.942258937571772e-06,
"loss": 2.7234,
"step": 11480
},
{
"epoch": 0.847970479704797,
"grad_norm": 0.4370770752429962,
"learning_rate": 6.87692398787359e-06,
"loss": 2.8607,
"step": 11490
},
{
"epoch": 0.8487084870848709,
"grad_norm": 0.44784659147262573,
"learning_rate": 6.81187522235881e-06,
"loss": 2.78,
"step": 11500
},
{
"epoch": 0.8494464944649447,
"grad_norm": 0.43501320481300354,
"learning_rate": 6.747113072720385e-06,
"loss": 2.8121,
"step": 11510
},
{
"epoch": 0.8501845018450185,
"grad_norm": 0.4419308006763458,
"learning_rate": 6.6826379687491505e-06,
"loss": 2.8502,
"step": 11520
},
{
"epoch": 0.8509225092250923,
"grad_norm": 0.4417872130870819,
"learning_rate": 6.6184503383309784e-06,
"loss": 2.8042,
"step": 11530
},
{
"epoch": 0.8516605166051661,
"grad_norm": 0.4433625638484955,
"learning_rate": 6.5545506074439325e-06,
"loss": 2.7962,
"step": 11540
},
{
"epoch": 0.8523985239852399,
"grad_norm": 0.44587311148643494,
"learning_rate": 6.490939200155449e-06,
"loss": 2.841,
"step": 11550
},
{
"epoch": 0.8531365313653136,
"grad_norm": 0.4439995288848877,
"learning_rate": 6.427616538619524e-06,
"loss": 2.8195,
"step": 11560
},
{
"epoch": 0.8538745387453874,
"grad_norm": 0.4364805519580841,
"learning_rate": 6.3645830430739015e-06,
"loss": 2.7775,
"step": 11570
},
{
"epoch": 0.8546125461254612,
"grad_norm": 0.4607424736022949,
"learning_rate": 6.301839131837284e-06,
"loss": 2.907,
"step": 11580
},
{
"epoch": 0.855350553505535,
"grad_norm": 0.45834723114967346,
"learning_rate": 6.239385221306587e-06,
"loss": 2.8708,
"step": 11590
},
{
"epoch": 0.8560885608856088,
"grad_norm": 0.43934082984924316,
"learning_rate": 6.177221725954102e-06,
"loss": 2.8159,
"step": 11600
},
{
"epoch": 0.8568265682656827,
"grad_norm": 0.4437257945537567,
"learning_rate": 6.1153490583248265e-06,
"loss": 2.8734,
"step": 11610
},
{
"epoch": 0.8575645756457565,
"grad_norm": 0.43929627537727356,
"learning_rate": 6.053767629033713e-06,
"loss": 2.874,
"step": 11620
},
{
"epoch": 0.8583025830258303,
"grad_norm": 0.4439617097377777,
"learning_rate": 5.992477846762895e-06,
"loss": 2.8252,
"step": 11630
},
{
"epoch": 0.8590405904059041,
"grad_norm": 0.4464716613292694,
"learning_rate": 5.931480118259003e-06,
"loss": 2.78,
"step": 11640
},
{
"epoch": 0.8597785977859779,
"grad_norm": 0.43279653787612915,
"learning_rate": 5.870774848330485e-06,
"loss": 2.749,
"step": 11650
},
{
"epoch": 0.8605166051660517,
"grad_norm": 0.4490513503551483,
"learning_rate": 5.810362439844896e-06,
"loss": 2.841,
"step": 11660
},
{
"epoch": 0.8612546125461255,
"grad_norm": 0.4711556136608124,
"learning_rate": 5.750243293726226e-06,
"loss": 2.7801,
"step": 11670
},
{
"epoch": 0.8619926199261992,
"grad_norm": 0.4525899887084961,
"learning_rate": 5.690417808952242e-06,
"loss": 2.8942,
"step": 11680
},
{
"epoch": 0.862730627306273,
"grad_norm": 0.44727823138237,
"learning_rate": 5.6308863825518425e-06,
"loss": 2.8095,
"step": 11690
},
{
"epoch": 0.8634686346863468,
"grad_norm": 0.43965160846710205,
"learning_rate": 5.571649409602436e-06,
"loss": 2.8073,
"step": 11700
},
{
"epoch": 0.8642066420664206,
"grad_norm": 0.45212361216545105,
"learning_rate": 5.512707283227275e-06,
"loss": 2.8849,
"step": 11710
},
{
"epoch": 0.8649446494464945,
"grad_norm": 0.4664202332496643,
"learning_rate": 5.454060394592919e-06,
"loss": 2.8199,
"step": 11720
},
{
"epoch": 0.8656826568265683,
"grad_norm": 0.4387909471988678,
"learning_rate": 5.395709132906568e-06,
"loss": 2.8372,
"step": 11730
},
{
"epoch": 0.8664206642066421,
"grad_norm": 0.4543474018573761,
"learning_rate": 5.337653885413513e-06,
"loss": 2.8331,
"step": 11740
},
{
"epoch": 0.8671586715867159,
"grad_norm": 0.45128577947616577,
"learning_rate": 5.279895037394566e-06,
"loss": 2.8062,
"step": 11750
},
{
"epoch": 0.8678966789667897,
"grad_norm": 0.4404621422290802,
"learning_rate": 5.222432972163482e-06,
"loss": 2.9088,
"step": 11760
},
{
"epoch": 0.8686346863468635,
"grad_norm": 0.4398937225341797,
"learning_rate": 5.165268071064455e-06,
"loss": 2.7826,
"step": 11770
},
{
"epoch": 0.8693726937269373,
"grad_norm": 0.4395955204963684,
"learning_rate": 5.108400713469546e-06,
"loss": 2.8196,
"step": 11780
},
{
"epoch": 0.870110701107011,
"grad_norm": 0.43461933732032776,
"learning_rate": 5.051831276776203e-06,
"loss": 2.8663,
"step": 11790
},
{
"epoch": 0.8708487084870848,
"grad_norm": 0.4447794258594513,
"learning_rate": 4.995560136404709e-06,
"loss": 2.8519,
"step": 11800
},
{
"epoch": 0.8715867158671586,
"grad_norm": 0.4266679286956787,
"learning_rate": 4.939587665795736e-06,
"loss": 2.8062,
"step": 11810
},
{
"epoch": 0.8723247232472324,
"grad_norm": 0.4411248564720154,
"learning_rate": 4.88391423640786e-06,
"loss": 2.8758,
"step": 11820
},
{
"epoch": 0.8730627306273063,
"grad_norm": 0.44381076097488403,
"learning_rate": 4.828540217715066e-06,
"loss": 2.7979,
"step": 11830
},
{
"epoch": 0.8738007380073801,
"grad_norm": 0.44569119811058044,
"learning_rate": 4.773465977204311e-06,
"loss": 2.8081,
"step": 11840
},
{
"epoch": 0.8745387453874539,
"grad_norm": 0.48127833008766174,
"learning_rate": 4.718691880373094e-06,
"loss": 2.8617,
"step": 11850
},
{
"epoch": 0.8752767527675277,
"grad_norm": 0.45613643527030945,
"learning_rate": 4.664218290727035e-06,
"loss": 2.8187,
"step": 11860
},
{
"epoch": 0.8760147601476015,
"grad_norm": 0.440491646528244,
"learning_rate": 4.610045569777444e-06,
"loss": 2.8023,
"step": 11870
},
{
"epoch": 0.8767527675276753,
"grad_norm": 0.4358707368373871,
"learning_rate": 4.5561740770389275e-06,
"loss": 2.8102,
"step": 11880
},
{
"epoch": 0.8774907749077491,
"grad_norm": 0.43503841757774353,
"learning_rate": 4.502604170027019e-06,
"loss": 2.8204,
"step": 11890
},
{
"epoch": 0.8782287822878229,
"grad_norm": 0.4486919343471527,
"learning_rate": 4.449336204255777e-06,
"loss": 2.8827,
"step": 11900
},
{
"epoch": 0.8789667896678967,
"grad_norm": 0.43869447708129883,
"learning_rate": 4.396370533235455e-06,
"loss": 2.8374,
"step": 11910
},
{
"epoch": 0.8797047970479704,
"grad_norm": 0.45128440856933594,
"learning_rate": 4.343707508470135e-06,
"loss": 2.8906,
"step": 11920
},
{
"epoch": 0.8804428044280442,
"grad_norm": 0.46216467022895813,
"learning_rate": 4.291347479455405e-06,
"loss": 2.8381,
"step": 11930
},
{
"epoch": 0.8811808118081181,
"grad_norm": 0.4366297721862793,
"learning_rate": 4.2392907936760265e-06,
"loss": 2.8183,
"step": 11940
},
{
"epoch": 0.8819188191881919,
"grad_norm": 0.45038753747940063,
"learning_rate": 4.187537796603658e-06,
"loss": 2.7906,
"step": 11950
},
{
"epoch": 0.8826568265682657,
"grad_norm": 0.45959797501564026,
"learning_rate": 4.136088831694524e-06,
"loss": 2.8724,
"step": 11960
},
{
"epoch": 0.8833948339483395,
"grad_norm": 0.4413219392299652,
"learning_rate": 4.084944240387168e-06,
"loss": 2.8541,
"step": 11970
},
{
"epoch": 0.8841328413284133,
"grad_norm": 0.47469910979270935,
"learning_rate": 4.034104362100155e-06,
"loss": 2.9288,
"step": 11980
},
{
"epoch": 0.8848708487084871,
"grad_norm": 0.43708014488220215,
"learning_rate": 3.983569534229864e-06,
"loss": 2.7833,
"step": 11990
},
{
"epoch": 0.8856088560885609,
"grad_norm": 0.44569307565689087,
"learning_rate": 3.933340092148202e-06,
"loss": 2.8684,
"step": 12000
},
{
"epoch": 0.8863468634686347,
"grad_norm": 0.462568998336792,
"learning_rate": 3.883416369200399e-06,
"loss": 2.8399,
"step": 12010
},
{
"epoch": 0.8870848708487085,
"grad_norm": 0.4384634494781494,
"learning_rate": 3.8337986967028e-06,
"loss": 2.837,
"step": 12020
},
{
"epoch": 0.8878228782287823,
"grad_norm": 0.46717679500579834,
"learning_rate": 3.7844874039406674e-06,
"loss": 2.8523,
"step": 12030
},
{
"epoch": 0.888560885608856,
"grad_norm": 0.4314653277397156,
"learning_rate": 3.7354828181659695e-06,
"loss": 2.8815,
"step": 12040
},
{
"epoch": 0.8892988929889298,
"grad_norm": 0.43344810605049133,
"learning_rate": 3.6867852645952494e-06,
"loss": 2.7918,
"step": 12050
},
{
"epoch": 0.8900369003690037,
"grad_norm": 0.46255967020988464,
"learning_rate": 3.6383950664074405e-06,
"loss": 2.8106,
"step": 12060
},
{
"epoch": 0.8907749077490775,
"grad_norm": 0.44985824823379517,
"learning_rate": 3.5903125447417196e-06,
"loss": 2.8244,
"step": 12070
},
{
"epoch": 0.8915129151291513,
"grad_norm": 0.441011518239975,
"learning_rate": 3.5425380186953904e-06,
"loss": 2.8061,
"step": 12080
},
{
"epoch": 0.8922509225092251,
"grad_norm": 0.4453372359275818,
"learning_rate": 3.495071805321759e-06,
"loss": 2.9384,
"step": 12090
},
{
"epoch": 0.8929889298892989,
"grad_norm": 0.43761390447616577,
"learning_rate": 3.447914219628029e-06,
"loss": 2.7863,
"step": 12100
},
{
"epoch": 0.8937269372693727,
"grad_norm": 0.4433492124080658,
"learning_rate": 3.4010655745731865e-06,
"loss": 2.8553,
"step": 12110
},
{
"epoch": 0.8944649446494465,
"grad_norm": 0.43299391865730286,
"learning_rate": 3.354526181066003e-06,
"loss": 2.7823,
"step": 12120
},
{
"epoch": 0.8952029520295203,
"grad_norm": 0.45678773522377014,
"learning_rate": 3.308296347962875e-06,
"loss": 2.7281,
"step": 12130
},
{
"epoch": 0.8959409594095941,
"grad_norm": 0.4413972795009613,
"learning_rate": 3.2623763820658237e-06,
"loss": 2.8478,
"step": 12140
},
{
"epoch": 0.8966789667896679,
"grad_norm": 0.44608476758003235,
"learning_rate": 3.2167665881204567e-06,
"loss": 2.7823,
"step": 12150
},
{
"epoch": 0.8974169741697416,
"grad_norm": 0.4420614242553711,
"learning_rate": 3.171467268813938e-06,
"loss": 2.8281,
"step": 12160
},
{
"epoch": 0.8981549815498155,
"grad_norm": 0.4385377764701843,
"learning_rate": 3.1264787247729908e-06,
"loss": 2.7918,
"step": 12170
},
{
"epoch": 0.8988929889298893,
"grad_norm": 0.44008246064186096,
"learning_rate": 3.0818012545618835e-06,
"loss": 2.793,
"step": 12180
},
{
"epoch": 0.8996309963099631,
"grad_norm": 0.44634199142456055,
"learning_rate": 3.0374351546804514e-06,
"loss": 2.7829,
"step": 12190
},
{
"epoch": 0.9003690036900369,
"grad_norm": 0.4375803768634796,
"learning_rate": 2.9933807195621445e-06,
"loss": 2.8107,
"step": 12200
},
{
"epoch": 0.9011070110701107,
"grad_norm": 0.4388578534126282,
"learning_rate": 2.9496382415720723e-06,
"loss": 2.8524,
"step": 12210
},
{
"epoch": 0.9018450184501845,
"grad_norm": 0.43253517150878906,
"learning_rate": 2.9062080110050515e-06,
"loss": 2.8215,
"step": 12220
},
{
"epoch": 0.9025830258302583,
"grad_norm": 0.4246656894683838,
"learning_rate": 2.8630903160836773e-06,
"loss": 2.835,
"step": 12230
},
{
"epoch": 0.9033210332103321,
"grad_norm": 0.4635641872882843,
"learning_rate": 2.820285442956422e-06,
"loss": 2.829,
"step": 12240
},
{
"epoch": 0.9040590405904059,
"grad_norm": 0.4323824644088745,
"learning_rate": 2.7777936756957333e-06,
"loss": 2.7945,
"step": 12250
},
{
"epoch": 0.9047970479704797,
"grad_norm": 0.4489029347896576,
"learning_rate": 2.7356152962961567e-06,
"loss": 2.8904,
"step": 12260
},
{
"epoch": 0.9055350553505535,
"grad_norm": 0.4545091390609741,
"learning_rate": 2.6937505846724165e-06,
"loss": 2.8889,
"step": 12270
},
{
"epoch": 0.9062730627306274,
"grad_norm": 0.4438563585281372,
"learning_rate": 2.6521998186576357e-06,
"loss": 2.836,
"step": 12280
},
{
"epoch": 0.9070110701107011,
"grad_norm": 0.4264052212238312,
"learning_rate": 2.610963274001438e-06,
"loss": 2.7639,
"step": 12290
},
{
"epoch": 0.9077490774907749,
"grad_norm": 0.4508605897426605,
"learning_rate": 2.5700412243681417e-06,
"loss": 2.7735,
"step": 12300
},
{
"epoch": 0.9084870848708487,
"grad_norm": 0.4573262929916382,
"learning_rate": 2.5294339413349076e-06,
"loss": 2.8901,
"step": 12310
},
{
"epoch": 0.9092250922509225,
"grad_norm": 0.4440000057220459,
"learning_rate": 2.4891416943900014e-06,
"loss": 2.8662,
"step": 12320
},
{
"epoch": 0.9099630996309963,
"grad_norm": 0.4513186812400818,
"learning_rate": 2.449164750930938e-06,
"loss": 2.8268,
"step": 12330
},
{
"epoch": 0.9107011070110701,
"grad_norm": 0.43622398376464844,
"learning_rate": 2.409503376262762e-06,
"loss": 2.8246,
"step": 12340
},
{
"epoch": 0.9114391143911439,
"grad_norm": 0.44066351652145386,
"learning_rate": 2.3701578335962206e-06,
"loss": 2.7924,
"step": 12350
},
{
"epoch": 0.9121771217712177,
"grad_norm": 0.4405202269554138,
"learning_rate": 2.3311283840460994e-06,
"loss": 2.8639,
"step": 12360
},
{
"epoch": 0.9129151291512915,
"grad_norm": 0.4488193094730377,
"learning_rate": 2.292415286629418e-06,
"loss": 2.8531,
"step": 12370
},
{
"epoch": 0.9136531365313653,
"grad_norm": 0.4245339632034302,
"learning_rate": 2.254018798263763e-06,
"loss": 2.8349,
"step": 12380
},
{
"epoch": 0.9143911439114392,
"grad_norm": 0.43623387813568115,
"learning_rate": 2.2159391737655466e-06,
"loss": 2.8225,
"step": 12390
},
{
"epoch": 0.915129151291513,
"grad_norm": 0.4482229948043823,
"learning_rate": 2.1781766658483303e-06,
"loss": 2.7716,
"step": 12400
},
{
"epoch": 0.9158671586715867,
"grad_norm": 0.450795441865921,
"learning_rate": 2.1407315251211422e-06,
"loss": 2.7796,
"step": 12410
},
{
"epoch": 0.9166051660516605,
"grad_norm": 0.45314326882362366,
"learning_rate": 2.103604000086856e-06,
"loss": 2.8009,
"step": 12420
},
{
"epoch": 0.9173431734317343,
"grad_norm": 0.44693273305892944,
"learning_rate": 2.066794337140443e-06,
"loss": 2.8486,
"step": 12430
},
{
"epoch": 0.9180811808118081,
"grad_norm": 0.43216079473495483,
"learning_rate": 2.0303027805674445e-06,
"loss": 2.7234,
"step": 12440
},
{
"epoch": 0.9188191881918819,
"grad_norm": 0.45111674070358276,
"learning_rate": 1.994129572542286e-06,
"loss": 2.7963,
"step": 12450
},
{
"epoch": 0.9195571955719557,
"grad_norm": 0.46144166588783264,
"learning_rate": 1.958274953126693e-06,
"loss": 2.8314,
"step": 12460
},
{
"epoch": 0.9202952029520295,
"grad_norm": 0.45646706223487854,
"learning_rate": 1.922739160268089e-06,
"loss": 2.8796,
"step": 12470
},
{
"epoch": 0.9210332103321033,
"grad_norm": 0.49224853515625,
"learning_rate": 1.8875224297980332e-06,
"loss": 2.7904,
"step": 12480
},
{
"epoch": 0.9217712177121771,
"grad_norm": 0.44804316759109497,
"learning_rate": 1.8526249954306241e-06,
"loss": 2.7583,
"step": 12490
},
{
"epoch": 0.922509225092251,
"grad_norm": 0.43229466676712036,
"learning_rate": 1.8180470887609769e-06,
"loss": 2.8608,
"step": 12500
},
{
"epoch": 0.9232472324723248,
"grad_norm": 0.43958374857902527,
"learning_rate": 1.7837889392636864e-06,
"loss": 2.8282,
"step": 12510
},
{
"epoch": 0.9239852398523986,
"grad_norm": 0.4417596459388733,
"learning_rate": 1.7498507742912784e-06,
"loss": 2.8048,
"step": 12520
},
{
"epoch": 0.9247232472324723,
"grad_norm": 0.4306926429271698,
"learning_rate": 1.7162328190727217e-06,
"loss": 2.8095,
"step": 12530
},
{
"epoch": 0.9254612546125461,
"grad_norm": 0.439455509185791,
"learning_rate": 1.682935296711935e-06,
"loss": 2.7822,
"step": 12540
},
{
"epoch": 0.9261992619926199,
"grad_norm": 0.4519449472427368,
"learning_rate": 1.6499584281862935e-06,
"loss": 2.8494,
"step": 12550
},
{
"epoch": 0.9269372693726937,
"grad_norm": 0.4483802318572998,
"learning_rate": 1.6173024323451747e-06,
"loss": 2.8629,
"step": 12560
},
{
"epoch": 0.9276752767527675,
"grad_norm": 0.4460211396217346,
"learning_rate": 1.5849675259084872e-06,
"loss": 2.8258,
"step": 12570
},
{
"epoch": 0.9284132841328413,
"grad_norm": 0.43958115577697754,
"learning_rate": 1.5529539234652668e-06,
"loss": 2.8093,
"step": 12580
},
{
"epoch": 0.9291512915129151,
"grad_norm": 0.46250835061073303,
"learning_rate": 1.5212618374722155e-06,
"loss": 2.828,
"step": 12590
},
{
"epoch": 0.9298892988929889,
"grad_norm": 0.46097636222839355,
"learning_rate": 1.4898914782523143e-06,
"loss": 2.8305,
"step": 12600
},
{
"epoch": 0.9306273062730628,
"grad_norm": 0.4385923445224762,
"learning_rate": 1.458843053993403e-06,
"loss": 2.7875,
"step": 12610
},
{
"epoch": 0.9313653136531366,
"grad_norm": 0.44254031777381897,
"learning_rate": 1.4281167707468457e-06,
"loss": 2.8113,
"step": 12620
},
{
"epoch": 0.9321033210332104,
"grad_norm": 0.4598987102508545,
"learning_rate": 1.3977128324261068e-06,
"loss": 2.8511,
"step": 12630
},
{
"epoch": 0.9328413284132842,
"grad_norm": 0.4526178240776062,
"learning_rate": 1.3676314408054391e-06,
"loss": 2.7979,
"step": 12640
},
{
"epoch": 0.933579335793358,
"grad_norm": 0.45094090700149536,
"learning_rate": 1.3378727955185244e-06,
"loss": 2.8319,
"step": 12650
},
{
"epoch": 0.9343173431734317,
"grad_norm": 0.45027512311935425,
"learning_rate": 1.3084370940571577e-06,
"loss": 2.8245,
"step": 12660
},
{
"epoch": 0.9350553505535055,
"grad_norm": 0.4329124391078949,
"learning_rate": 1.2793245317699321e-06,
"loss": 2.7542,
"step": 12670
},
{
"epoch": 0.9357933579335793,
"grad_norm": 0.4586227536201477,
"learning_rate": 1.2505353018609444e-06,
"loss": 2.7729,
"step": 12680
},
{
"epoch": 0.9365313653136531,
"grad_norm": 0.4397171437740326,
"learning_rate": 1.2220695953885031e-06,
"loss": 2.8164,
"step": 12690
},
{
"epoch": 0.9372693726937269,
"grad_norm": 0.4415930211544037,
"learning_rate": 1.1939276012638723e-06,
"loss": 2.8644,
"step": 12700
},
{
"epoch": 0.9380073800738007,
"grad_norm": 0.43980923295021057,
"learning_rate": 1.1661095062500237e-06,
"loss": 2.8716,
"step": 12710
},
{
"epoch": 0.9387453874538746,
"grad_norm": 0.46194180846214294,
"learning_rate": 1.1386154949603934e-06,
"loss": 2.8307,
"step": 12720
},
{
"epoch": 0.9394833948339484,
"grad_norm": 0.4496355652809143,
"learning_rate": 1.1114457498576258e-06,
"loss": 2.7868,
"step": 12730
},
{
"epoch": 0.9402214022140222,
"grad_norm": 0.4483359456062317,
"learning_rate": 1.0846004512524211e-06,
"loss": 2.8357,
"step": 12740
},
{
"epoch": 0.940959409594096,
"grad_norm": 0.44404512643814087,
"learning_rate": 1.0580797773022733e-06,
"loss": 2.8843,
"step": 12750
},
{
"epoch": 0.9416974169741698,
"grad_norm": 0.4440787136554718,
"learning_rate": 1.03188390401035e-06,
"loss": 2.8038,
"step": 12760
},
{
"epoch": 0.9424354243542435,
"grad_norm": 0.4445192813873291,
"learning_rate": 1.006013005224271e-06,
"loss": 2.813,
"step": 12770
},
{
"epoch": 0.9431734317343173,
"grad_norm": 0.4234587550163269,
"learning_rate": 9.80467252634998e-07,
"loss": 2.8414,
"step": 12780
},
{
"epoch": 0.9439114391143911,
"grad_norm": 0.4393916726112366,
"learning_rate": 9.552468157756622e-07,
"loss": 2.7851,
"step": 12790
},
{
"epoch": 0.9446494464944649,
"grad_norm": 0.4591200053691864,
"learning_rate": 9.303518620204677e-07,
"loss": 2.8378,
"step": 12800
},
{
"epoch": 0.9453874538745387,
"grad_norm": 0.43322470784187317,
"learning_rate": 9.057825565835399e-07,
"loss": 2.7366,
"step": 12810
},
{
"epoch": 0.9461254612546125,
"grad_norm": 0.4324533939361572,
"learning_rate": 8.815390625178887e-07,
"loss": 2.7483,
"step": 12820
},
{
"epoch": 0.9468634686346864,
"grad_norm": 0.4632011950016022,
"learning_rate": 8.576215407142651e-07,
"loss": 2.7874,
"step": 12830
},
{
"epoch": 0.9476014760147602,
"grad_norm": 0.4332893490791321,
"learning_rate": 8.340301499001446e-07,
"loss": 2.8252,
"step": 12840
},
{
"epoch": 0.948339483394834,
"grad_norm": 0.436294287443161,
"learning_rate": 8.107650466386285e-07,
"loss": 2.8445,
"step": 12850
},
{
"epoch": 0.9490774907749078,
"grad_norm": 0.43967026472091675,
"learning_rate": 7.878263853274281e-07,
"loss": 2.8411,
"step": 12860
},
{
"epoch": 0.9498154981549816,
"grad_norm": 0.45120909810066223,
"learning_rate": 7.652143181978655e-07,
"loss": 2.8118,
"step": 12870
},
{
"epoch": 0.9505535055350554,
"grad_norm": 0.4368390738964081,
"learning_rate": 7.429289953138019e-07,
"loss": 2.8086,
"step": 12880
},
{
"epoch": 0.9512915129151291,
"grad_norm": 0.4452465772628784,
"learning_rate": 7.209705645706944e-07,
"loss": 2.8468,
"step": 12890
},
{
"epoch": 0.9520295202952029,
"grad_norm": 0.4445231258869171,
"learning_rate": 6.993391716946019e-07,
"loss": 2.8114,
"step": 12900
},
{
"epoch": 0.9527675276752767,
"grad_norm": 0.43402281403541565,
"learning_rate": 6.780349602411918e-07,
"loss": 2.8352,
"step": 12910
},
{
"epoch": 0.9535055350553505,
"grad_norm": 0.45803192257881165,
"learning_rate": 6.570580715948404e-07,
"loss": 2.8013,
"step": 12920
},
{
"epoch": 0.9542435424354243,
"grad_norm": 0.45193520188331604,
"learning_rate": 6.364086449676232e-07,
"loss": 2.8368,
"step": 12930
},
{
"epoch": 0.9549815498154982,
"grad_norm": 0.44040247797966003,
"learning_rate": 6.160868173984591e-07,
"loss": 2.8559,
"step": 12940
},
{
"epoch": 0.955719557195572,
"grad_norm": 0.4719098210334778,
"learning_rate": 5.960927237521563e-07,
"loss": 2.85,
"step": 12950
},
{
"epoch": 0.9564575645756458,
"grad_norm": 0.4502539336681366,
"learning_rate": 5.764264967185462e-07,
"loss": 2.9074,
"step": 12960
},
{
"epoch": 0.9571955719557196,
"grad_norm": 0.4299696683883667,
"learning_rate": 5.570882668115784e-07,
"loss": 2.7595,
"step": 12970
},
{
"epoch": 0.9579335793357934,
"grad_norm": 0.44181373715400696,
"learning_rate": 5.380781623684661e-07,
"loss": 2.8024,
"step": 12980
},
{
"epoch": 0.9586715867158672,
"grad_norm": 0.437763512134552,
"learning_rate": 5.193963095488419e-07,
"loss": 2.8231,
"step": 12990
},
{
"epoch": 0.959409594095941,
"grad_norm": 0.4234910011291504,
"learning_rate": 5.010428323339033e-07,
"loss": 2.8898,
"step": 13000
},
{
"epoch": 0.9601476014760147,
"grad_norm": 0.45260801911354065,
"learning_rate": 4.830178525256079e-07,
"loss": 2.8558,
"step": 13010
},
{
"epoch": 0.9608856088560885,
"grad_norm": 0.4440422058105469,
"learning_rate": 4.653214897458513e-07,
"loss": 2.8007,
"step": 13020
},
{
"epoch": 0.9616236162361623,
"grad_norm": 0.4362104833126068,
"learning_rate": 4.4795386143567374e-07,
"loss": 2.8271,
"step": 13030
},
{
"epoch": 0.9623616236162361,
"grad_norm": 0.44079768657684326,
"learning_rate": 4.309150828544939e-07,
"loss": 2.8371,
"step": 13040
},
{
"epoch": 0.9630996309963099,
"grad_norm": 0.46145325899124146,
"learning_rate": 4.1420526707933727e-07,
"loss": 2.8808,
"step": 13050
},
{
"epoch": 0.9638376383763838,
"grad_norm": 0.4297032058238983,
"learning_rate": 3.978245250040702e-07,
"loss": 2.8506,
"step": 13060
},
{
"epoch": 0.9645756457564576,
"grad_norm": 0.4474189579486847,
"learning_rate": 3.817729653386892e-07,
"loss": 2.8261,
"step": 13070
},
{
"epoch": 0.9653136531365314,
"grad_norm": 0.43458986282348633,
"learning_rate": 3.660506946085829e-07,
"loss": 2.8319,
"step": 13080
},
{
"epoch": 0.9660516605166052,
"grad_norm": 0.4418502151966095,
"learning_rate": 3.506578171538377e-07,
"loss": 2.8326,
"step": 13090
},
{
"epoch": 0.966789667896679,
"grad_norm": 0.4373183846473694,
"learning_rate": 3.355944351285278e-07,
"loss": 2.7896,
"step": 13100
},
{
"epoch": 0.9675276752767528,
"grad_norm": 0.4467260241508484,
"learning_rate": 3.2086064850004314e-07,
"loss": 2.8499,
"step": 13110
},
{
"epoch": 0.9682656826568266,
"grad_norm": 0.45079532265663147,
"learning_rate": 3.064565550484455e-07,
"loss": 2.8005,
"step": 13120
},
{
"epoch": 0.9690036900369003,
"grad_norm": 0.4311223328113556,
"learning_rate": 2.9238225036579693e-07,
"loss": 2.8419,
"step": 13130
},
{
"epoch": 0.9697416974169741,
"grad_norm": 0.4524695575237274,
"learning_rate": 2.7863782785552685e-07,
"loss": 2.8581,
"step": 13140
},
{
"epoch": 0.9704797047970479,
"grad_norm": 0.4483130872249603,
"learning_rate": 2.65223378731827e-07,
"loss": 2.8275,
"step": 13150
},
{
"epoch": 0.9712177121771217,
"grad_norm": 0.4370816946029663,
"learning_rate": 2.521389920190298e-07,
"loss": 2.8673,
"step": 13160
},
{
"epoch": 0.9719557195571956,
"grad_norm": 0.444195032119751,
"learning_rate": 2.3938475455103083e-07,
"loss": 2.9407,
"step": 13170
},
{
"epoch": 0.9726937269372694,
"grad_norm": 0.44004592299461365,
"learning_rate": 2.269607509707006e-07,
"loss": 2.8481,
"step": 13180
},
{
"epoch": 0.9734317343173432,
"grad_norm": 0.44630327820777893,
"learning_rate": 2.1486706372932375e-07,
"loss": 2.7954,
"step": 13190
},
{
"epoch": 0.974169741697417,
"grad_norm": 0.42796429991722107,
"learning_rate": 2.031037730860774e-07,
"loss": 2.8533,
"step": 13200
},
{
"epoch": 0.9749077490774908,
"grad_norm": 0.4611528217792511,
"learning_rate": 1.916709571074482e-07,
"loss": 2.8151,
"step": 13210
},
{
"epoch": 0.9756457564575646,
"grad_norm": 0.451028972864151,
"learning_rate": 1.8056869166677703e-07,
"loss": 2.8355,
"step": 13220
},
{
"epoch": 0.9763837638376384,
"grad_norm": 0.4451844096183777,
"learning_rate": 1.6979705044369297e-07,
"loss": 2.8121,
"step": 13230
},
{
"epoch": 0.9771217712177122,
"grad_norm": 0.4613220989704132,
"learning_rate": 1.5935610492366915e-07,
"loss": 2.9067,
"step": 13240
},
{
"epoch": 0.977859778597786,
"grad_norm": 0.44495347142219543,
"learning_rate": 1.4924592439753416e-07,
"loss": 2.7666,
"step": 13250
},
{
"epoch": 0.9785977859778597,
"grad_norm": 0.4585348963737488,
"learning_rate": 1.394665759610003e-07,
"loss": 2.7254,
"step": 13260
},
{
"epoch": 0.9793357933579335,
"grad_norm": 0.43729352951049805,
"learning_rate": 1.3001812451423068e-07,
"loss": 2.778,
"step": 13270
},
{
"epoch": 0.9800738007380074,
"grad_norm": 0.450089693069458,
"learning_rate": 1.209006327614226e-07,
"loss": 2.809,
"step": 13280
},
{
"epoch": 0.9808118081180812,
"grad_norm": 0.43959712982177734,
"learning_rate": 1.1211416121035823e-07,
"loss": 2.8325,
"step": 13290
},
{
"epoch": 0.981549815498155,
"grad_norm": 0.4504597783088684,
"learning_rate": 1.036587681720269e-07,
"loss": 2.7841,
"step": 13300
},
{
"epoch": 0.9822878228782288,
"grad_norm": 0.44741228222846985,
"learning_rate": 9.55345097602256e-08,
"loss": 2.8358,
"step": 13310
},
{
"epoch": 0.9830258302583026,
"grad_norm": 0.4463639557361603,
"learning_rate": 8.774143989119798e-08,
"loss": 2.8313,
"step": 13320
},
{
"epoch": 0.9837638376383764,
"grad_norm": 0.4775594472885132,
"learning_rate": 8.027961028328479e-08,
"loss": 2.8781,
"step": 13330
},
{
"epoch": 0.9845018450184502,
"grad_norm": 0.4243060350418091,
"learning_rate": 7.314907045653519e-08,
"loss": 2.7926,
"step": 13340
},
{
"epoch": 0.985239852398524,
"grad_norm": 0.43475958704948425,
"learning_rate": 6.634986773244034e-08,
"loss": 2.7885,
"step": 13350
},
{
"epoch": 0.9859778597785978,
"grad_norm": 0.4415262043476105,
"learning_rate": 5.988204723356705e-08,
"loss": 2.7721,
"step": 13360
},
{
"epoch": 0.9867158671586715,
"grad_norm": 0.438672810792923,
"learning_rate": 5.374565188329683e-08,
"loss": 2.8138,
"step": 13370
},
{
"epoch": 0.9874538745387453,
"grad_norm": 0.46068814396858215,
"learning_rate": 4.794072240550951e-08,
"loss": 2.7988,
"step": 13380
},
{
"epoch": 0.9881918819188192,
"grad_norm": 0.44185954332351685,
"learning_rate": 4.246729732434451e-08,
"loss": 2.7823,
"step": 13390
},
{
"epoch": 0.988929889298893,
"grad_norm": 0.4282056391239166,
"learning_rate": 3.7325412963912235e-08,
"loss": 2.872,
"step": 13400
},
{
"epoch": 0.9896678966789668,
"grad_norm": 0.46537652611732483,
"learning_rate": 3.251510344807751e-08,
"loss": 2.9374,
"step": 13410
},
{
"epoch": 0.9904059040590406,
"grad_norm": 0.4430101215839386,
"learning_rate": 2.8036400700232058e-08,
"loss": 2.7839,
"step": 13420
},
{
"epoch": 0.9911439114391144,
"grad_norm": 0.45416316390037537,
"learning_rate": 2.3889334443055744e-08,
"loss": 2.8689,
"step": 13430
},
{
"epoch": 0.9918819188191882,
"grad_norm": 0.4388124346733093,
"learning_rate": 2.007393219836118e-08,
"loss": 2.9239,
"step": 13440
},
{
"epoch": 0.992619926199262,
"grad_norm": 0.43018996715545654,
"learning_rate": 1.6590219286871655e-08,
"loss": 2.8412,
"step": 13450
},
{
"epoch": 0.9933579335793358,
"grad_norm": 0.42218539118766785,
"learning_rate": 1.3438218828076832e-08,
"loss": 2.7462,
"step": 13460
},
{
"epoch": 0.9940959409594096,
"grad_norm": 0.4494752883911133,
"learning_rate": 1.0617951740077292e-08,
"loss": 2.8598,
"step": 13470
},
{
"epoch": 0.9948339483394834,
"grad_norm": 0.41235294938087463,
"learning_rate": 8.12943673943467e-09,
"loss": 2.8083,
"step": 13480
},
{
"epoch": 0.9955719557195571,
"grad_norm": 0.4434475004673004,
"learning_rate": 5.9726903410661786e-09,
"loss": 2.929,
"step": 13490
},
{
"epoch": 0.996309963099631,
"grad_norm": 0.43739476799964905,
"learning_rate": 4.147726858100276e-09,
"loss": 2.844,
"step": 13500
},
{
"epoch": 0.9970479704797048,
"grad_norm": 0.46633192896842957,
"learning_rate": 2.6545584018211613e-09,
"loss": 2.8096,
"step": 13510
},
{
"epoch": 0.9977859778597786,
"grad_norm": 0.4500004053115845,
"learning_rate": 1.4931948815744e-09,
"loss": 2.8317,
"step": 13520
},
{
"epoch": 0.9985239852398524,
"grad_norm": 0.45538780093193054,
"learning_rate": 6.636440046892123e-10,
"loss": 2.8792,
"step": 13530
},
{
"epoch": 0.9992619926199262,
"grad_norm": 0.4632636308670044,
"learning_rate": 1.6591127643961202e-10,
"loss": 2.8205,
"step": 13540
},
{
"epoch": 1.0,
"grad_norm": 0.4356023073196411,
"learning_rate": 0.0,
"loss": 2.8161,
"step": 13550
},
{
"epoch": 1.0,
"step": 13550,
"total_flos": 5.404563590201999e+18,
"train_loss": 3.236852196415412,
"train_runtime": 292848.6684,
"train_samples_per_second": 0.74,
"train_steps_per_second": 0.046
}
],
"logging_steps": 10,
"max_steps": 13550,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 4000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.404563590201999e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}