amasia-test-jvurzlcg / trainer_state.json
conan1024hao's picture
Upload folder using huggingface_hub
91368cd verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3472222222222222,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006944444444444445,
"grad_norm": 0.38245001435279846,
"learning_rate": 9.090909090909091e-07,
"loss": 1.5265,
"step": 1
},
{
"epoch": 0.001388888888888889,
"grad_norm": 0.36873823404312134,
"learning_rate": 1.8181818181818183e-06,
"loss": 1.5149,
"step": 2
},
{
"epoch": 0.0020833333333333333,
"grad_norm": 0.31270918250083923,
"learning_rate": 2.7272727272727272e-06,
"loss": 1.478,
"step": 3
},
{
"epoch": 0.002777777777777778,
"grad_norm": 0.21180707216262817,
"learning_rate": 3.6363636363636366e-06,
"loss": 1.4297,
"step": 4
},
{
"epoch": 0.003472222222222222,
"grad_norm": 0.10717298090457916,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.3724,
"step": 5
},
{
"epoch": 0.004166666666666667,
"grad_norm": 0.12929688394069672,
"learning_rate": 5.4545454545454545e-06,
"loss": 1.3816,
"step": 6
},
{
"epoch": 0.004861111111111111,
"grad_norm": 0.10623333603143692,
"learning_rate": 6.363636363636364e-06,
"loss": 1.3061,
"step": 7
},
{
"epoch": 0.005555555555555556,
"grad_norm": 0.07657431811094284,
"learning_rate": 7.272727272727273e-06,
"loss": 1.2736,
"step": 8
},
{
"epoch": 0.00625,
"grad_norm": 0.05140658840537071,
"learning_rate": 8.181818181818183e-06,
"loss": 1.2767,
"step": 9
},
{
"epoch": 0.006944444444444444,
"grad_norm": 0.04951579496264458,
"learning_rate": 9.090909090909091e-06,
"loss": 1.2659,
"step": 10
},
{
"epoch": 0.007638888888888889,
"grad_norm": 0.035326242446899414,
"learning_rate": 1e-05,
"loss": 1.1883,
"step": 11
},
{
"epoch": 0.008333333333333333,
"grad_norm": 0.03807157278060913,
"learning_rate": 1.0909090909090909e-05,
"loss": 1.2452,
"step": 12
},
{
"epoch": 0.009027777777777777,
"grad_norm": 0.03424696624279022,
"learning_rate": 1.181818181818182e-05,
"loss": 1.2394,
"step": 13
},
{
"epoch": 0.009722222222222222,
"grad_norm": 0.026057492941617966,
"learning_rate": 1.2727272727272728e-05,
"loss": 1.157,
"step": 14
},
{
"epoch": 0.010416666666666666,
"grad_norm": 0.024454891681671143,
"learning_rate": 1.3636363636363637e-05,
"loss": 1.1901,
"step": 15
},
{
"epoch": 0.011111111111111112,
"grad_norm": 0.02439827099442482,
"learning_rate": 1.4545454545454546e-05,
"loss": 1.1669,
"step": 16
},
{
"epoch": 0.011805555555555555,
"grad_norm": 0.023934854194521904,
"learning_rate": 1.5454545454545454e-05,
"loss": 1.1833,
"step": 17
},
{
"epoch": 0.0125,
"grad_norm": 0.023052480071783066,
"learning_rate": 1.6363636363636366e-05,
"loss": 1.1777,
"step": 18
},
{
"epoch": 0.013194444444444444,
"grad_norm": 0.0216195210814476,
"learning_rate": 1.7272727272727274e-05,
"loss": 1.1541,
"step": 19
},
{
"epoch": 0.013888888888888888,
"grad_norm": 0.021930918097496033,
"learning_rate": 1.8181818181818182e-05,
"loss": 1.1519,
"step": 20
},
{
"epoch": 0.014583333333333334,
"grad_norm": 0.01956241950392723,
"learning_rate": 1.9090909090909094e-05,
"loss": 1.1289,
"step": 21
},
{
"epoch": 0.015277777777777777,
"grad_norm": 0.019718188792467117,
"learning_rate": 2e-05,
"loss": 1.12,
"step": 22
},
{
"epoch": 0.01597222222222222,
"grad_norm": 0.018330449238419533,
"learning_rate": 2.090909090909091e-05,
"loss": 1.151,
"step": 23
},
{
"epoch": 0.016666666666666666,
"grad_norm": 0.019966386258602142,
"learning_rate": 2.1818181818181818e-05,
"loss": 1.1717,
"step": 24
},
{
"epoch": 0.017361111111111112,
"grad_norm": 0.019648971036076546,
"learning_rate": 2.2727272727272733e-05,
"loss": 1.1188,
"step": 25
},
{
"epoch": 0.018055555555555554,
"grad_norm": 0.018150752410292625,
"learning_rate": 2.363636363636364e-05,
"loss": 1.0927,
"step": 26
},
{
"epoch": 0.01875,
"grad_norm": 0.02000141702592373,
"learning_rate": 2.454545454545455e-05,
"loss": 1.1205,
"step": 27
},
{
"epoch": 0.019444444444444445,
"grad_norm": 0.02152254991233349,
"learning_rate": 2.5454545454545457e-05,
"loss": 1.1136,
"step": 28
},
{
"epoch": 0.02013888888888889,
"grad_norm": 0.016942821443080902,
"learning_rate": 2.6363636363636365e-05,
"loss": 1.1073,
"step": 29
},
{
"epoch": 0.020833333333333332,
"grad_norm": 0.018401362001895905,
"learning_rate": 2.7272727272727273e-05,
"loss": 1.0745,
"step": 30
},
{
"epoch": 0.021527777777777778,
"grad_norm": 0.01856987737119198,
"learning_rate": 2.8181818181818185e-05,
"loss": 1.0699,
"step": 31
},
{
"epoch": 0.022222222222222223,
"grad_norm": 0.014833790250122547,
"learning_rate": 2.9090909090909093e-05,
"loss": 1.0226,
"step": 32
},
{
"epoch": 0.022916666666666665,
"grad_norm": 0.01955495961010456,
"learning_rate": 3.0000000000000004e-05,
"loss": 1.1012,
"step": 33
},
{
"epoch": 0.02361111111111111,
"grad_norm": 0.01699119061231613,
"learning_rate": 3.090909090909091e-05,
"loss": 1.0457,
"step": 34
},
{
"epoch": 0.024305555555555556,
"grad_norm": 0.020263031125068665,
"learning_rate": 3.181818181818182e-05,
"loss": 1.0581,
"step": 35
},
{
"epoch": 0.025,
"grad_norm": 0.015922358259558678,
"learning_rate": 3.272727272727273e-05,
"loss": 1.0958,
"step": 36
},
{
"epoch": 0.025694444444444443,
"grad_norm": 0.016595132648944855,
"learning_rate": 3.363636363636364e-05,
"loss": 1.0425,
"step": 37
},
{
"epoch": 0.02638888888888889,
"grad_norm": 0.018398325890302658,
"learning_rate": 3.454545454545455e-05,
"loss": 1.0893,
"step": 38
},
{
"epoch": 0.027083333333333334,
"grad_norm": 0.017498090863227844,
"learning_rate": 3.545454545454546e-05,
"loss": 1.0487,
"step": 39
},
{
"epoch": 0.027777777777777776,
"grad_norm": 0.016576213762164116,
"learning_rate": 3.6363636363636364e-05,
"loss": 1.0567,
"step": 40
},
{
"epoch": 0.02847222222222222,
"grad_norm": 0.013903766870498657,
"learning_rate": 3.7272727272727276e-05,
"loss": 1.023,
"step": 41
},
{
"epoch": 0.029166666666666667,
"grad_norm": 0.01546618901193142,
"learning_rate": 3.818181818181819e-05,
"loss": 1.0456,
"step": 42
},
{
"epoch": 0.029861111111111113,
"grad_norm": 0.014900722540915012,
"learning_rate": 3.909090909090909e-05,
"loss": 1.0365,
"step": 43
},
{
"epoch": 0.030555555555555555,
"grad_norm": 0.015999475494027138,
"learning_rate": 4e-05,
"loss": 1.0486,
"step": 44
},
{
"epoch": 0.03125,
"grad_norm": 0.01696275919675827,
"learning_rate": 3.999994935591541e-05,
"loss": 1.0602,
"step": 45
},
{
"epoch": 0.03194444444444444,
"grad_norm": 0.016145754605531693,
"learning_rate": 3.999979742391812e-05,
"loss": 1.0709,
"step": 46
},
{
"epoch": 0.03263888888888889,
"grad_norm": 0.015007151290774345,
"learning_rate": 3.999954420477757e-05,
"loss": 1.0273,
"step": 47
},
{
"epoch": 0.03333333333333333,
"grad_norm": 0.016809619963169098,
"learning_rate": 3.9999189699776166e-05,
"loss": 1.062,
"step": 48
},
{
"epoch": 0.034027777777777775,
"grad_norm": 0.017114873975515366,
"learning_rate": 3.9998733910709277e-05,
"loss": 1.0624,
"step": 49
},
{
"epoch": 0.034722222222222224,
"grad_norm": 0.014565072022378445,
"learning_rate": 3.9998176839885196e-05,
"loss": 1.0487,
"step": 50
},
{
"epoch": 0.035416666666666666,
"grad_norm": 0.014314558357000351,
"learning_rate": 3.9997518490125166e-05,
"loss": 1.0673,
"step": 51
},
{
"epoch": 0.03611111111111111,
"grad_norm": 0.01658693701028824,
"learning_rate": 3.999675886476332e-05,
"loss": 1.0328,
"step": 52
},
{
"epoch": 0.03680555555555556,
"grad_norm": 0.014420751482248306,
"learning_rate": 3.9995897967646725e-05,
"loss": 0.9782,
"step": 53
},
{
"epoch": 0.0375,
"grad_norm": 0.017560573294758797,
"learning_rate": 3.999493580313532e-05,
"loss": 1.0367,
"step": 54
},
{
"epoch": 0.03819444444444445,
"grad_norm": 0.017693940550088882,
"learning_rate": 3.9993872376101894e-05,
"loss": 1.0518,
"step": 55
},
{
"epoch": 0.03888888888888889,
"grad_norm": 0.014515440911054611,
"learning_rate": 3.9992707691932067e-05,
"loss": 1.0322,
"step": 56
},
{
"epoch": 0.03958333333333333,
"grad_norm": 0.015049392357468605,
"learning_rate": 3.999144175652428e-05,
"loss": 1.0323,
"step": 57
},
{
"epoch": 0.04027777777777778,
"grad_norm": 0.017379263415932655,
"learning_rate": 3.999007457628976e-05,
"loss": 1.0391,
"step": 58
},
{
"epoch": 0.04097222222222222,
"grad_norm": 0.01676369458436966,
"learning_rate": 3.998860615815246e-05,
"loss": 1.0637,
"step": 59
},
{
"epoch": 0.041666666666666664,
"grad_norm": 0.020871426910161972,
"learning_rate": 3.9987036509549034e-05,
"loss": 1.0831,
"step": 60
},
{
"epoch": 0.04236111111111111,
"grad_norm": 0.01579645834863186,
"learning_rate": 3.998536563842884e-05,
"loss": 1.0439,
"step": 61
},
{
"epoch": 0.043055555555555555,
"grad_norm": 0.021290535107254982,
"learning_rate": 3.998359355325384e-05,
"loss": 1.0859,
"step": 62
},
{
"epoch": 0.04375,
"grad_norm": 0.016678057610988617,
"learning_rate": 3.998172026299861e-05,
"loss": 1.0521,
"step": 63
},
{
"epoch": 0.044444444444444446,
"grad_norm": 0.016436951234936714,
"learning_rate": 3.997974577715026e-05,
"loss": 1.0156,
"step": 64
},
{
"epoch": 0.04513888888888889,
"grad_norm": 0.016367288306355476,
"learning_rate": 3.9977670105708377e-05,
"loss": 1.0365,
"step": 65
},
{
"epoch": 0.04583333333333333,
"grad_norm": 0.017032478004693985,
"learning_rate": 3.997549325918501e-05,
"loss": 1.068,
"step": 66
},
{
"epoch": 0.04652777777777778,
"grad_norm": 0.015650460496544838,
"learning_rate": 3.997321524860461e-05,
"loss": 1.0361,
"step": 67
},
{
"epoch": 0.04722222222222222,
"grad_norm": 0.014867709018290043,
"learning_rate": 3.997083608550395e-05,
"loss": 1.0,
"step": 68
},
{
"epoch": 0.04791666666666667,
"grad_norm": 0.015197121538221836,
"learning_rate": 3.996835578193208e-05,
"loss": 1.0488,
"step": 69
},
{
"epoch": 0.04861111111111111,
"grad_norm": 0.015954626724123955,
"learning_rate": 3.996577435045027e-05,
"loss": 1.0356,
"step": 70
},
{
"epoch": 0.049305555555555554,
"grad_norm": 0.01822635903954506,
"learning_rate": 3.996309180413195e-05,
"loss": 1.0493,
"step": 71
},
{
"epoch": 0.05,
"grad_norm": 0.014613306149840355,
"learning_rate": 3.996030815656262e-05,
"loss": 0.9882,
"step": 72
},
{
"epoch": 0.050694444444444445,
"grad_norm": 0.015572323463857174,
"learning_rate": 3.995742342183982e-05,
"loss": 1.0162,
"step": 73
},
{
"epoch": 0.05138888888888889,
"grad_norm": 0.01604314148426056,
"learning_rate": 3.9954437614573015e-05,
"loss": 1.0424,
"step": 74
},
{
"epoch": 0.052083333333333336,
"grad_norm": 0.015765074640512466,
"learning_rate": 3.9951350749883555e-05,
"loss": 1.0607,
"step": 75
},
{
"epoch": 0.05277777777777778,
"grad_norm": 0.014984416775405407,
"learning_rate": 3.994816284340459e-05,
"loss": 1.0443,
"step": 76
},
{
"epoch": 0.05347222222222222,
"grad_norm": 0.016170239076018333,
"learning_rate": 3.9944873911280976e-05,
"loss": 1.0275,
"step": 77
},
{
"epoch": 0.05416666666666667,
"grad_norm": 0.014334661886096,
"learning_rate": 3.99414839701692e-05,
"loss": 1.0175,
"step": 78
},
{
"epoch": 0.05486111111111111,
"grad_norm": 0.014810354448854923,
"learning_rate": 3.993799303723733e-05,
"loss": 1.0015,
"step": 79
},
{
"epoch": 0.05555555555555555,
"grad_norm": 0.014517512172460556,
"learning_rate": 3.993440113016485e-05,
"loss": 1.0137,
"step": 80
},
{
"epoch": 0.05625,
"grad_norm": 0.01477818377315998,
"learning_rate": 3.993070826714267e-05,
"loss": 0.9852,
"step": 81
},
{
"epoch": 0.05694444444444444,
"grad_norm": 0.014672035351395607,
"learning_rate": 3.9926914466872936e-05,
"loss": 1.0099,
"step": 82
},
{
"epoch": 0.05763888888888889,
"grad_norm": 0.015323956497013569,
"learning_rate": 3.9923019748569015e-05,
"loss": 1.0345,
"step": 83
},
{
"epoch": 0.058333333333333334,
"grad_norm": 0.014713946729898453,
"learning_rate": 3.991902413195535e-05,
"loss": 1.0318,
"step": 84
},
{
"epoch": 0.059027777777777776,
"grad_norm": 0.01328709814697504,
"learning_rate": 3.9914927637267366e-05,
"loss": 1.0084,
"step": 85
},
{
"epoch": 0.059722222222222225,
"grad_norm": 0.017166944220662117,
"learning_rate": 3.991073028525139e-05,
"loss": 1.0036,
"step": 86
},
{
"epoch": 0.06041666666666667,
"grad_norm": 0.016073813661932945,
"learning_rate": 3.990643209716454e-05,
"loss": 1.0329,
"step": 87
},
{
"epoch": 0.06111111111111111,
"grad_norm": 0.01435924507677555,
"learning_rate": 3.990203309477457e-05,
"loss": 1.0535,
"step": 88
},
{
"epoch": 0.06180555555555556,
"grad_norm": 0.014585713855922222,
"learning_rate": 3.989753330035985e-05,
"loss": 1.0016,
"step": 89
},
{
"epoch": 0.0625,
"grad_norm": 0.014899005182087421,
"learning_rate": 3.989293273670916e-05,
"loss": 1.0386,
"step": 90
},
{
"epoch": 0.06319444444444444,
"grad_norm": 0.015017863363027573,
"learning_rate": 3.988823142712165e-05,
"loss": 1.0217,
"step": 91
},
{
"epoch": 0.06388888888888888,
"grad_norm": 0.014558105729520321,
"learning_rate": 3.9883429395406666e-05,
"loss": 1.0046,
"step": 92
},
{
"epoch": 0.06458333333333334,
"grad_norm": 0.014639356173574924,
"learning_rate": 3.987852666588364e-05,
"loss": 1.0226,
"step": 93
},
{
"epoch": 0.06527777777777778,
"grad_norm": 0.015396458096802235,
"learning_rate": 3.9873523263382015e-05,
"loss": 1.0374,
"step": 94
},
{
"epoch": 0.06597222222222222,
"grad_norm": 0.014208097010850906,
"learning_rate": 3.9868419213241064e-05,
"loss": 1.0058,
"step": 95
},
{
"epoch": 0.06666666666666667,
"grad_norm": 0.013101368211209774,
"learning_rate": 3.986321454130978e-05,
"loss": 0.9642,
"step": 96
},
{
"epoch": 0.06736111111111111,
"grad_norm": 0.014994910918176174,
"learning_rate": 3.9857909273946747e-05,
"loss": 0.995,
"step": 97
},
{
"epoch": 0.06805555555555555,
"grad_norm": 0.01322688814252615,
"learning_rate": 3.985250343802e-05,
"loss": 1.0124,
"step": 98
},
{
"epoch": 0.06875,
"grad_norm": 0.014946015551686287,
"learning_rate": 3.984699706090691e-05,
"loss": 0.9908,
"step": 99
},
{
"epoch": 0.06944444444444445,
"grad_norm": 0.014106931164860725,
"learning_rate": 3.9841390170494024e-05,
"loss": 0.9985,
"step": 100
},
{
"epoch": 0.07013888888888889,
"grad_norm": 0.014384300448000431,
"learning_rate": 3.9835682795176905e-05,
"loss": 0.9727,
"step": 101
},
{
"epoch": 0.07083333333333333,
"grad_norm": 0.014217739924788475,
"learning_rate": 3.982987496386004e-05,
"loss": 0.9839,
"step": 102
},
{
"epoch": 0.07152777777777777,
"grad_norm": 0.015070730820298195,
"learning_rate": 3.982396670595668e-05,
"loss": 1.0052,
"step": 103
},
{
"epoch": 0.07222222222222222,
"grad_norm": 0.01753557287156582,
"learning_rate": 3.9817958051388624e-05,
"loss": 1.0035,
"step": 104
},
{
"epoch": 0.07291666666666667,
"grad_norm": 0.015090793371200562,
"learning_rate": 3.981184903058618e-05,
"loss": 1.0413,
"step": 105
},
{
"epoch": 0.07361111111111111,
"grad_norm": 0.017897464334964752,
"learning_rate": 3.980563967448791e-05,
"loss": 0.9938,
"step": 106
},
{
"epoch": 0.07430555555555556,
"grad_norm": 0.015342462807893753,
"learning_rate": 3.979933001454053e-05,
"loss": 1.003,
"step": 107
},
{
"epoch": 0.075,
"grad_norm": 0.01586705632507801,
"learning_rate": 3.979292008269874e-05,
"loss": 1.0052,
"step": 108
},
{
"epoch": 0.07569444444444444,
"grad_norm": 0.013492800295352936,
"learning_rate": 3.978640991142505e-05,
"loss": 1.0415,
"step": 109
},
{
"epoch": 0.0763888888888889,
"grad_norm": 0.017706014215946198,
"learning_rate": 3.9779799533689634e-05,
"loss": 0.9927,
"step": 110
},
{
"epoch": 0.07708333333333334,
"grad_norm": 0.015358942560851574,
"learning_rate": 3.9773088982970135e-05,
"loss": 1.0201,
"step": 111
},
{
"epoch": 0.07777777777777778,
"grad_norm": 0.01901993341743946,
"learning_rate": 3.9766278293251526e-05,
"loss": 1.0191,
"step": 112
},
{
"epoch": 0.07847222222222222,
"grad_norm": 0.015617494471371174,
"learning_rate": 3.9759367499025924e-05,
"loss": 1.0256,
"step": 113
},
{
"epoch": 0.07916666666666666,
"grad_norm": 0.014908060431480408,
"learning_rate": 3.9752356635292405e-05,
"loss": 0.9898,
"step": 114
},
{
"epoch": 0.0798611111111111,
"grad_norm": 0.013952597044408321,
"learning_rate": 3.974524573755686e-05,
"loss": 0.9862,
"step": 115
},
{
"epoch": 0.08055555555555556,
"grad_norm": 0.013434095308184624,
"learning_rate": 3.9738034841831776e-05,
"loss": 0.9593,
"step": 116
},
{
"epoch": 0.08125,
"grad_norm": 0.014166664332151413,
"learning_rate": 3.9730723984636064e-05,
"loss": 1.0377,
"step": 117
},
{
"epoch": 0.08194444444444444,
"grad_norm": 0.014752853661775589,
"learning_rate": 3.9723313202994904e-05,
"loss": 1.0139,
"step": 118
},
{
"epoch": 0.08263888888888889,
"grad_norm": 0.014744486659765244,
"learning_rate": 3.971580253443951e-05,
"loss": 1.012,
"step": 119
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.01442610565572977,
"learning_rate": 3.970819201700699e-05,
"loss": 1.0334,
"step": 120
},
{
"epoch": 0.08402777777777778,
"grad_norm": 0.015393883921205997,
"learning_rate": 3.970048168924009e-05,
"loss": 0.9252,
"step": 121
},
{
"epoch": 0.08472222222222223,
"grad_norm": 0.01524246297776699,
"learning_rate": 3.9692671590187093e-05,
"loss": 1.0209,
"step": 122
},
{
"epoch": 0.08541666666666667,
"grad_norm": 0.014700529165565968,
"learning_rate": 3.96847617594015e-05,
"loss": 0.9943,
"step": 123
},
{
"epoch": 0.08611111111111111,
"grad_norm": 0.014305293560028076,
"learning_rate": 3.967675223694193e-05,
"loss": 0.966,
"step": 124
},
{
"epoch": 0.08680555555555555,
"grad_norm": 0.017354033887386322,
"learning_rate": 3.966864306337189e-05,
"loss": 0.9938,
"step": 125
},
{
"epoch": 0.0875,
"grad_norm": 0.014640097506344318,
"learning_rate": 3.9660434279759536e-05,
"loss": 1.0317,
"step": 126
},
{
"epoch": 0.08819444444444445,
"grad_norm": 0.014592519961297512,
"learning_rate": 3.965212592767751e-05,
"loss": 1.002,
"step": 127
},
{
"epoch": 0.08888888888888889,
"grad_norm": 0.01544480212032795,
"learning_rate": 3.964371804920269e-05,
"loss": 0.9818,
"step": 128
},
{
"epoch": 0.08958333333333333,
"grad_norm": 0.016679253429174423,
"learning_rate": 3.9635210686916016e-05,
"loss": 0.9963,
"step": 129
},
{
"epoch": 0.09027777777777778,
"grad_norm": 0.015196239575743675,
"learning_rate": 3.962660388390224e-05,
"loss": 1.017,
"step": 130
},
{
"epoch": 0.09097222222222222,
"grad_norm": 0.01522803958505392,
"learning_rate": 3.9617897683749726e-05,
"loss": 1.0141,
"step": 131
},
{
"epoch": 0.09166666666666666,
"grad_norm": 0.014725332148373127,
"learning_rate": 3.960909213055023e-05,
"loss": 1.0029,
"step": 132
},
{
"epoch": 0.09236111111111112,
"grad_norm": 0.015328467823565006,
"learning_rate": 3.960018726889868e-05,
"loss": 1.0228,
"step": 133
},
{
"epoch": 0.09305555555555556,
"grad_norm": 0.014145839028060436,
"learning_rate": 3.959118314389291e-05,
"loss": 0.9709,
"step": 134
},
{
"epoch": 0.09375,
"grad_norm": 0.015266870148479939,
"learning_rate": 3.958207980113351e-05,
"loss": 0.9938,
"step": 135
},
{
"epoch": 0.09444444444444444,
"grad_norm": 0.014660796150565147,
"learning_rate": 3.957287728672352e-05,
"loss": 0.9958,
"step": 136
},
{
"epoch": 0.09513888888888888,
"grad_norm": 0.015213273465633392,
"learning_rate": 3.956357564726822e-05,
"loss": 1.011,
"step": 137
},
{
"epoch": 0.09583333333333334,
"grad_norm": 0.015523552894592285,
"learning_rate": 3.955417492987493e-05,
"loss": 1.0297,
"step": 138
},
{
"epoch": 0.09652777777777778,
"grad_norm": 0.013940747827291489,
"learning_rate": 3.95446751821527e-05,
"loss": 1.0099,
"step": 139
},
{
"epoch": 0.09722222222222222,
"grad_norm": 0.015171276405453682,
"learning_rate": 3.9535076452212156e-05,
"loss": 0.979,
"step": 140
},
{
"epoch": 0.09791666666666667,
"grad_norm": 0.014734701253473759,
"learning_rate": 3.952537878866517e-05,
"loss": 1.0203,
"step": 141
},
{
"epoch": 0.09861111111111111,
"grad_norm": 0.014895283617079258,
"learning_rate": 3.951558224062469e-05,
"loss": 0.9803,
"step": 142
},
{
"epoch": 0.09930555555555555,
"grad_norm": 0.015903517603874207,
"learning_rate": 3.9505686857704416e-05,
"loss": 1.0109,
"step": 143
},
{
"epoch": 0.1,
"grad_norm": 0.018048502504825592,
"learning_rate": 3.949569269001861e-05,
"loss": 1.0263,
"step": 144
},
{
"epoch": 0.10069444444444445,
"grad_norm": 0.014940551482141018,
"learning_rate": 3.948559978818184e-05,
"loss": 1.0167,
"step": 145
},
{
"epoch": 0.10138888888888889,
"grad_norm": 0.018485594540834427,
"learning_rate": 3.947540820330867e-05,
"loss": 1.021,
"step": 146
},
{
"epoch": 0.10208333333333333,
"grad_norm": 0.015115809626877308,
"learning_rate": 3.9465117987013445e-05,
"loss": 1.0245,
"step": 147
},
{
"epoch": 0.10277777777777777,
"grad_norm": 0.016594298183918,
"learning_rate": 3.9454729191410025e-05,
"loss": 0.9967,
"step": 148
},
{
"epoch": 0.10347222222222222,
"grad_norm": 0.015151984058320522,
"learning_rate": 3.944424186911152e-05,
"loss": 0.9781,
"step": 149
},
{
"epoch": 0.10416666666666667,
"grad_norm": 0.016983836889266968,
"learning_rate": 3.943365607323001e-05,
"loss": 0.975,
"step": 150
},
{
"epoch": 0.10486111111111111,
"grad_norm": 0.015473946928977966,
"learning_rate": 3.9422971857376296e-05,
"loss": 1.0259,
"step": 151
},
{
"epoch": 0.10555555555555556,
"grad_norm": 0.014182898215949535,
"learning_rate": 3.941218927565959e-05,
"loss": 1.0058,
"step": 152
},
{
"epoch": 0.10625,
"grad_norm": 0.014923619106411934,
"learning_rate": 3.9401308382687324e-05,
"loss": 0.9821,
"step": 153
},
{
"epoch": 0.10694444444444444,
"grad_norm": 0.015497357584536076,
"learning_rate": 3.9390329233564755e-05,
"loss": 1.0599,
"step": 154
},
{
"epoch": 0.1076388888888889,
"grad_norm": 0.01431487686932087,
"learning_rate": 3.9379251883894795e-05,
"loss": 1.0193,
"step": 155
},
{
"epoch": 0.10833333333333334,
"grad_norm": 0.014660484157502651,
"learning_rate": 3.9368076389777655e-05,
"loss": 0.9527,
"step": 156
},
{
"epoch": 0.10902777777777778,
"grad_norm": 0.01643138937652111,
"learning_rate": 3.9356802807810613e-05,
"loss": 1.0134,
"step": 157
},
{
"epoch": 0.10972222222222222,
"grad_norm": 0.016061700880527496,
"learning_rate": 3.934543119508769e-05,
"loss": 1.0064,
"step": 158
},
{
"epoch": 0.11041666666666666,
"grad_norm": 0.01461365632712841,
"learning_rate": 3.933396160919938e-05,
"loss": 0.9953,
"step": 159
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.01574275828897953,
"learning_rate": 3.932239410823233e-05,
"loss": 1.0223,
"step": 160
},
{
"epoch": 0.11180555555555556,
"grad_norm": 0.015788385644555092,
"learning_rate": 3.931072875076912e-05,
"loss": 0.9739,
"step": 161
},
{
"epoch": 0.1125,
"grad_norm": 0.014799153432250023,
"learning_rate": 3.929896559588786e-05,
"loss": 1.0028,
"step": 162
},
{
"epoch": 0.11319444444444444,
"grad_norm": 0.013463880866765976,
"learning_rate": 3.9287104703162005e-05,
"loss": 0.9771,
"step": 163
},
{
"epoch": 0.11388888888888889,
"grad_norm": 0.014836735092103481,
"learning_rate": 3.927514613265992e-05,
"loss": 1.0159,
"step": 164
},
{
"epoch": 0.11458333333333333,
"grad_norm": 0.013634877279400826,
"learning_rate": 3.9263089944944715e-05,
"loss": 1.0431,
"step": 165
},
{
"epoch": 0.11527777777777778,
"grad_norm": 0.013801947236061096,
"learning_rate": 3.925093620107384e-05,
"loss": 0.9892,
"step": 166
},
{
"epoch": 0.11597222222222223,
"grad_norm": 0.015041469596326351,
"learning_rate": 3.923868496259882e-05,
"loss": 1.018,
"step": 167
},
{
"epoch": 0.11666666666666667,
"grad_norm": 0.013934548944234848,
"learning_rate": 3.9226336291564935e-05,
"loss": 0.9945,
"step": 168
},
{
"epoch": 0.11736111111111111,
"grad_norm": 0.014555184170603752,
"learning_rate": 3.92138902505109e-05,
"loss": 1.0101,
"step": 169
},
{
"epoch": 0.11805555555555555,
"grad_norm": 0.014088758267462254,
"learning_rate": 3.920134690246854e-05,
"loss": 0.9859,
"step": 170
},
{
"epoch": 0.11875,
"grad_norm": 0.013208562508225441,
"learning_rate": 3.91887063109625e-05,
"loss": 0.9752,
"step": 171
},
{
"epoch": 0.11944444444444445,
"grad_norm": 0.015783201903104782,
"learning_rate": 3.91759685400099e-05,
"loss": 1.0432,
"step": 172
},
{
"epoch": 0.12013888888888889,
"grad_norm": 0.014122221618890762,
"learning_rate": 3.916313365412002e-05,
"loss": 1.0079,
"step": 173
},
{
"epoch": 0.12083333333333333,
"grad_norm": 0.014722524210810661,
"learning_rate": 3.915020171829395e-05,
"loss": 1.0195,
"step": 174
},
{
"epoch": 0.12152777777777778,
"grad_norm": 0.016117624938488007,
"learning_rate": 3.9137172798024304e-05,
"loss": 0.9712,
"step": 175
},
{
"epoch": 0.12222222222222222,
"grad_norm": 0.015223621390759945,
"learning_rate": 3.912404695929486e-05,
"loss": 1.0116,
"step": 176
},
{
"epoch": 0.12291666666666666,
"grad_norm": 0.013804764486849308,
"learning_rate": 3.9110824268580206e-05,
"loss": 0.929,
"step": 177
},
{
"epoch": 0.12361111111111112,
"grad_norm": 0.014369015581905842,
"learning_rate": 3.909750479284548e-05,
"loss": 0.983,
"step": 178
},
{
"epoch": 0.12430555555555556,
"grad_norm": 0.015310019254684448,
"learning_rate": 3.908408859954593e-05,
"loss": 0.9962,
"step": 179
},
{
"epoch": 0.125,
"grad_norm": 0.015282726846635342,
"learning_rate": 3.907057575662663e-05,
"loss": 1.0367,
"step": 180
},
{
"epoch": 0.12569444444444444,
"grad_norm": 0.013963599689304829,
"learning_rate": 3.905696633252216e-05,
"loss": 0.9506,
"step": 181
},
{
"epoch": 0.12638888888888888,
"grad_norm": 0.015924159437417984,
"learning_rate": 3.904326039615618e-05,
"loss": 0.9647,
"step": 182
},
{
"epoch": 0.12708333333333333,
"grad_norm": 0.01227070577442646,
"learning_rate": 3.902945801694117e-05,
"loss": 0.9613,
"step": 183
},
{
"epoch": 0.12777777777777777,
"grad_norm": 0.014005225151777267,
"learning_rate": 3.9015559264777994e-05,
"loss": 0.9774,
"step": 184
},
{
"epoch": 0.1284722222222222,
"grad_norm": 0.014159591868519783,
"learning_rate": 3.9001564210055624e-05,
"loss": 0.9962,
"step": 185
},
{
"epoch": 0.12916666666666668,
"grad_norm": 0.013430262915790081,
"learning_rate": 3.898747292365073e-05,
"loss": 0.9928,
"step": 186
},
{
"epoch": 0.12986111111111112,
"grad_norm": 0.012756455689668655,
"learning_rate": 3.897328547692735e-05,
"loss": 0.9652,
"step": 187
},
{
"epoch": 0.13055555555555556,
"grad_norm": 0.015474308282136917,
"learning_rate": 3.89590019417365e-05,
"loss": 1.0134,
"step": 188
},
{
"epoch": 0.13125,
"grad_norm": 0.013962026685476303,
"learning_rate": 3.8944622390415835e-05,
"loss": 1.0008,
"step": 189
},
{
"epoch": 0.13194444444444445,
"grad_norm": 0.013678076677024364,
"learning_rate": 3.893014689578928e-05,
"loss": 0.9865,
"step": 190
},
{
"epoch": 0.1326388888888889,
"grad_norm": 0.01606924645602703,
"learning_rate": 3.891557553116665e-05,
"loss": 1.0097,
"step": 191
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.014233720488846302,
"learning_rate": 3.89009083703433e-05,
"loss": 1.0165,
"step": 192
},
{
"epoch": 0.13402777777777777,
"grad_norm": 0.019174130633473396,
"learning_rate": 3.888614548759971e-05,
"loss": 1.0029,
"step": 193
},
{
"epoch": 0.13472222222222222,
"grad_norm": 0.014648901298642159,
"learning_rate": 3.8871286957701146e-05,
"loss": 0.972,
"step": 194
},
{
"epoch": 0.13541666666666666,
"grad_norm": 0.015734290704131126,
"learning_rate": 3.8856332855897286e-05,
"loss": 0.9562,
"step": 195
},
{
"epoch": 0.1361111111111111,
"grad_norm": 0.014715912751853466,
"learning_rate": 3.8841283257921794e-05,
"loss": 0.9801,
"step": 196
},
{
"epoch": 0.13680555555555557,
"grad_norm": 0.0156076829880476,
"learning_rate": 3.882613823999199e-05,
"loss": 1.0438,
"step": 197
},
{
"epoch": 0.1375,
"grad_norm": 0.014719419181346893,
"learning_rate": 3.881089787880843e-05,
"loss": 1.0208,
"step": 198
},
{
"epoch": 0.13819444444444445,
"grad_norm": 0.015471681021153927,
"learning_rate": 3.879556225155453e-05,
"loss": 0.9728,
"step": 199
},
{
"epoch": 0.1388888888888889,
"grad_norm": 0.013889987021684647,
"learning_rate": 3.878013143589617e-05,
"loss": 0.9525,
"step": 200
},
{
"epoch": 0.13958333333333334,
"grad_norm": 0.014045029878616333,
"learning_rate": 3.87646055099813e-05,
"loss": 0.9568,
"step": 201
},
{
"epoch": 0.14027777777777778,
"grad_norm": 0.015043736435472965,
"learning_rate": 3.874898455243955e-05,
"loss": 0.9852,
"step": 202
},
{
"epoch": 0.14097222222222222,
"grad_norm": 0.014233953319489956,
"learning_rate": 3.873326864238183e-05,
"loss": 1.0112,
"step": 203
},
{
"epoch": 0.14166666666666666,
"grad_norm": 0.014034410007297993,
"learning_rate": 3.871745785939993e-05,
"loss": 0.959,
"step": 204
},
{
"epoch": 0.1423611111111111,
"grad_norm": 0.013695859350264072,
"learning_rate": 3.870155228356611e-05,
"loss": 0.9657,
"step": 205
},
{
"epoch": 0.14305555555555555,
"grad_norm": 0.01533182617276907,
"learning_rate": 3.8685551995432713e-05,
"loss": 1.0051,
"step": 206
},
{
"epoch": 0.14375,
"grad_norm": 0.014852087013423443,
"learning_rate": 3.866945707603172e-05,
"loss": 1.0207,
"step": 207
},
{
"epoch": 0.14444444444444443,
"grad_norm": 0.014271554537117481,
"learning_rate": 3.8653267606874395e-05,
"loss": 1.0129,
"step": 208
},
{
"epoch": 0.1451388888888889,
"grad_norm": 0.013892865739762783,
"learning_rate": 3.863698366995079e-05,
"loss": 1.0039,
"step": 209
},
{
"epoch": 0.14583333333333334,
"grad_norm": 0.01436774991452694,
"learning_rate": 3.862060534772945e-05,
"loss": 0.9811,
"step": 210
},
{
"epoch": 0.14652777777777778,
"grad_norm": 0.014014553278684616,
"learning_rate": 3.8604132723156865e-05,
"loss": 0.9575,
"step": 211
},
{
"epoch": 0.14722222222222223,
"grad_norm": 0.012928716838359833,
"learning_rate": 3.858756587965714e-05,
"loss": 0.9725,
"step": 212
},
{
"epoch": 0.14791666666666667,
"grad_norm": 0.014141298830509186,
"learning_rate": 3.8570904901131544e-05,
"loss": 0.9553,
"step": 213
},
{
"epoch": 0.1486111111111111,
"grad_norm": 0.014526846818625927,
"learning_rate": 3.855414987195807e-05,
"loss": 0.9812,
"step": 214
},
{
"epoch": 0.14930555555555555,
"grad_norm": 0.012949762865900993,
"learning_rate": 3.853730087699103e-05,
"loss": 0.9676,
"step": 215
},
{
"epoch": 0.15,
"grad_norm": 0.013820677995681763,
"learning_rate": 3.852035800156062e-05,
"loss": 1.0086,
"step": 216
},
{
"epoch": 0.15069444444444444,
"grad_norm": 0.013385163620114326,
"learning_rate": 3.850332133147248e-05,
"loss": 0.9818,
"step": 217
},
{
"epoch": 0.15138888888888888,
"grad_norm": 0.012220976874232292,
"learning_rate": 3.848619095300726e-05,
"loss": 0.96,
"step": 218
},
{
"epoch": 0.15208333333333332,
"grad_norm": 0.01417449489235878,
"learning_rate": 3.8468966952920205e-05,
"loss": 1.0152,
"step": 219
},
{
"epoch": 0.1527777777777778,
"grad_norm": 0.013405581004917622,
"learning_rate": 3.845164941844068e-05,
"loss": 1.0077,
"step": 220
},
{
"epoch": 0.15347222222222223,
"grad_norm": 0.01379991602152586,
"learning_rate": 3.843423843727176e-05,
"loss": 0.9917,
"step": 221
},
{
"epoch": 0.15416666666666667,
"grad_norm": 0.014063477516174316,
"learning_rate": 3.8416734097589756e-05,
"loss": 0.9679,
"step": 222
},
{
"epoch": 0.15486111111111112,
"grad_norm": 0.012324227020144463,
"learning_rate": 3.83991364880438e-05,
"loss": 0.9574,
"step": 223
},
{
"epoch": 0.15555555555555556,
"grad_norm": 0.01431061141192913,
"learning_rate": 3.8381445697755365e-05,
"loss": 0.9967,
"step": 224
},
{
"epoch": 0.15625,
"grad_norm": 0.014298739843070507,
"learning_rate": 3.836366181631785e-05,
"loss": 0.9815,
"step": 225
},
{
"epoch": 0.15694444444444444,
"grad_norm": 0.012113712728023529,
"learning_rate": 3.8345784933796095e-05,
"loss": 0.9337,
"step": 226
},
{
"epoch": 0.15763888888888888,
"grad_norm": 0.015284246765077114,
"learning_rate": 3.832781514072593e-05,
"loss": 1.0051,
"step": 227
},
{
"epoch": 0.15833333333333333,
"grad_norm": 0.012948422692716122,
"learning_rate": 3.8309752528113725e-05,
"loss": 0.9554,
"step": 228
},
{
"epoch": 0.15902777777777777,
"grad_norm": 0.014634348452091217,
"learning_rate": 3.8291597187435926e-05,
"loss": 0.9814,
"step": 229
},
{
"epoch": 0.1597222222222222,
"grad_norm": 0.014931918121874332,
"learning_rate": 3.827334921063861e-05,
"loss": 1.0053,
"step": 230
},
{
"epoch": 0.16041666666666668,
"grad_norm": 0.012998196296393871,
"learning_rate": 3.825500869013697e-05,
"loss": 0.9781,
"step": 231
},
{
"epoch": 0.16111111111111112,
"grad_norm": 0.015014944598078728,
"learning_rate": 3.82365757188149e-05,
"loss": 1.014,
"step": 232
},
{
"epoch": 0.16180555555555556,
"grad_norm": 0.015584563836455345,
"learning_rate": 3.821805039002449e-05,
"loss": 0.9788,
"step": 233
},
{
"epoch": 0.1625,
"grad_norm": 0.013651632703840733,
"learning_rate": 3.8199432797585576e-05,
"loss": 0.9905,
"step": 234
},
{
"epoch": 0.16319444444444445,
"grad_norm": 0.015139803290367126,
"learning_rate": 3.8180723035785246e-05,
"loss": 1.0362,
"step": 235
},
{
"epoch": 0.1638888888888889,
"grad_norm": 0.014169261790812016,
"learning_rate": 3.816192119937738e-05,
"loss": 0.9688,
"step": 236
},
{
"epoch": 0.16458333333333333,
"grad_norm": 0.013432355597615242,
"learning_rate": 3.814302738358216e-05,
"loss": 0.955,
"step": 237
},
{
"epoch": 0.16527777777777777,
"grad_norm": 0.012756401672959328,
"learning_rate": 3.812404168408558e-05,
"loss": 0.9817,
"step": 238
},
{
"epoch": 0.16597222222222222,
"grad_norm": 0.012755703181028366,
"learning_rate": 3.810496419703898e-05,
"loss": 0.9785,
"step": 239
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.014174016192555428,
"learning_rate": 3.808579501905856e-05,
"loss": 0.9704,
"step": 240
},
{
"epoch": 0.1673611111111111,
"grad_norm": 0.013185903429985046,
"learning_rate": 3.806653424722484e-05,
"loss": 0.9805,
"step": 241
},
{
"epoch": 0.16805555555555557,
"grad_norm": 0.014006533659994602,
"learning_rate": 3.804718197908225e-05,
"loss": 0.9846,
"step": 242
},
{
"epoch": 0.16875,
"grad_norm": 0.01497966144233942,
"learning_rate": 3.802773831263859e-05,
"loss": 0.9697,
"step": 243
},
{
"epoch": 0.16944444444444445,
"grad_norm": 0.0131217110902071,
"learning_rate": 3.800820334636452e-05,
"loss": 1.0037,
"step": 244
},
{
"epoch": 0.1701388888888889,
"grad_norm": 0.012915054336190224,
"learning_rate": 3.798857717919308e-05,
"loss": 0.9489,
"step": 245
},
{
"epoch": 0.17083333333333334,
"grad_norm": 0.013334513641893864,
"learning_rate": 3.7968859910519215e-05,
"loss": 0.9891,
"step": 246
},
{
"epoch": 0.17152777777777778,
"grad_norm": 0.014228935353457928,
"learning_rate": 3.7949051640199216e-05,
"loss": 1.0013,
"step": 247
},
{
"epoch": 0.17222222222222222,
"grad_norm": 0.013772614300251007,
"learning_rate": 3.7929152468550245e-05,
"loss": 1.0042,
"step": 248
},
{
"epoch": 0.17291666666666666,
"grad_norm": 0.01312232669442892,
"learning_rate": 3.790916249634986e-05,
"loss": 0.9374,
"step": 249
},
{
"epoch": 0.1736111111111111,
"grad_norm": 0.014259965158998966,
"learning_rate": 3.788908182483542e-05,
"loss": 0.9432,
"step": 250
},
{
"epoch": 0.17430555555555555,
"grad_norm": 0.014950071461498737,
"learning_rate": 3.786891055570367e-05,
"loss": 0.9674,
"step": 251
},
{
"epoch": 0.175,
"grad_norm": 0.014557240530848503,
"learning_rate": 3.784864879111013e-05,
"loss": 0.9811,
"step": 252
},
{
"epoch": 0.17569444444444443,
"grad_norm": 0.011923140846192837,
"learning_rate": 3.782829663366868e-05,
"loss": 0.9773,
"step": 253
},
{
"epoch": 0.1763888888888889,
"grad_norm": 0.012792705558240414,
"learning_rate": 3.780785418645095e-05,
"loss": 0.9635,
"step": 254
},
{
"epoch": 0.17708333333333334,
"grad_norm": 0.01456011924892664,
"learning_rate": 3.7787321552985826e-05,
"loss": 0.9271,
"step": 255
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.014588495716452599,
"learning_rate": 3.776669883725897e-05,
"loss": 0.9818,
"step": 256
},
{
"epoch": 0.17847222222222223,
"grad_norm": 0.013889756053686142,
"learning_rate": 3.774598614371223e-05,
"loss": 0.9894,
"step": 257
},
{
"epoch": 0.17916666666666667,
"grad_norm": 0.014536023139953613,
"learning_rate": 3.7725183577243144e-05,
"loss": 0.9675,
"step": 258
},
{
"epoch": 0.1798611111111111,
"grad_norm": 0.01632644794881344,
"learning_rate": 3.7704291243204416e-05,
"loss": 1.0065,
"step": 259
},
{
"epoch": 0.18055555555555555,
"grad_norm": 0.013067888095974922,
"learning_rate": 3.7683309247403345e-05,
"loss": 1.0002,
"step": 260
},
{
"epoch": 0.18125,
"grad_norm": 0.01566268689930439,
"learning_rate": 3.766223769610134e-05,
"loss": 0.9552,
"step": 261
},
{
"epoch": 0.18194444444444444,
"grad_norm": 0.01552093680948019,
"learning_rate": 3.764107669601333e-05,
"loss": 1.0219,
"step": 262
},
{
"epoch": 0.18263888888888888,
"grad_norm": 0.013710713014006615,
"learning_rate": 3.761982635430729e-05,
"loss": 0.9559,
"step": 263
},
{
"epoch": 0.18333333333333332,
"grad_norm": 0.013827464543282986,
"learning_rate": 3.759848677860359e-05,
"loss": 0.9741,
"step": 264
},
{
"epoch": 0.1840277777777778,
"grad_norm": 0.01393877249211073,
"learning_rate": 3.7577058076974595e-05,
"loss": 0.9841,
"step": 265
},
{
"epoch": 0.18472222222222223,
"grad_norm": 0.014325949363410473,
"learning_rate": 3.755554035794398e-05,
"loss": 1.0068,
"step": 266
},
{
"epoch": 0.18541666666666667,
"grad_norm": 0.013421372510492802,
"learning_rate": 3.7533933730486276e-05,
"loss": 0.9409,
"step": 267
},
{
"epoch": 0.18611111111111112,
"grad_norm": 0.015207280404865742,
"learning_rate": 3.751223830402627e-05,
"loss": 1.0078,
"step": 268
},
{
"epoch": 0.18680555555555556,
"grad_norm": 0.015149595215916634,
"learning_rate": 3.749045418843844e-05,
"loss": 0.9989,
"step": 269
},
{
"epoch": 0.1875,
"grad_norm": 0.012723335064947605,
"learning_rate": 3.746858149404648e-05,
"loss": 0.9656,
"step": 270
},
{
"epoch": 0.18819444444444444,
"grad_norm": 0.01434493437409401,
"learning_rate": 3.744662033162262e-05,
"loss": 0.9725,
"step": 271
},
{
"epoch": 0.18888888888888888,
"grad_norm": 0.01342215109616518,
"learning_rate": 3.742457081238717e-05,
"loss": 0.9555,
"step": 272
},
{
"epoch": 0.18958333333333333,
"grad_norm": 0.012995844706892967,
"learning_rate": 3.740243304800791e-05,
"loss": 0.9775,
"step": 273
},
{
"epoch": 0.19027777777777777,
"grad_norm": 0.01189506333321333,
"learning_rate": 3.738020715059951e-05,
"loss": 0.9443,
"step": 274
},
{
"epoch": 0.1909722222222222,
"grad_norm": 0.013390977866947651,
"learning_rate": 3.7357893232722984e-05,
"loss": 0.9675,
"step": 275
},
{
"epoch": 0.19166666666666668,
"grad_norm": 0.01335330493748188,
"learning_rate": 3.733549140738514e-05,
"loss": 0.9596,
"step": 276
},
{
"epoch": 0.19236111111111112,
"grad_norm": 0.012211819179356098,
"learning_rate": 3.731300178803797e-05,
"loss": 0.9413,
"step": 277
},
{
"epoch": 0.19305555555555556,
"grad_norm": 0.013362012803554535,
"learning_rate": 3.7290424488578094e-05,
"loss": 0.9622,
"step": 278
},
{
"epoch": 0.19375,
"grad_norm": 0.013488608412444592,
"learning_rate": 3.726775962334617e-05,
"loss": 1.0322,
"step": 279
},
{
"epoch": 0.19444444444444445,
"grad_norm": 0.013000058010220528,
"learning_rate": 3.7245007307126346e-05,
"loss": 0.9966,
"step": 280
},
{
"epoch": 0.1951388888888889,
"grad_norm": 0.014398259110748768,
"learning_rate": 3.7222167655145636e-05,
"loss": 0.9775,
"step": 281
},
{
"epoch": 0.19583333333333333,
"grad_norm": 0.012063449248671532,
"learning_rate": 3.7199240783073365e-05,
"loss": 0.9369,
"step": 282
},
{
"epoch": 0.19652777777777777,
"grad_norm": 0.012797530740499496,
"learning_rate": 3.717622680702058e-05,
"loss": 0.9591,
"step": 283
},
{
"epoch": 0.19722222222222222,
"grad_norm": 0.012506108731031418,
"learning_rate": 3.7153125843539455e-05,
"loss": 0.9781,
"step": 284
},
{
"epoch": 0.19791666666666666,
"grad_norm": 0.013660159893333912,
"learning_rate": 3.712993800962271e-05,
"loss": 0.9994,
"step": 285
},
{
"epoch": 0.1986111111111111,
"grad_norm": 0.013593221083283424,
"learning_rate": 3.7106663422703004e-05,
"loss": 0.9945,
"step": 286
},
{
"epoch": 0.19930555555555557,
"grad_norm": 0.013533521443605423,
"learning_rate": 3.708330220065235e-05,
"loss": 1.0033,
"step": 287
},
{
"epoch": 0.2,
"grad_norm": 0.015442555770277977,
"learning_rate": 3.7059854461781514e-05,
"loss": 0.9829,
"step": 288
},
{
"epoch": 0.20069444444444445,
"grad_norm": 0.014286418445408344,
"learning_rate": 3.7036320324839444e-05,
"loss": 0.9674,
"step": 289
},
{
"epoch": 0.2013888888888889,
"grad_norm": 0.013105183839797974,
"learning_rate": 3.70126999090126e-05,
"loss": 0.9562,
"step": 290
},
{
"epoch": 0.20208333333333334,
"grad_norm": 0.012748828157782555,
"learning_rate": 3.698899333392442e-05,
"loss": 0.9385,
"step": 291
},
{
"epoch": 0.20277777777777778,
"grad_norm": 0.013904962688684464,
"learning_rate": 3.696520071963469e-05,
"loss": 0.9769,
"step": 292
},
{
"epoch": 0.20347222222222222,
"grad_norm": 0.013301237486302853,
"learning_rate": 3.6941322186638924e-05,
"loss": 0.9582,
"step": 293
},
{
"epoch": 0.20416666666666666,
"grad_norm": 0.013818389736115932,
"learning_rate": 3.691735785586777e-05,
"loss": 0.9992,
"step": 294
},
{
"epoch": 0.2048611111111111,
"grad_norm": 0.018090086057782173,
"learning_rate": 3.6893307848686376e-05,
"loss": 1.0265,
"step": 295
},
{
"epoch": 0.20555555555555555,
"grad_norm": 0.011723886243999004,
"learning_rate": 3.686917228689382e-05,
"loss": 0.9447,
"step": 296
},
{
"epoch": 0.20625,
"grad_norm": 0.013388545252382755,
"learning_rate": 3.684495129272242e-05,
"loss": 0.9725,
"step": 297
},
{
"epoch": 0.20694444444444443,
"grad_norm": 0.013546598143875599,
"learning_rate": 3.682064498883721e-05,
"loss": 0.9867,
"step": 298
},
{
"epoch": 0.2076388888888889,
"grad_norm": 0.01278660912066698,
"learning_rate": 3.679625349833523e-05,
"loss": 0.9533,
"step": 299
},
{
"epoch": 0.20833333333333334,
"grad_norm": 0.013434414751827717,
"learning_rate": 3.677177694474495e-05,
"loss": 0.9587,
"step": 300
},
{
"epoch": 0.20902777777777778,
"grad_norm": 0.013443008065223694,
"learning_rate": 3.674721545202563e-05,
"loss": 0.9758,
"step": 301
},
{
"epoch": 0.20972222222222223,
"grad_norm": 0.012249941006302834,
"learning_rate": 3.672256914456671e-05,
"loss": 0.9391,
"step": 302
},
{
"epoch": 0.21041666666666667,
"grad_norm": 0.013929427601397038,
"learning_rate": 3.669783814718716e-05,
"loss": 0.966,
"step": 303
},
{
"epoch": 0.2111111111111111,
"grad_norm": 0.020106054842472076,
"learning_rate": 3.667302258513484e-05,
"loss": 0.9916,
"step": 304
},
{
"epoch": 0.21180555555555555,
"grad_norm": 0.017338188365101814,
"learning_rate": 3.6648122584085907e-05,
"loss": 0.9905,
"step": 305
},
{
"epoch": 0.2125,
"grad_norm": 0.016092827543616295,
"learning_rate": 3.662313827014412e-05,
"loss": 0.9422,
"step": 306
},
{
"epoch": 0.21319444444444444,
"grad_norm": 0.014228146523237228,
"learning_rate": 3.659806976984026e-05,
"loss": 1.0033,
"step": 307
},
{
"epoch": 0.21388888888888888,
"grad_norm": 0.015558873303234577,
"learning_rate": 3.657291721013146e-05,
"loss": 0.9775,
"step": 308
},
{
"epoch": 0.21458333333333332,
"grad_norm": 0.016239987686276436,
"learning_rate": 3.6547680718400546e-05,
"loss": 0.9863,
"step": 309
},
{
"epoch": 0.2152777777777778,
"grad_norm": 0.014311805367469788,
"learning_rate": 3.652236042245542e-05,
"loss": 1.0078,
"step": 310
},
{
"epoch": 0.21597222222222223,
"grad_norm": 0.014693599194288254,
"learning_rate": 3.64969564505284e-05,
"loss": 0.9743,
"step": 311
},
{
"epoch": 0.21666666666666667,
"grad_norm": 0.014121411368250847,
"learning_rate": 3.647146893127559e-05,
"loss": 0.9712,
"step": 312
},
{
"epoch": 0.21736111111111112,
"grad_norm": 0.013323220424354076,
"learning_rate": 3.644589799377618e-05,
"loss": 0.9912,
"step": 313
},
{
"epoch": 0.21805555555555556,
"grad_norm": 0.014195873402059078,
"learning_rate": 3.642024376753186e-05,
"loss": 0.9733,
"step": 314
},
{
"epoch": 0.21875,
"grad_norm": 0.01346920058131218,
"learning_rate": 3.639450638246611e-05,
"loss": 0.9562,
"step": 315
},
{
"epoch": 0.21944444444444444,
"grad_norm": 0.013232593424618244,
"learning_rate": 3.636868596892353e-05,
"loss": 0.9724,
"step": 316
},
{
"epoch": 0.22013888888888888,
"grad_norm": 0.01408706046640873,
"learning_rate": 3.634278265766928e-05,
"loss": 1.009,
"step": 317
},
{
"epoch": 0.22083333333333333,
"grad_norm": 0.013279229402542114,
"learning_rate": 3.631679657988829e-05,
"loss": 0.9757,
"step": 318
},
{
"epoch": 0.22152777777777777,
"grad_norm": 0.015104196034371853,
"learning_rate": 3.6290727867184676e-05,
"loss": 1.001,
"step": 319
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.012325095012784004,
"learning_rate": 3.6264576651581036e-05,
"loss": 0.9758,
"step": 320
},
{
"epoch": 0.22291666666666668,
"grad_norm": 0.013523522764444351,
"learning_rate": 3.623834306551782e-05,
"loss": 0.9482,
"step": 321
},
{
"epoch": 0.22361111111111112,
"grad_norm": 0.01278225053101778,
"learning_rate": 3.6212027241852625e-05,
"loss": 0.9631,
"step": 322
},
{
"epoch": 0.22430555555555556,
"grad_norm": 0.012586407363414764,
"learning_rate": 3.618562931385952e-05,
"loss": 1.0226,
"step": 323
},
{
"epoch": 0.225,
"grad_norm": 0.012502876110374928,
"learning_rate": 3.6159149415228403e-05,
"loss": 0.9229,
"step": 324
},
{
"epoch": 0.22569444444444445,
"grad_norm": 0.012781070545315742,
"learning_rate": 3.613258768006429e-05,
"loss": 0.9532,
"step": 325
},
{
"epoch": 0.2263888888888889,
"grad_norm": 0.013412564061582088,
"learning_rate": 3.6105944242886674e-05,
"loss": 1.0037,
"step": 326
},
{
"epoch": 0.22708333333333333,
"grad_norm": 0.013194269500672817,
"learning_rate": 3.607921923862878e-05,
"loss": 0.9241,
"step": 327
},
{
"epoch": 0.22777777777777777,
"grad_norm": 0.013584061525762081,
"learning_rate": 3.605241280263696e-05,
"loss": 0.9482,
"step": 328
},
{
"epoch": 0.22847222222222222,
"grad_norm": 0.013534624129533768,
"learning_rate": 3.6025525070669955e-05,
"loss": 0.9653,
"step": 329
},
{
"epoch": 0.22916666666666666,
"grad_norm": 0.015827780589461327,
"learning_rate": 3.599855617889822e-05,
"loss": 0.9511,
"step": 330
},
{
"epoch": 0.2298611111111111,
"grad_norm": 0.013115919195115566,
"learning_rate": 3.5971506263903234e-05,
"loss": 0.9516,
"step": 331
},
{
"epoch": 0.23055555555555557,
"grad_norm": 0.015311370603740215,
"learning_rate": 3.594437546267682e-05,
"loss": 0.9543,
"step": 332
},
{
"epoch": 0.23125,
"grad_norm": 0.014090127311646938,
"learning_rate": 3.591716391262044e-05,
"loss": 0.9914,
"step": 333
},
{
"epoch": 0.23194444444444445,
"grad_norm": 0.013163258321583271,
"learning_rate": 3.588987175154449e-05,
"loss": 0.9722,
"step": 334
},
{
"epoch": 0.2326388888888889,
"grad_norm": 0.012093394063413143,
"learning_rate": 3.586249911766763e-05,
"loss": 0.9758,
"step": 335
},
{
"epoch": 0.23333333333333334,
"grad_norm": 0.014174525626003742,
"learning_rate": 3.583504614961605e-05,
"loss": 0.9683,
"step": 336
},
{
"epoch": 0.23402777777777778,
"grad_norm": 0.014316610060632229,
"learning_rate": 3.58075129864228e-05,
"loss": 0.9814,
"step": 337
},
{
"epoch": 0.23472222222222222,
"grad_norm": 0.013283569365739822,
"learning_rate": 3.5779899767527064e-05,
"loss": 1.0012,
"step": 338
},
{
"epoch": 0.23541666666666666,
"grad_norm": 0.013545655645430088,
"learning_rate": 3.575220663277346e-05,
"loss": 0.9426,
"step": 339
},
{
"epoch": 0.2361111111111111,
"grad_norm": 0.01365567371249199,
"learning_rate": 3.572443372241134e-05,
"loss": 0.957,
"step": 340
},
{
"epoch": 0.23680555555555555,
"grad_norm": 0.012532095424830914,
"learning_rate": 3.569658117709406e-05,
"loss": 0.9439,
"step": 341
},
{
"epoch": 0.2375,
"grad_norm": 0.012797664850950241,
"learning_rate": 3.5668649137878275e-05,
"loss": 0.9805,
"step": 342
},
{
"epoch": 0.23819444444444443,
"grad_norm": 0.013005426153540611,
"learning_rate": 3.5640637746223253e-05,
"loss": 0.9515,
"step": 343
},
{
"epoch": 0.2388888888888889,
"grad_norm": 0.013586447574198246,
"learning_rate": 3.561254714399013e-05,
"loss": 0.9731,
"step": 344
},
{
"epoch": 0.23958333333333334,
"grad_norm": 0.013848516158759594,
"learning_rate": 3.5584377473441187e-05,
"loss": 0.9731,
"step": 345
},
{
"epoch": 0.24027777777777778,
"grad_norm": 0.014170484617352486,
"learning_rate": 3.5556128877239125e-05,
"loss": 0.9817,
"step": 346
},
{
"epoch": 0.24097222222222223,
"grad_norm": 0.014107048511505127,
"learning_rate": 3.552780149844639e-05,
"loss": 0.9733,
"step": 347
},
{
"epoch": 0.24166666666666667,
"grad_norm": 0.013129886239767075,
"learning_rate": 3.5499395480524394e-05,
"loss": 0.9803,
"step": 348
},
{
"epoch": 0.2423611111111111,
"grad_norm": 0.014618804678320885,
"learning_rate": 3.5470910967332815e-05,
"loss": 0.9941,
"step": 349
},
{
"epoch": 0.24305555555555555,
"grad_norm": 0.013775044120848179,
"learning_rate": 3.544234810312886e-05,
"loss": 0.9978,
"step": 350
},
{
"epoch": 0.24375,
"grad_norm": 0.013130244798958302,
"learning_rate": 3.541370703256654e-05,
"loss": 0.9978,
"step": 351
},
{
"epoch": 0.24444444444444444,
"grad_norm": 0.013368207029998302,
"learning_rate": 3.538498790069594e-05,
"loss": 0.9875,
"step": 352
},
{
"epoch": 0.24513888888888888,
"grad_norm": 0.014766732230782509,
"learning_rate": 3.5356190852962474e-05,
"loss": 0.9446,
"step": 353
},
{
"epoch": 0.24583333333333332,
"grad_norm": 0.012403284199535847,
"learning_rate": 3.5327316035206145e-05,
"loss": 0.9305,
"step": 354
},
{
"epoch": 0.2465277777777778,
"grad_norm": 0.013556853868067265,
"learning_rate": 3.5298363593660835e-05,
"loss": 1.0034,
"step": 355
},
{
"epoch": 0.24722222222222223,
"grad_norm": 0.013330447487533092,
"learning_rate": 3.526933367495353e-05,
"loss": 0.9721,
"step": 356
},
{
"epoch": 0.24791666666666667,
"grad_norm": 0.01214715838432312,
"learning_rate": 3.5240226426103596e-05,
"loss": 0.9462,
"step": 357
},
{
"epoch": 0.24861111111111112,
"grad_norm": 0.013268118724226952,
"learning_rate": 3.521104199452203e-05,
"loss": 0.9796,
"step": 358
},
{
"epoch": 0.24930555555555556,
"grad_norm": 0.012496302835643291,
"learning_rate": 3.5181780528010715e-05,
"loss": 0.9427,
"step": 359
},
{
"epoch": 0.25,
"grad_norm": 0.013434799388051033,
"learning_rate": 3.515244217476166e-05,
"loss": 0.9519,
"step": 360
},
{
"epoch": 0.25069444444444444,
"grad_norm": 0.012152577750384808,
"learning_rate": 3.5123027083356285e-05,
"loss": 0.9707,
"step": 361
},
{
"epoch": 0.2513888888888889,
"grad_norm": 0.013102750293910503,
"learning_rate": 3.509353540276462e-05,
"loss": 0.9594,
"step": 362
},
{
"epoch": 0.2520833333333333,
"grad_norm": 0.01402961928397417,
"learning_rate": 3.506396728234459e-05,
"loss": 0.9629,
"step": 363
},
{
"epoch": 0.25277777777777777,
"grad_norm": 0.013971925713121891,
"learning_rate": 3.503432287184121e-05,
"loss": 0.9559,
"step": 364
},
{
"epoch": 0.2534722222222222,
"grad_norm": 0.013591865077614784,
"learning_rate": 3.500460232138591e-05,
"loss": 0.9919,
"step": 365
},
{
"epoch": 0.25416666666666665,
"grad_norm": 0.013570788316428661,
"learning_rate": 3.497480578149569e-05,
"loss": 0.9537,
"step": 366
},
{
"epoch": 0.2548611111111111,
"grad_norm": 0.013885971158742905,
"learning_rate": 3.4944933403072387e-05,
"loss": 0.9678,
"step": 367
},
{
"epoch": 0.25555555555555554,
"grad_norm": 0.013096191920340061,
"learning_rate": 3.491498533740193e-05,
"loss": 0.9505,
"step": 368
},
{
"epoch": 0.25625,
"grad_norm": 0.013115983456373215,
"learning_rate": 3.488496173615358e-05,
"loss": 0.9467,
"step": 369
},
{
"epoch": 0.2569444444444444,
"grad_norm": 0.01301599945873022,
"learning_rate": 3.485486275137909e-05,
"loss": 0.9501,
"step": 370
},
{
"epoch": 0.25763888888888886,
"grad_norm": 0.012697902508080006,
"learning_rate": 3.482468853551202e-05,
"loss": 0.9749,
"step": 371
},
{
"epoch": 0.25833333333333336,
"grad_norm": 0.01309168990701437,
"learning_rate": 3.479443924136693e-05,
"loss": 0.9791,
"step": 372
},
{
"epoch": 0.2590277777777778,
"grad_norm": 0.012592338025569916,
"learning_rate": 3.47641150221386e-05,
"loss": 0.9726,
"step": 373
},
{
"epoch": 0.25972222222222224,
"grad_norm": 0.012716379016637802,
"learning_rate": 3.473371603140125e-05,
"loss": 0.9423,
"step": 374
},
{
"epoch": 0.2604166666666667,
"grad_norm": 0.01352920476347208,
"learning_rate": 3.47032424231078e-05,
"loss": 0.9829,
"step": 375
},
{
"epoch": 0.2611111111111111,
"grad_norm": 0.012491214089095592,
"learning_rate": 3.4672694351589046e-05,
"loss": 0.9633,
"step": 376
},
{
"epoch": 0.26180555555555557,
"grad_norm": 0.01319537591189146,
"learning_rate": 3.46420719715529e-05,
"loss": 0.9221,
"step": 377
},
{
"epoch": 0.2625,
"grad_norm": 0.013664872385561466,
"learning_rate": 3.46113754380836e-05,
"loss": 0.9621,
"step": 378
},
{
"epoch": 0.26319444444444445,
"grad_norm": 0.01348687894642353,
"learning_rate": 3.4580604906640936e-05,
"loss": 0.9634,
"step": 379
},
{
"epoch": 0.2638888888888889,
"grad_norm": 0.011980963870882988,
"learning_rate": 3.454976053305943e-05,
"loss": 0.9288,
"step": 380
},
{
"epoch": 0.26458333333333334,
"grad_norm": 0.01295421365648508,
"learning_rate": 3.4518842473547614e-05,
"loss": 0.9216,
"step": 381
},
{
"epoch": 0.2652777777777778,
"grad_norm": 0.014002040959894657,
"learning_rate": 3.4487850884687155e-05,
"loss": 0.9781,
"step": 382
},
{
"epoch": 0.2659722222222222,
"grad_norm": 0.012479842640459538,
"learning_rate": 3.445678592343212e-05,
"loss": 0.9305,
"step": 383
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.013758037239313126,
"learning_rate": 3.442564774710816e-05,
"loss": 0.9732,
"step": 384
},
{
"epoch": 0.2673611111111111,
"grad_norm": 0.01285907905548811,
"learning_rate": 3.439443651341172e-05,
"loss": 0.9494,
"step": 385
},
{
"epoch": 0.26805555555555555,
"grad_norm": 0.014414280652999878,
"learning_rate": 3.436315238040924e-05,
"loss": 0.9495,
"step": 386
},
{
"epoch": 0.26875,
"grad_norm": 0.013502138666808605,
"learning_rate": 3.4331795506536336e-05,
"loss": 0.924,
"step": 387
},
{
"epoch": 0.26944444444444443,
"grad_norm": 0.01338834036141634,
"learning_rate": 3.430036605059704e-05,
"loss": 0.9467,
"step": 388
},
{
"epoch": 0.2701388888888889,
"grad_norm": 0.014044429175555706,
"learning_rate": 3.426886417176294e-05,
"loss": 0.9976,
"step": 389
},
{
"epoch": 0.2708333333333333,
"grad_norm": 0.013052555732429028,
"learning_rate": 3.423729002957244e-05,
"loss": 0.9586,
"step": 390
},
{
"epoch": 0.27152777777777776,
"grad_norm": 0.012639213353395462,
"learning_rate": 3.4205643783929875e-05,
"loss": 0.9498,
"step": 391
},
{
"epoch": 0.2722222222222222,
"grad_norm": 0.012982922606170177,
"learning_rate": 3.417392559510475e-05,
"loss": 0.9844,
"step": 392
},
{
"epoch": 0.27291666666666664,
"grad_norm": 0.012655354104936123,
"learning_rate": 3.4142135623730954e-05,
"loss": 0.948,
"step": 393
},
{
"epoch": 0.27361111111111114,
"grad_norm": 0.013443528674542904,
"learning_rate": 3.411027403080587e-05,
"loss": 0.9682,
"step": 394
},
{
"epoch": 0.2743055555555556,
"grad_norm": 0.013121607713401318,
"learning_rate": 3.407834097768962e-05,
"loss": 1.0032,
"step": 395
},
{
"epoch": 0.275,
"grad_norm": 0.011974303051829338,
"learning_rate": 3.4046336626104235e-05,
"loss": 0.9615,
"step": 396
},
{
"epoch": 0.27569444444444446,
"grad_norm": 0.013200311921536922,
"learning_rate": 3.401426113813282e-05,
"loss": 0.9612,
"step": 397
},
{
"epoch": 0.2763888888888889,
"grad_norm": 0.011763915419578552,
"learning_rate": 3.398211467621875e-05,
"loss": 0.9523,
"step": 398
},
{
"epoch": 0.27708333333333335,
"grad_norm": 0.012212570756673813,
"learning_rate": 3.394989740316484e-05,
"loss": 0.9482,
"step": 399
},
{
"epoch": 0.2777777777777778,
"grad_norm": 0.012741866521537304,
"learning_rate": 3.391760948213251e-05,
"loss": 0.9787,
"step": 400
},
{
"epoch": 0.27847222222222223,
"grad_norm": 0.013035683892667294,
"learning_rate": 3.388525107664099e-05,
"loss": 0.9398,
"step": 401
},
{
"epoch": 0.2791666666666667,
"grad_norm": 0.013569427654147148,
"learning_rate": 3.3852822350566455e-05,
"loss": 0.9492,
"step": 402
},
{
"epoch": 0.2798611111111111,
"grad_norm": 0.012654770165681839,
"learning_rate": 3.382032346814123e-05,
"loss": 0.9762,
"step": 403
},
{
"epoch": 0.28055555555555556,
"grad_norm": 0.013426728546619415,
"learning_rate": 3.378775459395292e-05,
"loss": 0.9348,
"step": 404
},
{
"epoch": 0.28125,
"grad_norm": 0.01277459692209959,
"learning_rate": 3.3755115892943616e-05,
"loss": 0.9698,
"step": 405
},
{
"epoch": 0.28194444444444444,
"grad_norm": 0.013655406422913074,
"learning_rate": 3.372240753040902e-05,
"loss": 0.9792,
"step": 406
},
{
"epoch": 0.2826388888888889,
"grad_norm": 0.01355676632374525,
"learning_rate": 3.368962967199765e-05,
"loss": 0.9584,
"step": 407
},
{
"epoch": 0.2833333333333333,
"grad_norm": 0.013229291886091232,
"learning_rate": 3.3656782483709966e-05,
"loss": 0.9581,
"step": 408
},
{
"epoch": 0.28402777777777777,
"grad_norm": 0.01382211409509182,
"learning_rate": 3.3623866131897554e-05,
"loss": 0.9953,
"step": 409
},
{
"epoch": 0.2847222222222222,
"grad_norm": 0.012580876238644123,
"learning_rate": 3.3590880783262245e-05,
"loss": 0.9491,
"step": 410
},
{
"epoch": 0.28541666666666665,
"grad_norm": 0.014332804828882217,
"learning_rate": 3.3557826604855335e-05,
"loss": 0.9777,
"step": 411
},
{
"epoch": 0.2861111111111111,
"grad_norm": 0.014996136538684368,
"learning_rate": 3.3524703764076684e-05,
"loss": 0.949,
"step": 412
},
{
"epoch": 0.28680555555555554,
"grad_norm": 0.014224053360521793,
"learning_rate": 3.3491512428673877e-05,
"loss": 0.983,
"step": 413
},
{
"epoch": 0.2875,
"grad_norm": 0.014253957197070122,
"learning_rate": 3.345825276674139e-05,
"loss": 0.955,
"step": 414
},
{
"epoch": 0.2881944444444444,
"grad_norm": 0.0160871259868145,
"learning_rate": 3.342492494671976e-05,
"loss": 0.9732,
"step": 415
},
{
"epoch": 0.28888888888888886,
"grad_norm": 0.012912842445075512,
"learning_rate": 3.339152913739466e-05,
"loss": 0.9695,
"step": 416
},
{
"epoch": 0.28958333333333336,
"grad_norm": 0.013580222614109516,
"learning_rate": 3.335806550789611e-05,
"loss": 0.9974,
"step": 417
},
{
"epoch": 0.2902777777777778,
"grad_norm": 0.013716046698391438,
"learning_rate": 3.332453422769762e-05,
"loss": 0.9577,
"step": 418
},
{
"epoch": 0.29097222222222224,
"grad_norm": 0.013095886446535587,
"learning_rate": 3.329093546661526e-05,
"loss": 0.9624,
"step": 419
},
{
"epoch": 0.2916666666666667,
"grad_norm": 0.013227414339780807,
"learning_rate": 3.3257269394806894e-05,
"loss": 0.9823,
"step": 420
},
{
"epoch": 0.2923611111111111,
"grad_norm": 0.012513620778918266,
"learning_rate": 3.322353618277126e-05,
"loss": 0.9446,
"step": 421
},
{
"epoch": 0.29305555555555557,
"grad_norm": 0.01236687507480383,
"learning_rate": 3.3189736001347125e-05,
"loss": 0.9639,
"step": 422
},
{
"epoch": 0.29375,
"grad_norm": 0.013223248533904552,
"learning_rate": 3.315586902171241e-05,
"loss": 0.9436,
"step": 423
},
{
"epoch": 0.29444444444444445,
"grad_norm": 0.01373869925737381,
"learning_rate": 3.3121935415383325e-05,
"loss": 0.9751,
"step": 424
},
{
"epoch": 0.2951388888888889,
"grad_norm": 0.012828570790588856,
"learning_rate": 3.308793535421353e-05,
"loss": 0.98,
"step": 425
},
{
"epoch": 0.29583333333333334,
"grad_norm": 0.011885512620210648,
"learning_rate": 3.305386901039321e-05,
"loss": 0.9612,
"step": 426
},
{
"epoch": 0.2965277777777778,
"grad_norm": 0.011671421118080616,
"learning_rate": 3.301973655644825e-05,
"loss": 0.9269,
"step": 427
},
{
"epoch": 0.2972222222222222,
"grad_norm": 0.013502037152647972,
"learning_rate": 3.298553816523934e-05,
"loss": 0.9404,
"step": 428
},
{
"epoch": 0.29791666666666666,
"grad_norm": 0.013123046606779099,
"learning_rate": 3.2951274009961094e-05,
"loss": 0.9812,
"step": 429
},
{
"epoch": 0.2986111111111111,
"grad_norm": 0.012360905297100544,
"learning_rate": 3.29169442641412e-05,
"loss": 0.9355,
"step": 430
},
{
"epoch": 0.29930555555555555,
"grad_norm": 0.01276017352938652,
"learning_rate": 3.288254910163951e-05,
"loss": 0.9689,
"step": 431
},
{
"epoch": 0.3,
"grad_norm": 0.013017778284847736,
"learning_rate": 3.284808869664716e-05,
"loss": 0.9716,
"step": 432
},
{
"epoch": 0.30069444444444443,
"grad_norm": 0.011824820190668106,
"learning_rate": 3.281356322368575e-05,
"loss": 0.9633,
"step": 433
},
{
"epoch": 0.3013888888888889,
"grad_norm": 0.012792828492820263,
"learning_rate": 3.277897285760635e-05,
"loss": 0.9841,
"step": 434
},
{
"epoch": 0.3020833333333333,
"grad_norm": 0.012392980977892876,
"learning_rate": 3.2744317773588696e-05,
"loss": 0.9172,
"step": 435
},
{
"epoch": 0.30277777777777776,
"grad_norm": 0.013595100492238998,
"learning_rate": 3.270959814714032e-05,
"loss": 0.9827,
"step": 436
},
{
"epoch": 0.3034722222222222,
"grad_norm": 0.01175049040466547,
"learning_rate": 3.267481415409557e-05,
"loss": 0.9782,
"step": 437
},
{
"epoch": 0.30416666666666664,
"grad_norm": 0.013577048666775227,
"learning_rate": 3.26399659706148e-05,
"loss": 1.0069,
"step": 438
},
{
"epoch": 0.30486111111111114,
"grad_norm": 0.012351465411484241,
"learning_rate": 3.260505377318344e-05,
"loss": 0.9585,
"step": 439
},
{
"epoch": 0.3055555555555556,
"grad_norm": 0.013745117001235485,
"learning_rate": 3.257007773861113e-05,
"loss": 0.9942,
"step": 440
},
{
"epoch": 0.30625,
"grad_norm": 0.013106070458889008,
"learning_rate": 3.253503804403079e-05,
"loss": 0.9419,
"step": 441
},
{
"epoch": 0.30694444444444446,
"grad_norm": 0.013440150767564774,
"learning_rate": 3.249993486689774e-05,
"loss": 0.9428,
"step": 442
},
{
"epoch": 0.3076388888888889,
"grad_norm": 0.014590987004339695,
"learning_rate": 3.246476838498881e-05,
"loss": 0.9874,
"step": 443
},
{
"epoch": 0.30833333333333335,
"grad_norm": 0.013203262351453304,
"learning_rate": 3.242953877640142e-05,
"loss": 0.947,
"step": 444
},
{
"epoch": 0.3090277777777778,
"grad_norm": 0.013087602332234383,
"learning_rate": 3.2394246219552724e-05,
"loss": 0.9897,
"step": 445
},
{
"epoch": 0.30972222222222223,
"grad_norm": 0.012205411680042744,
"learning_rate": 3.2358890893178617e-05,
"loss": 0.9489,
"step": 446
},
{
"epoch": 0.3104166666666667,
"grad_norm": 0.012712490744888783,
"learning_rate": 3.2323472976332926e-05,
"loss": 0.9363,
"step": 447
},
{
"epoch": 0.3111111111111111,
"grad_norm": 0.012987968511879444,
"learning_rate": 3.228799264838645e-05,
"loss": 0.9568,
"step": 448
},
{
"epoch": 0.31180555555555556,
"grad_norm": 0.014164082705974579,
"learning_rate": 3.225245008902606e-05,
"loss": 0.9515,
"step": 449
},
{
"epoch": 0.3125,
"grad_norm": 0.014080497436225414,
"learning_rate": 3.221684547825379e-05,
"loss": 0.9715,
"step": 450
},
{
"epoch": 0.31319444444444444,
"grad_norm": 0.01266495417803526,
"learning_rate": 3.218117899638594e-05,
"loss": 0.9961,
"step": 451
},
{
"epoch": 0.3138888888888889,
"grad_norm": 0.012783526442945004,
"learning_rate": 3.214545082405213e-05,
"loss": 0.9667,
"step": 452
},
{
"epoch": 0.3145833333333333,
"grad_norm": 0.012183685787022114,
"learning_rate": 3.210966114219444e-05,
"loss": 0.9561,
"step": 453
},
{
"epoch": 0.31527777777777777,
"grad_norm": 0.014331048354506493,
"learning_rate": 3.2073810132066414e-05,
"loss": 1.0032,
"step": 454
},
{
"epoch": 0.3159722222222222,
"grad_norm": 0.012980937026441097,
"learning_rate": 3.2037897975232216e-05,
"loss": 0.9567,
"step": 455
},
{
"epoch": 0.31666666666666665,
"grad_norm": 0.01176460087299347,
"learning_rate": 3.200192485356569e-05,
"loss": 0.9396,
"step": 456
},
{
"epoch": 0.3173611111111111,
"grad_norm": 0.012090741656720638,
"learning_rate": 3.1965890949249405e-05,
"loss": 0.9494,
"step": 457
},
{
"epoch": 0.31805555555555554,
"grad_norm": 0.013988793827593327,
"learning_rate": 3.192979644477378e-05,
"loss": 1.0022,
"step": 458
},
{
"epoch": 0.31875,
"grad_norm": 0.012773574329912663,
"learning_rate": 3.189364152293612e-05,
"loss": 1.022,
"step": 459
},
{
"epoch": 0.3194444444444444,
"grad_norm": 0.013159438967704773,
"learning_rate": 3.185742636683972e-05,
"loss": 0.9754,
"step": 460
},
{
"epoch": 0.32013888888888886,
"grad_norm": 0.015010225586593151,
"learning_rate": 3.1821151159892924e-05,
"loss": 0.9525,
"step": 461
},
{
"epoch": 0.32083333333333336,
"grad_norm": 0.012587225064635277,
"learning_rate": 3.1784816085808196e-05,
"loss": 0.9651,
"step": 462
},
{
"epoch": 0.3215277777777778,
"grad_norm": 0.01335595827549696,
"learning_rate": 3.17484213286012e-05,
"loss": 0.9691,
"step": 463
},
{
"epoch": 0.32222222222222224,
"grad_norm": 0.012998990714550018,
"learning_rate": 3.171196707258984e-05,
"loss": 0.936,
"step": 464
},
{
"epoch": 0.3229166666666667,
"grad_norm": 0.013085835613310337,
"learning_rate": 3.167545350239336e-05,
"loss": 0.9294,
"step": 465
},
{
"epoch": 0.3236111111111111,
"grad_norm": 0.012886827811598778,
"learning_rate": 3.16388808029314e-05,
"loss": 0.9685,
"step": 466
},
{
"epoch": 0.32430555555555557,
"grad_norm": 0.014822276309132576,
"learning_rate": 3.1602249159423054e-05,
"loss": 0.9545,
"step": 467
},
{
"epoch": 0.325,
"grad_norm": 0.013084998354315758,
"learning_rate": 3.1565558757385914e-05,
"loss": 0.9638,
"step": 468
},
{
"epoch": 0.32569444444444445,
"grad_norm": 0.012661305256187916,
"learning_rate": 3.152880978263517e-05,
"loss": 0.9033,
"step": 469
},
{
"epoch": 0.3263888888888889,
"grad_norm": 0.0137302465736866,
"learning_rate": 3.149200242128263e-05,
"loss": 0.9576,
"step": 470
},
{
"epoch": 0.32708333333333334,
"grad_norm": 0.01353361364454031,
"learning_rate": 3.145513685973583e-05,
"loss": 0.9555,
"step": 471
},
{
"epoch": 0.3277777777777778,
"grad_norm": 0.01424756832420826,
"learning_rate": 3.1418213284697e-05,
"loss": 0.9583,
"step": 472
},
{
"epoch": 0.3284722222222222,
"grad_norm": 0.013587539084255695,
"learning_rate": 3.138123188316224e-05,
"loss": 0.9665,
"step": 473
},
{
"epoch": 0.32916666666666666,
"grad_norm": 0.013397585600614548,
"learning_rate": 3.1344192842420435e-05,
"loss": 0.951,
"step": 474
},
{
"epoch": 0.3298611111111111,
"grad_norm": 0.012381638400256634,
"learning_rate": 3.130709635005245e-05,
"loss": 0.96,
"step": 475
},
{
"epoch": 0.33055555555555555,
"grad_norm": 0.014178499579429626,
"learning_rate": 3.1269942593930055e-05,
"loss": 0.9891,
"step": 476
},
{
"epoch": 0.33125,
"grad_norm": 0.013224642723798752,
"learning_rate": 3.123273176221506e-05,
"loss": 0.9719,
"step": 477
},
{
"epoch": 0.33194444444444443,
"grad_norm": 0.012466056272387505,
"learning_rate": 3.119546404335831e-05,
"loss": 0.968,
"step": 478
},
{
"epoch": 0.3326388888888889,
"grad_norm": 0.01483081839978695,
"learning_rate": 3.115813962609874e-05,
"loss": 0.9601,
"step": 479
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.013933224603533745,
"learning_rate": 3.112075869946247e-05,
"loss": 0.9802,
"step": 480
},
{
"epoch": 0.33402777777777776,
"grad_norm": 0.013764635659754276,
"learning_rate": 3.108332145276177e-05,
"loss": 0.9646,
"step": 481
},
{
"epoch": 0.3347222222222222,
"grad_norm": 0.014757770113646984,
"learning_rate": 3.104582807559414e-05,
"loss": 0.9592,
"step": 482
},
{
"epoch": 0.33541666666666664,
"grad_norm": 0.012920022010803223,
"learning_rate": 3.100827875784138e-05,
"loss": 0.9674,
"step": 483
},
{
"epoch": 0.33611111111111114,
"grad_norm": 0.013609576970338821,
"learning_rate": 3.097067368966855e-05,
"loss": 0.9456,
"step": 484
},
{
"epoch": 0.3368055555555556,
"grad_norm": 0.013191001489758492,
"learning_rate": 3.093301306152308e-05,
"loss": 0.9568,
"step": 485
},
{
"epoch": 0.3375,
"grad_norm": 0.013639998622238636,
"learning_rate": 3.089529706413378e-05,
"loss": 0.9746,
"step": 486
},
{
"epoch": 0.33819444444444446,
"grad_norm": 0.013085294514894485,
"learning_rate": 3.085752588850986e-05,
"loss": 0.9391,
"step": 487
},
{
"epoch": 0.3388888888888889,
"grad_norm": 0.01391910295933485,
"learning_rate": 3.081969972593999e-05,
"loss": 0.9885,
"step": 488
},
{
"epoch": 0.33958333333333335,
"grad_norm": 0.013547900132834911,
"learning_rate": 3.0781818767991295e-05,
"loss": 0.9481,
"step": 489
},
{
"epoch": 0.3402777777777778,
"grad_norm": 0.012131288647651672,
"learning_rate": 3.074388320650843e-05,
"loss": 0.9667,
"step": 490
},
{
"epoch": 0.34097222222222223,
"grad_norm": 0.013132079504430294,
"learning_rate": 3.070589323361257e-05,
"loss": 0.9915,
"step": 491
},
{
"epoch": 0.3416666666666667,
"grad_norm": 0.014800147153437138,
"learning_rate": 3.0667849041700454e-05,
"loss": 0.9737,
"step": 492
},
{
"epoch": 0.3423611111111111,
"grad_norm": 0.01190117746591568,
"learning_rate": 3.062975082344341e-05,
"loss": 0.9413,
"step": 493
},
{
"epoch": 0.34305555555555556,
"grad_norm": 0.011400418356060982,
"learning_rate": 3.059159877178638e-05,
"loss": 0.9453,
"step": 494
},
{
"epoch": 0.34375,
"grad_norm": 0.011752789840102196,
"learning_rate": 3.055339307994693e-05,
"loss": 0.9832,
"step": 495
},
{
"epoch": 0.34444444444444444,
"grad_norm": 0.01378630194813013,
"learning_rate": 3.0515133941414294e-05,
"loss": 0.9243,
"step": 496
},
{
"epoch": 0.3451388888888889,
"grad_norm": 0.012208312749862671,
"learning_rate": 3.0476821549948376e-05,
"loss": 0.9798,
"step": 497
},
{
"epoch": 0.3458333333333333,
"grad_norm": 0.01983562856912613,
"learning_rate": 3.0438456099578775e-05,
"loss": 0.9998,
"step": 498
},
{
"epoch": 0.34652777777777777,
"grad_norm": 0.013442217372357845,
"learning_rate": 3.0400037784603805e-05,
"loss": 0.9304,
"step": 499
},
{
"epoch": 0.3472222222222222,
"grad_norm": 0.014221866615116596,
"learning_rate": 3.0361566799589498e-05,
"loss": 0.9938,
"step": 500
}
],
"logging_steps": 1.0,
"max_steps": 1440,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1598246092800000.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}