farmery's picture
Training in progress, step 833, checkpoint
d1e5024 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9991004497751125,
"eval_steps": 500,
"global_step": 833,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011994002998500749,
"grad_norm": 0.11149752140045166,
"learning_rate": 1.0000000000000002e-06,
"loss": 10.3757,
"step": 1
},
{
"epoch": 0.0023988005997001498,
"grad_norm": 0.1320822536945343,
"learning_rate": 2.0000000000000003e-06,
"loss": 10.3748,
"step": 2
},
{
"epoch": 0.003598200899550225,
"grad_norm": 0.13625332713127136,
"learning_rate": 3e-06,
"loss": 10.3734,
"step": 3
},
{
"epoch": 0.0047976011994002995,
"grad_norm": 0.1434432715177536,
"learning_rate": 4.000000000000001e-06,
"loss": 10.3739,
"step": 4
},
{
"epoch": 0.005997001499250375,
"grad_norm": 0.14035488665103912,
"learning_rate": 5e-06,
"loss": 10.3765,
"step": 5
},
{
"epoch": 0.00719640179910045,
"grad_norm": 0.14915668964385986,
"learning_rate": 6e-06,
"loss": 10.3743,
"step": 6
},
{
"epoch": 0.008395802098950524,
"grad_norm": 0.15720613300800323,
"learning_rate": 7.000000000000001e-06,
"loss": 10.3724,
"step": 7
},
{
"epoch": 0.009595202398800599,
"grad_norm": 0.15728804469108582,
"learning_rate": 8.000000000000001e-06,
"loss": 10.3741,
"step": 8
},
{
"epoch": 0.010794602698650674,
"grad_norm": 0.16915467381477356,
"learning_rate": 9e-06,
"loss": 10.3754,
"step": 9
},
{
"epoch": 0.01199400299850075,
"grad_norm": 0.17764562368392944,
"learning_rate": 1e-05,
"loss": 10.3721,
"step": 10
},
{
"epoch": 0.013193403298350824,
"grad_norm": 0.19067725539207458,
"learning_rate": 1.1000000000000001e-05,
"loss": 10.3733,
"step": 11
},
{
"epoch": 0.0143928035982009,
"grad_norm": 0.20686227083206177,
"learning_rate": 1.2e-05,
"loss": 10.3718,
"step": 12
},
{
"epoch": 0.015592203898050975,
"grad_norm": 0.13864944875240326,
"learning_rate": 1.3000000000000001e-05,
"loss": 10.3757,
"step": 13
},
{
"epoch": 0.016791604197901048,
"grad_norm": 0.13187511265277863,
"learning_rate": 1.4000000000000001e-05,
"loss": 10.3746,
"step": 14
},
{
"epoch": 0.017991004497751123,
"grad_norm": 0.13840411603450775,
"learning_rate": 1.5e-05,
"loss": 10.3745,
"step": 15
},
{
"epoch": 0.019190404797601198,
"grad_norm": 0.13771533966064453,
"learning_rate": 1.6000000000000003e-05,
"loss": 10.3757,
"step": 16
},
{
"epoch": 0.020389805097451273,
"grad_norm": 0.144022136926651,
"learning_rate": 1.7000000000000003e-05,
"loss": 10.3742,
"step": 17
},
{
"epoch": 0.02158920539730135,
"grad_norm": 0.14401449263095856,
"learning_rate": 1.8e-05,
"loss": 10.3742,
"step": 18
},
{
"epoch": 0.022788605697151423,
"grad_norm": 0.15697641670703888,
"learning_rate": 1.9e-05,
"loss": 10.3749,
"step": 19
},
{
"epoch": 0.0239880059970015,
"grad_norm": 0.15877871215343475,
"learning_rate": 2e-05,
"loss": 10.3725,
"step": 20
},
{
"epoch": 0.025187406296851574,
"grad_norm": 0.16563433408737183,
"learning_rate": 2.1e-05,
"loss": 10.3743,
"step": 21
},
{
"epoch": 0.02638680659670165,
"grad_norm": 0.17025835812091827,
"learning_rate": 2.2000000000000003e-05,
"loss": 10.3738,
"step": 22
},
{
"epoch": 0.027586206896551724,
"grad_norm": 0.17753835022449493,
"learning_rate": 2.3000000000000003e-05,
"loss": 10.3732,
"step": 23
},
{
"epoch": 0.0287856071964018,
"grad_norm": 0.19433385133743286,
"learning_rate": 2.4e-05,
"loss": 10.373,
"step": 24
},
{
"epoch": 0.029985007496251874,
"grad_norm": 0.2346523553133011,
"learning_rate": 2.5e-05,
"loss": 10.3702,
"step": 25
},
{
"epoch": 0.03118440779610195,
"grad_norm": 0.11483809351921082,
"learning_rate": 2.6000000000000002e-05,
"loss": 10.3731,
"step": 26
},
{
"epoch": 0.032383808095952024,
"grad_norm": 0.12950515747070312,
"learning_rate": 2.7000000000000002e-05,
"loss": 10.3734,
"step": 27
},
{
"epoch": 0.033583208395802096,
"grad_norm": 0.13586033880710602,
"learning_rate": 2.8000000000000003e-05,
"loss": 10.3725,
"step": 28
},
{
"epoch": 0.034782608695652174,
"grad_norm": 0.14205914735794067,
"learning_rate": 2.9e-05,
"loss": 10.375,
"step": 29
},
{
"epoch": 0.035982008995502246,
"grad_norm": 0.14829206466674805,
"learning_rate": 3e-05,
"loss": 10.3727,
"step": 30
},
{
"epoch": 0.037181409295352325,
"grad_norm": 0.15150436758995056,
"learning_rate": 3.1e-05,
"loss": 10.3721,
"step": 31
},
{
"epoch": 0.038380809595202396,
"grad_norm": 0.1555749624967575,
"learning_rate": 3.2000000000000005e-05,
"loss": 10.3742,
"step": 32
},
{
"epoch": 0.039580209895052475,
"grad_norm": 0.1620868444442749,
"learning_rate": 3.3e-05,
"loss": 10.3729,
"step": 33
},
{
"epoch": 0.040779610194902546,
"grad_norm": 0.1725643426179886,
"learning_rate": 3.4000000000000007e-05,
"loss": 10.3718,
"step": 34
},
{
"epoch": 0.041979010494752625,
"grad_norm": 0.18715962767601013,
"learning_rate": 3.5e-05,
"loss": 10.371,
"step": 35
},
{
"epoch": 0.0431784107946027,
"grad_norm": 0.19183875620365143,
"learning_rate": 3.6e-05,
"loss": 10.3718,
"step": 36
},
{
"epoch": 0.044377811094452775,
"grad_norm": 0.21521849930286407,
"learning_rate": 3.7e-05,
"loss": 10.3719,
"step": 37
},
{
"epoch": 0.04557721139430285,
"grad_norm": 0.1310717910528183,
"learning_rate": 3.8e-05,
"loss": 10.3731,
"step": 38
},
{
"epoch": 0.046776611694152925,
"grad_norm": 0.12645111978054047,
"learning_rate": 3.9000000000000006e-05,
"loss": 10.3728,
"step": 39
},
{
"epoch": 0.047976011994003,
"grad_norm": 0.1375029981136322,
"learning_rate": 4e-05,
"loss": 10.3719,
"step": 40
},
{
"epoch": 0.049175412293853075,
"grad_norm": 0.1415010392665863,
"learning_rate": 4.1e-05,
"loss": 10.3723,
"step": 41
},
{
"epoch": 0.05037481259370315,
"grad_norm": 0.1474965512752533,
"learning_rate": 4.2e-05,
"loss": 10.3708,
"step": 42
},
{
"epoch": 0.051574212893553226,
"grad_norm": 0.1504737138748169,
"learning_rate": 4.3e-05,
"loss": 10.3718,
"step": 43
},
{
"epoch": 0.0527736131934033,
"grad_norm": 0.15808707475662231,
"learning_rate": 4.4000000000000006e-05,
"loss": 10.3712,
"step": 44
},
{
"epoch": 0.053973013493253376,
"grad_norm": 0.16332747042179108,
"learning_rate": 4.5e-05,
"loss": 10.3695,
"step": 45
},
{
"epoch": 0.05517241379310345,
"grad_norm": 0.17212961614131927,
"learning_rate": 4.600000000000001e-05,
"loss": 10.3692,
"step": 46
},
{
"epoch": 0.056371814092953526,
"grad_norm": 0.17262108623981476,
"learning_rate": 4.7e-05,
"loss": 10.3705,
"step": 47
},
{
"epoch": 0.0575712143928036,
"grad_norm": 0.1849043071269989,
"learning_rate": 4.8e-05,
"loss": 10.3671,
"step": 48
},
{
"epoch": 0.058770614692653676,
"grad_norm": 0.19811047613620758,
"learning_rate": 4.9e-05,
"loss": 10.3671,
"step": 49
},
{
"epoch": 0.05997001499250375,
"grad_norm": 0.2461751103401184,
"learning_rate": 5e-05,
"loss": 10.3667,
"step": 50
},
{
"epoch": 0.061169415292353826,
"grad_norm": 0.12041845172643661,
"learning_rate": 5.1000000000000006e-05,
"loss": 10.371,
"step": 51
},
{
"epoch": 0.0623688155922039,
"grad_norm": 0.12933555245399475,
"learning_rate": 5.2000000000000004e-05,
"loss": 10.3694,
"step": 52
},
{
"epoch": 0.06356821589205397,
"grad_norm": 0.14095033705234528,
"learning_rate": 5.300000000000001e-05,
"loss": 10.3693,
"step": 53
},
{
"epoch": 0.06476761619190405,
"grad_norm": 0.14537349343299866,
"learning_rate": 5.4000000000000005e-05,
"loss": 10.3671,
"step": 54
},
{
"epoch": 0.06596701649175413,
"grad_norm": 0.1486896276473999,
"learning_rate": 5.500000000000001e-05,
"loss": 10.3671,
"step": 55
},
{
"epoch": 0.06716641679160419,
"grad_norm": 0.15299569070339203,
"learning_rate": 5.6000000000000006e-05,
"loss": 10.3668,
"step": 56
},
{
"epoch": 0.06836581709145427,
"grad_norm": 0.16295485198497772,
"learning_rate": 5.6999999999999996e-05,
"loss": 10.3653,
"step": 57
},
{
"epoch": 0.06956521739130435,
"grad_norm": 0.16358605027198792,
"learning_rate": 5.8e-05,
"loss": 10.3661,
"step": 58
},
{
"epoch": 0.07076461769115443,
"grad_norm": 0.17602834105491638,
"learning_rate": 5.9e-05,
"loss": 10.3633,
"step": 59
},
{
"epoch": 0.07196401799100449,
"grad_norm": 0.18307778239250183,
"learning_rate": 6e-05,
"loss": 10.3636,
"step": 60
},
{
"epoch": 0.07316341829085457,
"grad_norm": 0.19053678214550018,
"learning_rate": 6.1e-05,
"loss": 10.3633,
"step": 61
},
{
"epoch": 0.07436281859070465,
"grad_norm": 0.20003724098205566,
"learning_rate": 6.2e-05,
"loss": 10.3622,
"step": 62
},
{
"epoch": 0.07556221889055473,
"grad_norm": 0.15672719478607178,
"learning_rate": 6.3e-05,
"loss": 10.3644,
"step": 63
},
{
"epoch": 0.07676161919040479,
"grad_norm": 0.14309260249137878,
"learning_rate": 6.400000000000001e-05,
"loss": 10.3649,
"step": 64
},
{
"epoch": 0.07796101949025487,
"grad_norm": 0.1525091975927353,
"learning_rate": 6.500000000000001e-05,
"loss": 10.3646,
"step": 65
},
{
"epoch": 0.07916041979010495,
"grad_norm": 0.15806975960731506,
"learning_rate": 6.6e-05,
"loss": 10.3623,
"step": 66
},
{
"epoch": 0.08035982008995503,
"grad_norm": 0.16560155153274536,
"learning_rate": 6.7e-05,
"loss": 10.3609,
"step": 67
},
{
"epoch": 0.08155922038980509,
"grad_norm": 0.16659671068191528,
"learning_rate": 6.800000000000001e-05,
"loss": 10.3601,
"step": 68
},
{
"epoch": 0.08275862068965517,
"grad_norm": 0.18391086161136627,
"learning_rate": 6.9e-05,
"loss": 10.3617,
"step": 69
},
{
"epoch": 0.08395802098950525,
"grad_norm": 0.1868380904197693,
"learning_rate": 7e-05,
"loss": 10.3576,
"step": 70
},
{
"epoch": 0.08515742128935533,
"grad_norm": 0.20636723935604095,
"learning_rate": 7.1e-05,
"loss": 10.357,
"step": 71
},
{
"epoch": 0.0863568215892054,
"grad_norm": 0.2090313583612442,
"learning_rate": 7.2e-05,
"loss": 10.3535,
"step": 72
},
{
"epoch": 0.08755622188905547,
"grad_norm": 0.23220857977867126,
"learning_rate": 7.3e-05,
"loss": 10.355,
"step": 73
},
{
"epoch": 0.08875562218890555,
"grad_norm": 0.23836413025856018,
"learning_rate": 7.4e-05,
"loss": 10.3502,
"step": 74
},
{
"epoch": 0.08995502248875563,
"grad_norm": 0.2619498670101166,
"learning_rate": 7.500000000000001e-05,
"loss": 10.3485,
"step": 75
},
{
"epoch": 0.0911544227886057,
"grad_norm": 0.20814213156700134,
"learning_rate": 7.6e-05,
"loss": 10.3572,
"step": 76
},
{
"epoch": 0.09235382308845577,
"grad_norm": 0.2424175888299942,
"learning_rate": 7.7e-05,
"loss": 10.3524,
"step": 77
},
{
"epoch": 0.09355322338830585,
"grad_norm": 0.2402586191892624,
"learning_rate": 7.800000000000001e-05,
"loss": 10.3501,
"step": 78
},
{
"epoch": 0.09475262368815592,
"grad_norm": 0.27135393023490906,
"learning_rate": 7.900000000000001e-05,
"loss": 10.3486,
"step": 79
},
{
"epoch": 0.095952023988006,
"grad_norm": 0.279787540435791,
"learning_rate": 8e-05,
"loss": 10.3447,
"step": 80
},
{
"epoch": 0.09715142428785607,
"grad_norm": 0.28797608613967896,
"learning_rate": 8.1e-05,
"loss": 10.3438,
"step": 81
},
{
"epoch": 0.09835082458770615,
"grad_norm": 0.3241998851299286,
"learning_rate": 8.2e-05,
"loss": 10.3392,
"step": 82
},
{
"epoch": 0.09955022488755622,
"grad_norm": 0.3485376238822937,
"learning_rate": 8.3e-05,
"loss": 10.3364,
"step": 83
},
{
"epoch": 0.1007496251874063,
"grad_norm": 0.3528934419155121,
"learning_rate": 8.4e-05,
"loss": 10.3339,
"step": 84
},
{
"epoch": 0.10194902548725637,
"grad_norm": 0.361213743686676,
"learning_rate": 8.5e-05,
"loss": 10.3271,
"step": 85
},
{
"epoch": 0.10314842578710645,
"grad_norm": 0.3762340843677521,
"learning_rate": 8.6e-05,
"loss": 10.3242,
"step": 86
},
{
"epoch": 0.10434782608695652,
"grad_norm": 0.3962249159812927,
"learning_rate": 8.7e-05,
"loss": 10.3198,
"step": 87
},
{
"epoch": 0.1055472263868066,
"grad_norm": 0.4154474139213562,
"learning_rate": 8.800000000000001e-05,
"loss": 10.3241,
"step": 88
},
{
"epoch": 0.10674662668665667,
"grad_norm": 0.42189449071884155,
"learning_rate": 8.900000000000001e-05,
"loss": 10.319,
"step": 89
},
{
"epoch": 0.10794602698650675,
"grad_norm": 0.3983931541442871,
"learning_rate": 9e-05,
"loss": 10.3142,
"step": 90
},
{
"epoch": 0.10914542728635682,
"grad_norm": 0.39982685446739197,
"learning_rate": 9.1e-05,
"loss": 10.3076,
"step": 91
},
{
"epoch": 0.1103448275862069,
"grad_norm": 0.3935539424419403,
"learning_rate": 9.200000000000001e-05,
"loss": 10.3047,
"step": 92
},
{
"epoch": 0.11154422788605697,
"grad_norm": 0.3751447796821594,
"learning_rate": 9.300000000000001e-05,
"loss": 10.2953,
"step": 93
},
{
"epoch": 0.11274362818590705,
"grad_norm": 0.3766322135925293,
"learning_rate": 9.4e-05,
"loss": 10.2932,
"step": 94
},
{
"epoch": 0.11394302848575712,
"grad_norm": 0.3571270704269409,
"learning_rate": 9.5e-05,
"loss": 10.2875,
"step": 95
},
{
"epoch": 0.1151424287856072,
"grad_norm": 0.34838995337486267,
"learning_rate": 9.6e-05,
"loss": 10.2812,
"step": 96
},
{
"epoch": 0.11634182908545727,
"grad_norm": 0.33183571696281433,
"learning_rate": 9.7e-05,
"loss": 10.276,
"step": 97
},
{
"epoch": 0.11754122938530735,
"grad_norm": 0.3224335312843323,
"learning_rate": 9.8e-05,
"loss": 10.2705,
"step": 98
},
{
"epoch": 0.11874062968515742,
"grad_norm": 0.33488717675209045,
"learning_rate": 9.900000000000001e-05,
"loss": 10.2681,
"step": 99
},
{
"epoch": 0.1199400299850075,
"grad_norm": 0.3330170810222626,
"learning_rate": 0.0001,
"loss": 10.2595,
"step": 100
},
{
"epoch": 0.12113943028485757,
"grad_norm": 0.29305723309516907,
"learning_rate": 9.999954076906038e-05,
"loss": 10.2593,
"step": 101
},
{
"epoch": 0.12233883058470765,
"grad_norm": 0.2937624454498291,
"learning_rate": 9.999816308467719e-05,
"loss": 10.2547,
"step": 102
},
{
"epoch": 0.12353823088455772,
"grad_norm": 0.29269203543663025,
"learning_rate": 9.999586697215748e-05,
"loss": 10.2482,
"step": 103
},
{
"epoch": 0.1247376311844078,
"grad_norm": 0.2853996157646179,
"learning_rate": 9.999265247367908e-05,
"loss": 10.2453,
"step": 104
},
{
"epoch": 0.12593703148425786,
"grad_norm": 0.28077051043510437,
"learning_rate": 9.998851964828986e-05,
"loss": 10.2412,
"step": 105
},
{
"epoch": 0.12713643178410794,
"grad_norm": 0.27728626132011414,
"learning_rate": 9.99834685719067e-05,
"loss": 10.2357,
"step": 106
},
{
"epoch": 0.12833583208395802,
"grad_norm": 0.28931453824043274,
"learning_rate": 9.997749933731398e-05,
"loss": 10.2308,
"step": 107
},
{
"epoch": 0.1295352323838081,
"grad_norm": 0.3001117408275604,
"learning_rate": 9.997061205416203e-05,
"loss": 10.2248,
"step": 108
},
{
"epoch": 0.13073463268365818,
"grad_norm": 0.33052486181259155,
"learning_rate": 9.996280684896495e-05,
"loss": 10.2211,
"step": 109
},
{
"epoch": 0.13193403298350825,
"grad_norm": 0.3081456124782562,
"learning_rate": 9.995408386509846e-05,
"loss": 10.2144,
"step": 110
},
{
"epoch": 0.13313343328335833,
"grad_norm": 0.29855111241340637,
"learning_rate": 9.994444326279708e-05,
"loss": 10.2106,
"step": 111
},
{
"epoch": 0.13433283358320838,
"grad_norm": 0.31099098920822144,
"learning_rate": 9.993388521915134e-05,
"loss": 10.208,
"step": 112
},
{
"epoch": 0.13553223388305846,
"grad_norm": 0.2797197103500366,
"learning_rate": 9.992240992810444e-05,
"loss": 10.2032,
"step": 113
},
{
"epoch": 0.13673163418290854,
"grad_norm": 0.2696700692176819,
"learning_rate": 9.991001760044875e-05,
"loss": 10.1995,
"step": 114
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.2778705954551697,
"learning_rate": 9.989670846382188e-05,
"loss": 10.1948,
"step": 115
},
{
"epoch": 0.1391304347826087,
"grad_norm": 0.28072160482406616,
"learning_rate": 9.988248276270248e-05,
"loss": 10.1918,
"step": 116
},
{
"epoch": 0.14032983508245878,
"grad_norm": 0.2873566150665283,
"learning_rate": 9.98673407584059e-05,
"loss": 10.1859,
"step": 117
},
{
"epoch": 0.14152923538230885,
"grad_norm": 0.2754334807395935,
"learning_rate": 9.985128272907918e-05,
"loss": 10.1841,
"step": 118
},
{
"epoch": 0.14272863568215893,
"grad_norm": 0.27839502692222595,
"learning_rate": 9.983430896969605e-05,
"loss": 10.1797,
"step": 119
},
{
"epoch": 0.14392803598200898,
"grad_norm": 0.28016844391822815,
"learning_rate": 9.981641979205158e-05,
"loss": 10.1762,
"step": 120
},
{
"epoch": 0.14512743628185906,
"grad_norm": 0.2875809669494629,
"learning_rate": 9.979761552475628e-05,
"loss": 10.169,
"step": 121
},
{
"epoch": 0.14632683658170914,
"grad_norm": 0.29734012484550476,
"learning_rate": 9.977789651323023e-05,
"loss": 10.1667,
"step": 122
},
{
"epoch": 0.14752623688155922,
"grad_norm": 0.34327661991119385,
"learning_rate": 9.975726311969664e-05,
"loss": 10.1617,
"step": 123
},
{
"epoch": 0.1487256371814093,
"grad_norm": 0.32941052317619324,
"learning_rate": 9.973571572317519e-05,
"loss": 10.1619,
"step": 124
},
{
"epoch": 0.14992503748125938,
"grad_norm": 0.3153591752052307,
"learning_rate": 9.971325471947517e-05,
"loss": 10.1544,
"step": 125
},
{
"epoch": 0.15112443778110946,
"grad_norm": 0.26798173785209656,
"learning_rate": 9.968988052118804e-05,
"loss": 10.1517,
"step": 126
},
{
"epoch": 0.15232383808095953,
"grad_norm": 0.2733558714389801,
"learning_rate": 9.966559355768005e-05,
"loss": 10.1469,
"step": 127
},
{
"epoch": 0.15352323838080958,
"grad_norm": 0.275776207447052,
"learning_rate": 9.964039427508418e-05,
"loss": 10.1425,
"step": 128
},
{
"epoch": 0.15472263868065966,
"grad_norm": 0.2864780128002167,
"learning_rate": 9.961428313629203e-05,
"loss": 10.1389,
"step": 129
},
{
"epoch": 0.15592203898050974,
"grad_norm": 0.2879246473312378,
"learning_rate": 9.958726062094534e-05,
"loss": 10.1327,
"step": 130
},
{
"epoch": 0.15712143928035982,
"grad_norm": 0.28718408942222595,
"learning_rate": 9.955932722542708e-05,
"loss": 10.1328,
"step": 131
},
{
"epoch": 0.1583208395802099,
"grad_norm": 0.2976157069206238,
"learning_rate": 9.953048346285245e-05,
"loss": 10.1255,
"step": 132
},
{
"epoch": 0.15952023988005998,
"grad_norm": 0.2953495681285858,
"learning_rate": 9.950072986305939e-05,
"loss": 10.1214,
"step": 133
},
{
"epoch": 0.16071964017991006,
"grad_norm": 0.2883056700229645,
"learning_rate": 9.947006697259882e-05,
"loss": 10.1202,
"step": 134
},
{
"epoch": 0.1619190404797601,
"grad_norm": 0.3668458163738251,
"learning_rate": 9.943849535472467e-05,
"loss": 10.1184,
"step": 135
},
{
"epoch": 0.16311844077961019,
"grad_norm": 0.3660871982574463,
"learning_rate": 9.940601558938348e-05,
"loss": 10.1143,
"step": 136
},
{
"epoch": 0.16431784107946026,
"grad_norm": 0.32822495698928833,
"learning_rate": 9.937262827320379e-05,
"loss": 10.1094,
"step": 137
},
{
"epoch": 0.16551724137931034,
"grad_norm": 0.2712533175945282,
"learning_rate": 9.933833401948513e-05,
"loss": 10.1083,
"step": 138
},
{
"epoch": 0.16671664167916042,
"grad_norm": 0.27984705567359924,
"learning_rate": 9.930313345818682e-05,
"loss": 10.1001,
"step": 139
},
{
"epoch": 0.1679160419790105,
"grad_norm": 0.2825195789337158,
"learning_rate": 9.92670272359163e-05,
"loss": 10.0952,
"step": 140
},
{
"epoch": 0.16911544227886058,
"grad_norm": 0.27770835161209106,
"learning_rate": 9.923001601591738e-05,
"loss": 10.0949,
"step": 141
},
{
"epoch": 0.17031484257871066,
"grad_norm": 0.2809567451477051,
"learning_rate": 9.919210047805792e-05,
"loss": 10.0913,
"step": 142
},
{
"epoch": 0.1715142428785607,
"grad_norm": 0.2936723530292511,
"learning_rate": 9.915328131881745e-05,
"loss": 10.0867,
"step": 143
},
{
"epoch": 0.1727136431784108,
"grad_norm": 0.29460567235946655,
"learning_rate": 9.911355925127433e-05,
"loss": 10.0855,
"step": 144
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.2977669835090637,
"learning_rate": 9.907293500509268e-05,
"loss": 10.08,
"step": 145
},
{
"epoch": 0.17511244377811094,
"grad_norm": 0.3061806559562683,
"learning_rate": 9.903140932650891e-05,
"loss": 10.0707,
"step": 146
},
{
"epoch": 0.17631184407796102,
"grad_norm": 0.2858673632144928,
"learning_rate": 9.898898297831807e-05,
"loss": 10.072,
"step": 147
},
{
"epoch": 0.1775112443778111,
"grad_norm": 0.3454626202583313,
"learning_rate": 9.894565673985985e-05,
"loss": 10.071,
"step": 148
},
{
"epoch": 0.17871064467766118,
"grad_norm": 0.5128405690193176,
"learning_rate": 9.890143140700419e-05,
"loss": 10.0703,
"step": 149
},
{
"epoch": 0.17991004497751126,
"grad_norm": 0.5272448658943176,
"learning_rate": 9.885630779213677e-05,
"loss": 10.0602,
"step": 150
},
{
"epoch": 0.1811094452773613,
"grad_norm": 0.27561256289482117,
"learning_rate": 9.881028672414396e-05,
"loss": 10.0577,
"step": 151
},
{
"epoch": 0.1823088455772114,
"grad_norm": 0.2745487093925476,
"learning_rate": 9.876336904839772e-05,
"loss": 10.0557,
"step": 152
},
{
"epoch": 0.18350824587706147,
"grad_norm": 0.2808258831501007,
"learning_rate": 9.871555562673995e-05,
"loss": 10.05,
"step": 153
},
{
"epoch": 0.18470764617691154,
"grad_norm": 0.28502652049064636,
"learning_rate": 9.866684733746679e-05,
"loss": 10.0468,
"step": 154
},
{
"epoch": 0.18590704647676162,
"grad_norm": 0.28735095262527466,
"learning_rate": 9.861724507531233e-05,
"loss": 10.0436,
"step": 155
},
{
"epoch": 0.1871064467766117,
"grad_norm": 0.2824231684207916,
"learning_rate": 9.856674975143236e-05,
"loss": 10.0452,
"step": 156
},
{
"epoch": 0.18830584707646178,
"grad_norm": 0.2854205369949341,
"learning_rate": 9.851536229338747e-05,
"loss": 10.0425,
"step": 157
},
{
"epoch": 0.18950524737631183,
"grad_norm": 0.3004373610019684,
"learning_rate": 9.846308364512606e-05,
"loss": 10.0368,
"step": 158
},
{
"epoch": 0.1907046476761619,
"grad_norm": 0.3045557141304016,
"learning_rate": 9.840991476696706e-05,
"loss": 10.0328,
"step": 159
},
{
"epoch": 0.191904047976012,
"grad_norm": 0.2962295413017273,
"learning_rate": 9.835585663558221e-05,
"loss": 10.0301,
"step": 160
},
{
"epoch": 0.19310344827586207,
"grad_norm": 0.31199753284454346,
"learning_rate": 9.830091024397818e-05,
"loss": 10.0286,
"step": 161
},
{
"epoch": 0.19430284857571214,
"grad_norm": 0.33795246481895447,
"learning_rate": 9.82450766014783e-05,
"loss": 10.0224,
"step": 162
},
{
"epoch": 0.19550224887556222,
"grad_norm": 0.28454065322875977,
"learning_rate": 9.818835673370401e-05,
"loss": 10.0172,
"step": 163
},
{
"epoch": 0.1967016491754123,
"grad_norm": 0.2783207595348358,
"learning_rate": 9.813075168255601e-05,
"loss": 10.0135,
"step": 164
},
{
"epoch": 0.19790104947526238,
"grad_norm": 0.2809619605541229,
"learning_rate": 9.807226250619521e-05,
"loss": 10.0112,
"step": 165
},
{
"epoch": 0.19910044977511243,
"grad_norm": 0.2793205976486206,
"learning_rate": 9.801289027902316e-05,
"loss": 10.0068,
"step": 166
},
{
"epoch": 0.2002998500749625,
"grad_norm": 0.2848748564720154,
"learning_rate": 9.795263609166243e-05,
"loss": 10.0044,
"step": 167
},
{
"epoch": 0.2014992503748126,
"grad_norm": 0.3925122618675232,
"learning_rate": 9.789150105093647e-05,
"loss": 10.0025,
"step": 168
},
{
"epoch": 0.20269865067466267,
"grad_norm": 0.48036912083625793,
"learning_rate": 9.78294862798494e-05,
"loss": 9.9991,
"step": 169
},
{
"epoch": 0.20389805097451275,
"grad_norm": 0.32086822390556335,
"learning_rate": 9.776659291756528e-05,
"loss": 9.9968,
"step": 170
},
{
"epoch": 0.20509745127436282,
"grad_norm": 0.28707683086395264,
"learning_rate": 9.770282211938721e-05,
"loss": 9.9895,
"step": 171
},
{
"epoch": 0.2062968515742129,
"grad_norm": 0.2987452745437622,
"learning_rate": 9.763817505673613e-05,
"loss": 9.9897,
"step": 172
},
{
"epoch": 0.20749625187406298,
"grad_norm": 0.3029066324234009,
"learning_rate": 9.75726529171293e-05,
"loss": 9.9879,
"step": 173
},
{
"epoch": 0.20869565217391303,
"grad_norm": 0.321458637714386,
"learning_rate": 9.750625690415848e-05,
"loss": 9.9815,
"step": 174
},
{
"epoch": 0.2098950524737631,
"grad_norm": 0.35623157024383545,
"learning_rate": 9.74389882374678e-05,
"loss": 9.9831,
"step": 175
},
{
"epoch": 0.2110944527736132,
"grad_norm": 0.27146583795547485,
"learning_rate": 9.737084815273137e-05,
"loss": 9.9741,
"step": 176
},
{
"epoch": 0.21229385307346327,
"grad_norm": 0.2866266071796417,
"learning_rate": 9.730183790163062e-05,
"loss": 9.9692,
"step": 177
},
{
"epoch": 0.21349325337331335,
"grad_norm": 0.28268909454345703,
"learning_rate": 9.72319587518312e-05,
"loss": 9.9681,
"step": 178
},
{
"epoch": 0.21469265367316342,
"grad_norm": 0.2824539244174957,
"learning_rate": 9.716121198695986e-05,
"loss": 9.9671,
"step": 179
},
{
"epoch": 0.2158920539730135,
"grad_norm": 0.2851243317127228,
"learning_rate": 9.708959890658073e-05,
"loss": 9.9606,
"step": 180
},
{
"epoch": 0.21709145427286355,
"grad_norm": 0.28162598609924316,
"learning_rate": 9.701712082617149e-05,
"loss": 9.9617,
"step": 181
},
{
"epoch": 0.21829085457271363,
"grad_norm": 0.28222355246543884,
"learning_rate": 9.69437790770992e-05,
"loss": 9.9595,
"step": 182
},
{
"epoch": 0.2194902548725637,
"grad_norm": 0.32921475172042847,
"learning_rate": 9.68695750065959e-05,
"loss": 9.9553,
"step": 183
},
{
"epoch": 0.2206896551724138,
"grad_norm": 0.4057653248310089,
"learning_rate": 9.679450997773378e-05,
"loss": 9.9576,
"step": 184
},
{
"epoch": 0.22188905547226387,
"grad_norm": 0.37460216879844666,
"learning_rate": 9.67185853694002e-05,
"loss": 9.9495,
"step": 185
},
{
"epoch": 0.22308845577211395,
"grad_norm": 0.28542500734329224,
"learning_rate": 9.66418025762723e-05,
"loss": 9.9535,
"step": 186
},
{
"epoch": 0.22428785607196403,
"grad_norm": 0.3168298304080963,
"learning_rate": 9.656416300879148e-05,
"loss": 9.9461,
"step": 187
},
{
"epoch": 0.2254872563718141,
"grad_norm": 0.2682456076145172,
"learning_rate": 9.648566809313738e-05,
"loss": 9.941,
"step": 188
},
{
"epoch": 0.22668665667166416,
"grad_norm": 0.27266478538513184,
"learning_rate": 9.640631927120177e-05,
"loss": 9.9355,
"step": 189
},
{
"epoch": 0.22788605697151423,
"grad_norm": 0.2777270972728729,
"learning_rate": 9.632611800056201e-05,
"loss": 9.9321,
"step": 190
},
{
"epoch": 0.2290854572713643,
"grad_norm": 0.2846197485923767,
"learning_rate": 9.624506575445429e-05,
"loss": 9.93,
"step": 191
},
{
"epoch": 0.2302848575712144,
"grad_norm": 0.28478533029556274,
"learning_rate": 9.616316402174656e-05,
"loss": 9.9284,
"step": 192
},
{
"epoch": 0.23148425787106447,
"grad_norm": 0.2874702215194702,
"learning_rate": 9.608041430691126e-05,
"loss": 9.9276,
"step": 193
},
{
"epoch": 0.23268365817091455,
"grad_norm": 0.29689356684684753,
"learning_rate": 9.59968181299975e-05,
"loss": 9.9224,
"step": 194
},
{
"epoch": 0.23388305847076463,
"grad_norm": 0.2949802577495575,
"learning_rate": 9.591237702660335e-05,
"loss": 9.9178,
"step": 195
},
{
"epoch": 0.2350824587706147,
"grad_norm": 0.29631638526916504,
"learning_rate": 9.582709254784748e-05,
"loss": 9.9202,
"step": 196
},
{
"epoch": 0.23628185907046476,
"grad_norm": 0.29446399211883545,
"learning_rate": 9.574096626034077e-05,
"loss": 9.9169,
"step": 197
},
{
"epoch": 0.23748125937031483,
"grad_norm": 0.29663321375846863,
"learning_rate": 9.565399974615743e-05,
"loss": 9.9164,
"step": 198
},
{
"epoch": 0.2386806596701649,
"grad_norm": 0.4233105182647705,
"learning_rate": 9.556619460280605e-05,
"loss": 9.9167,
"step": 199
},
{
"epoch": 0.239880059970015,
"grad_norm": 0.902298092842102,
"learning_rate": 9.547755244320012e-05,
"loss": 9.9114,
"step": 200
},
{
"epoch": 0.24107946026986507,
"grad_norm": 0.27147176861763,
"learning_rate": 9.538807489562859e-05,
"loss": 9.9017,
"step": 201
},
{
"epoch": 0.24227886056971515,
"grad_norm": 0.2803528308868408,
"learning_rate": 9.529776360372575e-05,
"loss": 9.8995,
"step": 202
},
{
"epoch": 0.24347826086956523,
"grad_norm": 0.26581478118896484,
"learning_rate": 9.520662022644119e-05,
"loss": 9.9054,
"step": 203
},
{
"epoch": 0.2446776611694153,
"grad_norm": 0.27326396107673645,
"learning_rate": 9.511464643800925e-05,
"loss": 9.8952,
"step": 204
},
{
"epoch": 0.24587706146926536,
"grad_norm": 0.277313768863678,
"learning_rate": 9.502184392791834e-05,
"loss": 9.8951,
"step": 205
},
{
"epoch": 0.24707646176911544,
"grad_norm": 0.27905702590942383,
"learning_rate": 9.492821440087976e-05,
"loss": 9.8936,
"step": 206
},
{
"epoch": 0.2482758620689655,
"grad_norm": 0.28943029046058655,
"learning_rate": 9.48337595767966e-05,
"loss": 9.8861,
"step": 207
},
{
"epoch": 0.2494752623688156,
"grad_norm": 0.2902354896068573,
"learning_rate": 9.473848119073189e-05,
"loss": 9.8864,
"step": 208
},
{
"epoch": 0.25067466266866567,
"grad_norm": 0.2864895462989807,
"learning_rate": 9.4642380992877e-05,
"loss": 9.8891,
"step": 209
},
{
"epoch": 0.2518740629685157,
"grad_norm": 0.29123008251190186,
"learning_rate": 9.454546074851926e-05,
"loss": 9.8855,
"step": 210
},
{
"epoch": 0.25307346326836583,
"grad_norm": 0.30264273285865784,
"learning_rate": 9.44477222380097e-05,
"loss": 9.88,
"step": 211
},
{
"epoch": 0.2542728635682159,
"grad_norm": 0.3108195662498474,
"learning_rate": 9.434916725673024e-05,
"loss": 9.8845,
"step": 212
},
{
"epoch": 0.255472263868066,
"grad_norm": 0.27868786454200745,
"learning_rate": 9.42497976150607e-05,
"loss": 9.8742,
"step": 213
},
{
"epoch": 0.25667166416791604,
"grad_norm": 0.27031615376472473,
"learning_rate": 9.414961513834568e-05,
"loss": 9.8714,
"step": 214
},
{
"epoch": 0.25787106446776614,
"grad_norm": 0.2734402120113373,
"learning_rate": 9.404862166686088e-05,
"loss": 9.8673,
"step": 215
},
{
"epoch": 0.2590704647676162,
"grad_norm": 0.28026947379112244,
"learning_rate": 9.394681905577937e-05,
"loss": 9.8689,
"step": 216
},
{
"epoch": 0.26026986506746624,
"grad_norm": 0.2765568196773529,
"learning_rate": 9.384420917513752e-05,
"loss": 9.871,
"step": 217
},
{
"epoch": 0.26146926536731635,
"grad_norm": 0.28846895694732666,
"learning_rate": 9.374079390980058e-05,
"loss": 9.8626,
"step": 218
},
{
"epoch": 0.2626686656671664,
"grad_norm": 0.28785768151283264,
"learning_rate": 9.363657515942814e-05,
"loss": 9.8594,
"step": 219
},
{
"epoch": 0.2638680659670165,
"grad_norm": 0.28602316975593567,
"learning_rate": 9.353155483843919e-05,
"loss": 9.8568,
"step": 220
},
{
"epoch": 0.26506746626686656,
"grad_norm": 0.2956307530403137,
"learning_rate": 9.342573487597696e-05,
"loss": 9.8599,
"step": 221
},
{
"epoch": 0.26626686656671666,
"grad_norm": 0.28586798906326294,
"learning_rate": 9.331911721587345e-05,
"loss": 9.8601,
"step": 222
},
{
"epoch": 0.2674662668665667,
"grad_norm": 0.30228516459465027,
"learning_rate": 9.321170381661383e-05,
"loss": 9.8549,
"step": 223
},
{
"epoch": 0.26866566716641677,
"grad_norm": 0.3037481904029846,
"learning_rate": 9.310349665130035e-05,
"loss": 9.8593,
"step": 224
},
{
"epoch": 0.2698650674662669,
"grad_norm": 0.3253049850463867,
"learning_rate": 9.299449770761611e-05,
"loss": 9.8551,
"step": 225
},
{
"epoch": 0.2710644677661169,
"grad_norm": 0.26533886790275574,
"learning_rate": 9.288470898778863e-05,
"loss": 9.8453,
"step": 226
},
{
"epoch": 0.27226386806596703,
"grad_norm": 0.2740223705768585,
"learning_rate": 9.277413250855296e-05,
"loss": 9.8406,
"step": 227
},
{
"epoch": 0.2734632683658171,
"grad_norm": 0.27240288257598877,
"learning_rate": 9.266277030111474e-05,
"loss": 9.8468,
"step": 228
},
{
"epoch": 0.2746626686656672,
"grad_norm": 0.33709290623664856,
"learning_rate": 9.255062441111281e-05,
"loss": 9.837,
"step": 229
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.44585081934928894,
"learning_rate": 9.243769689858166e-05,
"loss": 9.8394,
"step": 230
},
{
"epoch": 0.27706146926536734,
"grad_norm": 0.3157913088798523,
"learning_rate": 9.232398983791361e-05,
"loss": 9.8386,
"step": 231
},
{
"epoch": 0.2782608695652174,
"grad_norm": 0.2746964693069458,
"learning_rate": 9.220950531782069e-05,
"loss": 9.8347,
"step": 232
},
{
"epoch": 0.27946026986506745,
"grad_norm": 0.28360387682914734,
"learning_rate": 9.20942454412962e-05,
"loss": 9.8367,
"step": 233
},
{
"epoch": 0.28065967016491755,
"grad_norm": 0.2914506494998932,
"learning_rate": 9.197821232557624e-05,
"loss": 9.8285,
"step": 234
},
{
"epoch": 0.2818590704647676,
"grad_norm": 0.29733598232269287,
"learning_rate": 9.186140810210065e-05,
"loss": 9.8322,
"step": 235
},
{
"epoch": 0.2830584707646177,
"grad_norm": 0.30295151472091675,
"learning_rate": 9.174383491647399e-05,
"loss": 9.8292,
"step": 236
},
{
"epoch": 0.28425787106446776,
"grad_norm": 0.32045435905456543,
"learning_rate": 9.162549492842602e-05,
"loss": 9.8248,
"step": 237
},
{
"epoch": 0.28545727136431787,
"grad_norm": 0.2680381238460541,
"learning_rate": 9.150639031177211e-05,
"loss": 9.8168,
"step": 238
},
{
"epoch": 0.2866566716641679,
"grad_norm": 0.27739235758781433,
"learning_rate": 9.138652325437324e-05,
"loss": 9.8155,
"step": 239
},
{
"epoch": 0.28785607196401797,
"grad_norm": 0.271766722202301,
"learning_rate": 9.12658959580959e-05,
"loss": 9.8195,
"step": 240
},
{
"epoch": 0.2890554722638681,
"grad_norm": 0.28111475706100464,
"learning_rate": 9.114451063877151e-05,
"loss": 9.8112,
"step": 241
},
{
"epoch": 0.2902548725637181,
"grad_norm": 0.27953851222991943,
"learning_rate": 9.102236952615589e-05,
"loss": 9.814,
"step": 242
},
{
"epoch": 0.29145427286356823,
"grad_norm": 0.27629002928733826,
"learning_rate": 9.08994748638881e-05,
"loss": 9.8131,
"step": 243
},
{
"epoch": 0.2926536731634183,
"grad_norm": 0.2811156213283539,
"learning_rate": 9.077582890944945e-05,
"loss": 9.8045,
"step": 244
},
{
"epoch": 0.2938530734632684,
"grad_norm": 0.3269944489002228,
"learning_rate": 9.065143393412179e-05,
"loss": 9.8066,
"step": 245
},
{
"epoch": 0.29505247376311844,
"grad_norm": 0.3400765657424927,
"learning_rate": 9.052629222294604e-05,
"loss": 9.8138,
"step": 246
},
{
"epoch": 0.2962518740629685,
"grad_norm": 0.28989219665527344,
"learning_rate": 9.040040607467999e-05,
"loss": 9.8014,
"step": 247
},
{
"epoch": 0.2974512743628186,
"grad_norm": 0.33483076095581055,
"learning_rate": 9.02737778017562e-05,
"loss": 9.8082,
"step": 248
},
{
"epoch": 0.29865067466266865,
"grad_norm": 0.2836906909942627,
"learning_rate": 9.014640973023951e-05,
"loss": 9.8131,
"step": 249
},
{
"epoch": 0.29985007496251875,
"grad_norm": 0.3271000385284424,
"learning_rate": 9.00183041997843e-05,
"loss": 9.7969,
"step": 250
},
{
"epoch": 0.3010494752623688,
"grad_norm": 0.2659075856208801,
"learning_rate": 8.988946356359146e-05,
"loss": 9.7947,
"step": 251
},
{
"epoch": 0.3022488755622189,
"grad_norm": 0.2756604552268982,
"learning_rate": 8.97598901883653e-05,
"loss": 9.7903,
"step": 252
},
{
"epoch": 0.30344827586206896,
"grad_norm": 0.2782731354236603,
"learning_rate": 8.962958645426989e-05,
"loss": 9.7927,
"step": 253
},
{
"epoch": 0.30464767616191907,
"grad_norm": 0.28496497869491577,
"learning_rate": 8.949855475488549e-05,
"loss": 9.788,
"step": 254
},
{
"epoch": 0.3058470764617691,
"grad_norm": 0.2814723253250122,
"learning_rate": 8.936679749716452e-05,
"loss": 9.7867,
"step": 255
},
{
"epoch": 0.30704647676161917,
"grad_norm": 0.27949169278144836,
"learning_rate": 8.923431710138734e-05,
"loss": 9.7937,
"step": 256
},
{
"epoch": 0.3082458770614693,
"grad_norm": 0.2898804843425751,
"learning_rate": 8.910111600111785e-05,
"loss": 9.783,
"step": 257
},
{
"epoch": 0.3094452773613193,
"grad_norm": 0.2944411337375641,
"learning_rate": 8.896719664315867e-05,
"loss": 9.7809,
"step": 258
},
{
"epoch": 0.31064467766116943,
"grad_norm": 0.29111993312835693,
"learning_rate": 8.883256148750633e-05,
"loss": 9.7834,
"step": 259
},
{
"epoch": 0.3118440779610195,
"grad_norm": 0.3007718324661255,
"learning_rate": 8.869721300730596e-05,
"loss": 9.7882,
"step": 260
},
{
"epoch": 0.3130434782608696,
"grad_norm": 0.28827887773513794,
"learning_rate": 8.856115368880598e-05,
"loss": 9.7902,
"step": 261
},
{
"epoch": 0.31424287856071964,
"grad_norm": 0.31684109568595886,
"learning_rate": 8.842438603131232e-05,
"loss": 9.7778,
"step": 262
},
{
"epoch": 0.3154422788605697,
"grad_norm": 0.32087424397468567,
"learning_rate": 8.828691254714259e-05,
"loss": 9.7689,
"step": 263
},
{
"epoch": 0.3166416791604198,
"grad_norm": 0.27183249592781067,
"learning_rate": 8.814873576157987e-05,
"loss": 9.7738,
"step": 264
},
{
"epoch": 0.31784107946026985,
"grad_norm": 0.27800437808036804,
"learning_rate": 8.800985821282637e-05,
"loss": 9.7711,
"step": 265
},
{
"epoch": 0.31904047976011995,
"grad_norm": 0.28154927492141724,
"learning_rate": 8.787028245195676e-05,
"loss": 9.7662,
"step": 266
},
{
"epoch": 0.32023988005997,
"grad_norm": 0.2830137014389038,
"learning_rate": 8.773001104287137e-05,
"loss": 9.767,
"step": 267
},
{
"epoch": 0.3214392803598201,
"grad_norm": 0.27607715129852295,
"learning_rate": 8.758904656224904e-05,
"loss": 9.7658,
"step": 268
},
{
"epoch": 0.32263868065967016,
"grad_norm": 0.2993113696575165,
"learning_rate": 8.744739159949981e-05,
"loss": 9.7659,
"step": 269
},
{
"epoch": 0.3238380809595202,
"grad_norm": 0.316902220249176,
"learning_rate": 8.730504875671732e-05,
"loss": 9.7573,
"step": 270
},
{
"epoch": 0.3250374812593703,
"grad_norm": 0.3067699670791626,
"learning_rate": 8.716202064863111e-05,
"loss": 9.7598,
"step": 271
},
{
"epoch": 0.32623688155922037,
"grad_norm": 0.3003675937652588,
"learning_rate": 8.701830990255843e-05,
"loss": 9.7639,
"step": 272
},
{
"epoch": 0.3274362818590705,
"grad_norm": 0.2981228232383728,
"learning_rate": 8.687391915835616e-05,
"loss": 9.7576,
"step": 273
},
{
"epoch": 0.32863568215892053,
"grad_norm": 0.2995956242084503,
"learning_rate": 8.672885106837216e-05,
"loss": 9.7714,
"step": 274
},
{
"epoch": 0.32983508245877063,
"grad_norm": 0.30962345004081726,
"learning_rate": 8.658310829739665e-05,
"loss": 9.7645,
"step": 275
},
{
"epoch": 0.3310344827586207,
"grad_norm": 0.26187556982040405,
"learning_rate": 8.643669352261321e-05,
"loss": 9.7506,
"step": 276
},
{
"epoch": 0.3322338830584708,
"grad_norm": 0.276991605758667,
"learning_rate": 8.628960943354965e-05,
"loss": 9.7492,
"step": 277
},
{
"epoch": 0.33343328335832084,
"grad_norm": 0.2857518196105957,
"learning_rate": 8.614185873202851e-05,
"loss": 9.7469,
"step": 278
},
{
"epoch": 0.3346326836581709,
"grad_norm": 0.28504660725593567,
"learning_rate": 8.599344413211755e-05,
"loss": 9.7518,
"step": 279
},
{
"epoch": 0.335832083958021,
"grad_norm": 0.27488988637924194,
"learning_rate": 8.584436836007981e-05,
"loss": 9.7501,
"step": 280
},
{
"epoch": 0.33703148425787105,
"grad_norm": 0.29049110412597656,
"learning_rate": 8.569463415432356e-05,
"loss": 9.7418,
"step": 281
},
{
"epoch": 0.33823088455772116,
"grad_norm": 0.2897820472717285,
"learning_rate": 8.554424426535201e-05,
"loss": 9.7481,
"step": 282
},
{
"epoch": 0.3394302848575712,
"grad_norm": 0.28421247005462646,
"learning_rate": 8.539320145571276e-05,
"loss": 9.7456,
"step": 283
},
{
"epoch": 0.3406296851574213,
"grad_norm": 0.28690865635871887,
"learning_rate": 8.524150849994707e-05,
"loss": 9.7501,
"step": 284
},
{
"epoch": 0.34182908545727136,
"grad_norm": 0.2818574905395508,
"learning_rate": 8.50891681845389e-05,
"loss": 9.7422,
"step": 285
},
{
"epoch": 0.3430284857571214,
"grad_norm": 0.5660536885261536,
"learning_rate": 8.493618330786365e-05,
"loss": 9.7497,
"step": 286
},
{
"epoch": 0.3442278860569715,
"grad_norm": 0.31119802594184875,
"learning_rate": 8.47825566801369e-05,
"loss": 9.7429,
"step": 287
},
{
"epoch": 0.3454272863568216,
"grad_norm": 0.2645992636680603,
"learning_rate": 8.462829112336266e-05,
"loss": 9.7354,
"step": 288
},
{
"epoch": 0.3466266866566717,
"grad_norm": 0.27782142162323,
"learning_rate": 8.44733894712816e-05,
"loss": 9.7309,
"step": 289
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.27114003896713257,
"learning_rate": 8.431785456931898e-05,
"loss": 9.7329,
"step": 290
},
{
"epoch": 0.34902548725637184,
"grad_norm": 0.27776530385017395,
"learning_rate": 8.416168927453236e-05,
"loss": 9.7294,
"step": 291
},
{
"epoch": 0.3502248875562219,
"grad_norm": 0.2819390594959259,
"learning_rate": 8.400489645555914e-05,
"loss": 9.7324,
"step": 292
},
{
"epoch": 0.35142428785607194,
"grad_norm": 0.2786363363265991,
"learning_rate": 8.384747899256386e-05,
"loss": 9.7327,
"step": 293
},
{
"epoch": 0.35262368815592204,
"grad_norm": 0.29060226678848267,
"learning_rate": 8.368943977718528e-05,
"loss": 9.7265,
"step": 294
},
{
"epoch": 0.3538230884557721,
"grad_norm": 0.28789106011390686,
"learning_rate": 8.353078171248335e-05,
"loss": 9.7269,
"step": 295
},
{
"epoch": 0.3550224887556222,
"grad_norm": 0.28383123874664307,
"learning_rate": 8.337150771288572e-05,
"loss": 9.7357,
"step": 296
},
{
"epoch": 0.35622188905547225,
"grad_norm": 0.28761202096939087,
"learning_rate": 8.32116207041343e-05,
"loss": 9.7277,
"step": 297
},
{
"epoch": 0.35742128935532236,
"grad_norm": 0.29686328768730164,
"learning_rate": 8.30511236232316e-05,
"loss": 9.7278,
"step": 298
},
{
"epoch": 0.3586206896551724,
"grad_norm": 0.3019421100616455,
"learning_rate": 8.289001941838659e-05,
"loss": 9.7348,
"step": 299
},
{
"epoch": 0.3598200899550225,
"grad_norm": 0.3201374411582947,
"learning_rate": 8.27283110489607e-05,
"loss": 9.7275,
"step": 300
},
{
"epoch": 0.36101949025487257,
"grad_norm": 0.2733359932899475,
"learning_rate": 8.256600148541339e-05,
"loss": 9.7121,
"step": 301
},
{
"epoch": 0.3622188905547226,
"grad_norm": 0.2780385911464691,
"learning_rate": 8.240309370924759e-05,
"loss": 9.7179,
"step": 302
},
{
"epoch": 0.3634182908545727,
"grad_norm": 0.27753978967666626,
"learning_rate": 8.223959071295493e-05,
"loss": 9.7121,
"step": 303
},
{
"epoch": 0.3646176911544228,
"grad_norm": 0.2738651633262634,
"learning_rate": 8.207549549996083e-05,
"loss": 9.7152,
"step": 304
},
{
"epoch": 0.3658170914542729,
"grad_norm": 0.4075029790401459,
"learning_rate": 8.191081108456921e-05,
"loss": 9.7168,
"step": 305
},
{
"epoch": 0.36701649175412293,
"grad_norm": 0.35438272356987,
"learning_rate": 8.174554049190725e-05,
"loss": 9.7143,
"step": 306
},
{
"epoch": 0.36821589205397304,
"grad_norm": 0.46225133538246155,
"learning_rate": 8.157968675786972e-05,
"loss": 9.7133,
"step": 307
},
{
"epoch": 0.3694152923538231,
"grad_norm": 0.2845197319984436,
"learning_rate": 8.141325292906326e-05,
"loss": 9.7149,
"step": 308
},
{
"epoch": 0.37061469265367314,
"grad_norm": 0.29232627153396606,
"learning_rate": 8.12462420627504e-05,
"loss": 9.7107,
"step": 309
},
{
"epoch": 0.37181409295352325,
"grad_norm": 0.28868958353996277,
"learning_rate": 8.107865722679347e-05,
"loss": 9.7176,
"step": 310
},
{
"epoch": 0.3730134932533733,
"grad_norm": 0.3159126341342926,
"learning_rate": 8.091050149959808e-05,
"loss": 9.713,
"step": 311
},
{
"epoch": 0.3742128935532234,
"grad_norm": 0.3219504952430725,
"learning_rate": 8.074177797005678e-05,
"loss": 9.7166,
"step": 312
},
{
"epoch": 0.37541229385307345,
"grad_norm": 0.2772824168205261,
"learning_rate": 8.057248973749215e-05,
"loss": 9.7027,
"step": 313
},
{
"epoch": 0.37661169415292356,
"grad_norm": 0.2774364948272705,
"learning_rate": 8.040263991159995e-05,
"loss": 9.7026,
"step": 314
},
{
"epoch": 0.3778110944527736,
"grad_norm": 0.2747974693775177,
"learning_rate": 8.0232231612392e-05,
"loss": 9.702,
"step": 315
},
{
"epoch": 0.37901049475262366,
"grad_norm": 0.2756046652793884,
"learning_rate": 8.006126797013883e-05,
"loss": 9.7022,
"step": 316
},
{
"epoch": 0.38020989505247377,
"grad_norm": 0.269083172082901,
"learning_rate": 7.98897521253122e-05,
"loss": 9.7024,
"step": 317
},
{
"epoch": 0.3814092953523238,
"grad_norm": 0.2777722477912903,
"learning_rate": 7.97176872285274e-05,
"loss": 9.7029,
"step": 318
},
{
"epoch": 0.3826086956521739,
"grad_norm": 0.2875417172908783,
"learning_rate": 7.954507644048544e-05,
"loss": 9.7008,
"step": 319
},
{
"epoch": 0.383808095952024,
"grad_norm": 0.29414165019989014,
"learning_rate": 7.937192293191485e-05,
"loss": 9.7004,
"step": 320
},
{
"epoch": 0.3850074962518741,
"grad_norm": 0.2859031558036804,
"learning_rate": 7.919822988351357e-05,
"loss": 9.7048,
"step": 321
},
{
"epoch": 0.38620689655172413,
"grad_norm": 0.2967624068260193,
"learning_rate": 7.902400048589051e-05,
"loss": 9.7018,
"step": 322
},
{
"epoch": 0.38740629685157424,
"grad_norm": 0.40655517578125,
"learning_rate": 7.884923793950685e-05,
"loss": 9.693,
"step": 323
},
{
"epoch": 0.3886056971514243,
"grad_norm": 0.3629460632801056,
"learning_rate": 7.86739454546173e-05,
"loss": 9.7021,
"step": 324
},
{
"epoch": 0.38980509745127434,
"grad_norm": 0.3573906421661377,
"learning_rate": 7.84981262512112e-05,
"loss": 9.7026,
"step": 325
},
{
"epoch": 0.39100449775112445,
"grad_norm": 0.2747887969017029,
"learning_rate": 7.832178355895326e-05,
"loss": 9.6855,
"step": 326
},
{
"epoch": 0.3922038980509745,
"grad_norm": 0.27436476945877075,
"learning_rate": 7.814492061712428e-05,
"loss": 9.6864,
"step": 327
},
{
"epoch": 0.3934032983508246,
"grad_norm": 0.2805567681789398,
"learning_rate": 7.796754067456168e-05,
"loss": 9.6899,
"step": 328
},
{
"epoch": 0.39460269865067465,
"grad_norm": 0.2744491696357727,
"learning_rate": 7.778964698959972e-05,
"loss": 9.6882,
"step": 329
},
{
"epoch": 0.39580209895052476,
"grad_norm": 0.2762869894504547,
"learning_rate": 7.761124283000983e-05,
"loss": 9.6909,
"step": 330
},
{
"epoch": 0.3970014992503748,
"grad_norm": 0.27481362223625183,
"learning_rate": 7.743233147294035e-05,
"loss": 9.6929,
"step": 331
},
{
"epoch": 0.39820089955022486,
"grad_norm": 0.28461942076683044,
"learning_rate": 7.725291620485653e-05,
"loss": 9.6901,
"step": 332
},
{
"epoch": 0.39940029985007497,
"grad_norm": 0.2874203026294708,
"learning_rate": 7.707300032148004e-05,
"loss": 9.6879,
"step": 333
},
{
"epoch": 0.400599700149925,
"grad_norm": 0.2960827052593231,
"learning_rate": 7.689258712772851e-05,
"loss": 9.6883,
"step": 334
},
{
"epoch": 0.4017991004497751,
"grad_norm": 0.2913392186164856,
"learning_rate": 7.671167993765474e-05,
"loss": 9.6886,
"step": 335
},
{
"epoch": 0.4029985007496252,
"grad_norm": 0.2986817955970764,
"learning_rate": 7.653028207438589e-05,
"loss": 9.6875,
"step": 336
},
{
"epoch": 0.4041979010494753,
"grad_norm": 0.31125518679618835,
"learning_rate": 7.634839687006242e-05,
"loss": 9.693,
"step": 337
},
{
"epoch": 0.40539730134932533,
"grad_norm": 0.27948254346847534,
"learning_rate": 7.616602766577683e-05,
"loss": 9.677,
"step": 338
},
{
"epoch": 0.4065967016491754,
"grad_norm": 0.2667854428291321,
"learning_rate": 7.59831778115124e-05,
"loss": 9.6728,
"step": 339
},
{
"epoch": 0.4077961019490255,
"grad_norm": 0.26580169796943665,
"learning_rate": 7.579985066608153e-05,
"loss": 9.6734,
"step": 340
},
{
"epoch": 0.40899550224887554,
"grad_norm": 0.27677300572395325,
"learning_rate": 7.56160495970641e-05,
"loss": 9.6744,
"step": 341
},
{
"epoch": 0.41019490254872565,
"grad_norm": 0.28340858221054077,
"learning_rate": 7.543177798074564e-05,
"loss": 9.6755,
"step": 342
},
{
"epoch": 0.4113943028485757,
"grad_norm": 0.28086498379707336,
"learning_rate": 7.52470392020552e-05,
"loss": 9.6741,
"step": 343
},
{
"epoch": 0.4125937031484258,
"grad_norm": 0.2807992100715637,
"learning_rate": 7.506183665450336e-05,
"loss": 9.6789,
"step": 344
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.27423274517059326,
"learning_rate": 7.487617374011968e-05,
"loss": 9.6791,
"step": 345
},
{
"epoch": 0.41499250374812596,
"grad_norm": 0.2901366353034973,
"learning_rate": 7.469005386939036e-05,
"loss": 9.6742,
"step": 346
},
{
"epoch": 0.416191904047976,
"grad_norm": 0.33871832489967346,
"learning_rate": 7.45034804611955e-05,
"loss": 9.6731,
"step": 347
},
{
"epoch": 0.41739130434782606,
"grad_norm": 0.3808429539203644,
"learning_rate": 7.43164569427464e-05,
"loss": 9.6811,
"step": 348
},
{
"epoch": 0.41859070464767617,
"grad_norm": 0.37340763211250305,
"learning_rate": 7.412898674952248e-05,
"loss": 9.6826,
"step": 349
},
{
"epoch": 0.4197901049475262,
"grad_norm": 0.31507405638694763,
"learning_rate": 7.394107332520828e-05,
"loss": 9.6792,
"step": 350
},
{
"epoch": 0.4209895052473763,
"grad_norm": 0.2747836410999298,
"learning_rate": 7.37527201216301e-05,
"loss": 9.6618,
"step": 351
},
{
"epoch": 0.4221889055472264,
"grad_norm": 0.26785317063331604,
"learning_rate": 7.356393059869272e-05,
"loss": 9.668,
"step": 352
},
{
"epoch": 0.4233883058470765,
"grad_norm": 0.27754732966423035,
"learning_rate": 7.337470822431572e-05,
"loss": 9.6617,
"step": 353
},
{
"epoch": 0.42458770614692654,
"grad_norm": 0.2815973460674286,
"learning_rate": 7.318505647436986e-05,
"loss": 9.6655,
"step": 354
},
{
"epoch": 0.4257871064467766,
"grad_norm": 0.27644169330596924,
"learning_rate": 7.299497883261319e-05,
"loss": 9.6683,
"step": 355
},
{
"epoch": 0.4269865067466267,
"grad_norm": 0.27770036458969116,
"learning_rate": 7.28044787906271e-05,
"loss": 9.6686,
"step": 356
},
{
"epoch": 0.42818590704647674,
"grad_norm": 0.28763288259506226,
"learning_rate": 7.261355984775208e-05,
"loss": 9.6643,
"step": 357
},
{
"epoch": 0.42938530734632685,
"grad_norm": 0.28251275420188904,
"learning_rate": 7.242222551102356e-05,
"loss": 9.6609,
"step": 358
},
{
"epoch": 0.4305847076461769,
"grad_norm": 0.29022759199142456,
"learning_rate": 7.223047929510743e-05,
"loss": 9.6656,
"step": 359
},
{
"epoch": 0.431784107946027,
"grad_norm": 0.2947325110435486,
"learning_rate": 7.20383247222355e-05,
"loss": 9.666,
"step": 360
},
{
"epoch": 0.43298350824587706,
"grad_norm": 0.29398787021636963,
"learning_rate": 7.184576532214077e-05,
"loss": 9.6692,
"step": 361
},
{
"epoch": 0.4341829085457271,
"grad_norm": 0.30600595474243164,
"learning_rate": 7.16528046319926e-05,
"loss": 9.6675,
"step": 362
},
{
"epoch": 0.4353823088455772,
"grad_norm": 0.26775041222572327,
"learning_rate": 7.145944619633176e-05,
"loss": 9.6627,
"step": 363
},
{
"epoch": 0.43658170914542727,
"grad_norm": 0.2672569155693054,
"learning_rate": 7.126569356700529e-05,
"loss": 9.6575,
"step": 364
},
{
"epoch": 0.43778110944527737,
"grad_norm": 0.2710895538330078,
"learning_rate": 7.107155030310126e-05,
"loss": 9.6538,
"step": 365
},
{
"epoch": 0.4389805097451274,
"grad_norm": 0.2715386152267456,
"learning_rate": 7.087701997088345e-05,
"loss": 9.6533,
"step": 366
},
{
"epoch": 0.44017991004497753,
"grad_norm": 0.2757709324359894,
"learning_rate": 7.068210614372568e-05,
"loss": 9.6559,
"step": 367
},
{
"epoch": 0.4413793103448276,
"grad_norm": 0.27887001633644104,
"learning_rate": 7.048681240204641e-05,
"loss": 9.6604,
"step": 368
},
{
"epoch": 0.4425787106446777,
"grad_norm": 0.2874414920806885,
"learning_rate": 7.029114233324276e-05,
"loss": 9.6537,
"step": 369
},
{
"epoch": 0.44377811094452774,
"grad_norm": 0.2877376079559326,
"learning_rate": 7.009509953162471e-05,
"loss": 9.6594,
"step": 370
},
{
"epoch": 0.4449775112443778,
"grad_norm": 0.2886502146720886,
"learning_rate": 6.989868759834908e-05,
"loss": 9.6522,
"step": 371
},
{
"epoch": 0.4461769115442279,
"grad_norm": 0.28471824526786804,
"learning_rate": 6.97019101413533e-05,
"loss": 9.6611,
"step": 372
},
{
"epoch": 0.44737631184407795,
"grad_norm": 0.2849258780479431,
"learning_rate": 6.950477077528926e-05,
"loss": 9.6583,
"step": 373
},
{
"epoch": 0.44857571214392805,
"grad_norm": 0.3675067126750946,
"learning_rate": 6.93072731214568e-05,
"loss": 9.6677,
"step": 374
},
{
"epoch": 0.4497751124437781,
"grad_norm": 0.6856014728546143,
"learning_rate": 6.910942080773724e-05,
"loss": 9.6579,
"step": 375
},
{
"epoch": 0.4509745127436282,
"grad_norm": 0.27253374457359314,
"learning_rate": 6.891121746852674e-05,
"loss": 9.6466,
"step": 376
},
{
"epoch": 0.45217391304347826,
"grad_norm": 0.26753801107406616,
"learning_rate": 6.871266674466955e-05,
"loss": 9.6491,
"step": 377
},
{
"epoch": 0.4533733133433283,
"grad_norm": 0.2716609239578247,
"learning_rate": 6.851377228339106e-05,
"loss": 9.6484,
"step": 378
},
{
"epoch": 0.4545727136431784,
"grad_norm": 0.2831350266933441,
"learning_rate": 6.831453773823091e-05,
"loss": 9.6464,
"step": 379
},
{
"epoch": 0.45577211394302847,
"grad_norm": 0.28324607014656067,
"learning_rate": 6.811496676897578e-05,
"loss": 9.6475,
"step": 380
},
{
"epoch": 0.4569715142428786,
"grad_norm": 0.27571067214012146,
"learning_rate": 6.791506304159221e-05,
"loss": 9.645,
"step": 381
},
{
"epoch": 0.4581709145427286,
"grad_norm": 0.28332218527793884,
"learning_rate": 6.771483022815925e-05,
"loss": 9.6559,
"step": 382
},
{
"epoch": 0.45937031484257873,
"grad_norm": 0.2815491855144501,
"learning_rate": 6.751427200680108e-05,
"loss": 9.6518,
"step": 383
},
{
"epoch": 0.4605697151424288,
"grad_norm": 0.28399744629859924,
"learning_rate": 6.731339206161928e-05,
"loss": 9.6512,
"step": 384
},
{
"epoch": 0.4617691154422789,
"grad_norm": 0.288397878408432,
"learning_rate": 6.711219408262527e-05,
"loss": 9.6452,
"step": 385
},
{
"epoch": 0.46296851574212894,
"grad_norm": 0.29081466794013977,
"learning_rate": 6.691068176567257e-05,
"loss": 9.66,
"step": 386
},
{
"epoch": 0.464167916041979,
"grad_norm": 0.29959914088249207,
"learning_rate": 6.670885881238877e-05,
"loss": 9.6601,
"step": 387
},
{
"epoch": 0.4653673163418291,
"grad_norm": 0.27491918206214905,
"learning_rate": 6.650672893010768e-05,
"loss": 9.6448,
"step": 388
},
{
"epoch": 0.46656671664167915,
"grad_norm": 0.2780735194683075,
"learning_rate": 6.630429583180112e-05,
"loss": 9.6355,
"step": 389
},
{
"epoch": 0.46776611694152925,
"grad_norm": 0.2666113078594208,
"learning_rate": 6.610156323601075e-05,
"loss": 9.6384,
"step": 390
},
{
"epoch": 0.4689655172413793,
"grad_norm": 0.2784653902053833,
"learning_rate": 6.589853486677981e-05,
"loss": 9.6384,
"step": 391
},
{
"epoch": 0.4701649175412294,
"grad_norm": 0.28066155314445496,
"learning_rate": 6.569521445358464e-05,
"loss": 9.6417,
"step": 392
},
{
"epoch": 0.47136431784107946,
"grad_norm": 0.27688291668891907,
"learning_rate": 6.549160573126623e-05,
"loss": 9.6387,
"step": 393
},
{
"epoch": 0.4725637181409295,
"grad_norm": 0.279243141412735,
"learning_rate": 6.528771243996157e-05,
"loss": 9.645,
"step": 394
},
{
"epoch": 0.4737631184407796,
"grad_norm": 0.2818412184715271,
"learning_rate": 6.508353832503494e-05,
"loss": 9.6442,
"step": 395
},
{
"epoch": 0.47496251874062967,
"grad_norm": 0.2805486023426056,
"learning_rate": 6.48790871370092e-05,
"loss": 9.6417,
"step": 396
},
{
"epoch": 0.4761619190404798,
"grad_norm": 0.2866521179676056,
"learning_rate": 6.467436263149678e-05,
"loss": 9.6496,
"step": 397
},
{
"epoch": 0.4773613193403298,
"grad_norm": 0.29199638962745667,
"learning_rate": 6.446936856913078e-05,
"loss": 9.6433,
"step": 398
},
{
"epoch": 0.47856071964017993,
"grad_norm": 0.29411137104034424,
"learning_rate": 6.426410871549581e-05,
"loss": 9.6499,
"step": 399
},
{
"epoch": 0.47976011994003,
"grad_norm": 0.3119426369667053,
"learning_rate": 6.405858684105892e-05,
"loss": 9.655,
"step": 400
},
{
"epoch": 0.48095952023988003,
"grad_norm": 0.2667071521282196,
"learning_rate": 6.385280672110024e-05,
"loss": 9.6329,
"step": 401
},
{
"epoch": 0.48215892053973014,
"grad_norm": 0.2752053737640381,
"learning_rate": 6.364677213564365e-05,
"loss": 9.6306,
"step": 402
},
{
"epoch": 0.4833583208395802,
"grad_norm": 0.27557632327079773,
"learning_rate": 6.344048686938745e-05,
"loss": 9.6324,
"step": 403
},
{
"epoch": 0.4845577211394303,
"grad_norm": 0.2749324142932892,
"learning_rate": 6.323395471163467e-05,
"loss": 9.639,
"step": 404
},
{
"epoch": 0.48575712143928035,
"grad_norm": 0.27776116132736206,
"learning_rate": 6.30271794562236e-05,
"loss": 9.6358,
"step": 405
},
{
"epoch": 0.48695652173913045,
"grad_norm": 0.27685850858688354,
"learning_rate": 6.282016490145803e-05,
"loss": 9.6354,
"step": 406
},
{
"epoch": 0.4881559220389805,
"grad_norm": 0.2896622121334076,
"learning_rate": 6.261291485003751e-05,
"loss": 9.6398,
"step": 407
},
{
"epoch": 0.4893553223388306,
"grad_norm": 0.34289979934692383,
"learning_rate": 6.240543310898746e-05,
"loss": 9.6447,
"step": 408
},
{
"epoch": 0.49055472263868066,
"grad_norm": 0.30854368209838867,
"learning_rate": 6.219772348958927e-05,
"loss": 9.6312,
"step": 409
},
{
"epoch": 0.4917541229385307,
"grad_norm": 0.3343330919742584,
"learning_rate": 6.198978980731034e-05,
"loss": 9.6383,
"step": 410
},
{
"epoch": 0.4929535232383808,
"grad_norm": 0.2979169487953186,
"learning_rate": 6.178163588173381e-05,
"loss": 9.6352,
"step": 411
},
{
"epoch": 0.49415292353823087,
"grad_norm": 0.29163146018981934,
"learning_rate": 6.157326553648862e-05,
"loss": 9.6349,
"step": 412
},
{
"epoch": 0.495352323838081,
"grad_norm": 0.32137271761894226,
"learning_rate": 6.136468259917917e-05,
"loss": 9.6287,
"step": 413
},
{
"epoch": 0.496551724137931,
"grad_norm": 0.26861318945884705,
"learning_rate": 6.115589090131497e-05,
"loss": 9.6261,
"step": 414
},
{
"epoch": 0.49775112443778113,
"grad_norm": 0.2670891582965851,
"learning_rate": 6.094689427824031e-05,
"loss": 9.6272,
"step": 415
},
{
"epoch": 0.4989505247376312,
"grad_norm": 0.2772047519683838,
"learning_rate": 6.073769656906385e-05,
"loss": 9.6257,
"step": 416
},
{
"epoch": 0.5001499250374812,
"grad_norm": 0.27719056606292725,
"learning_rate": 6.052830161658799e-05,
"loss": 9.6287,
"step": 417
},
{
"epoch": 0.5013493253373313,
"grad_norm": 0.27757909893989563,
"learning_rate": 6.031871326723837e-05,
"loss": 9.6331,
"step": 418
},
{
"epoch": 0.5025487256371814,
"grad_norm": 0.28167879581451416,
"learning_rate": 6.010893537099316e-05,
"loss": 9.6289,
"step": 419
},
{
"epoch": 0.5037481259370314,
"grad_norm": 0.28175151348114014,
"learning_rate": 5.9898971781312384e-05,
"loss": 9.6342,
"step": 420
},
{
"epoch": 0.5049475262368815,
"grad_norm": 0.2766707241535187,
"learning_rate": 5.9688826355067105e-05,
"loss": 9.6337,
"step": 421
},
{
"epoch": 0.5061469265367317,
"grad_norm": 0.29078051447868347,
"learning_rate": 5.9478502952468595e-05,
"loss": 9.6292,
"step": 422
},
{
"epoch": 0.5073463268365818,
"grad_norm": 0.2969301640987396,
"learning_rate": 5.92680054369974e-05,
"loss": 9.6297,
"step": 423
},
{
"epoch": 0.5085457271364318,
"grad_norm": 0.29746097326278687,
"learning_rate": 5.905733767533238e-05,
"loss": 9.6367,
"step": 424
},
{
"epoch": 0.5097451274362819,
"grad_norm": 0.32283881306648254,
"learning_rate": 5.8846503537279715e-05,
"loss": 9.6347,
"step": 425
},
{
"epoch": 0.510944527736132,
"grad_norm": 0.2667362689971924,
"learning_rate": 5.863550689570179e-05,
"loss": 9.6198,
"step": 426
},
{
"epoch": 0.512143928035982,
"grad_norm": 0.274853378534317,
"learning_rate": 5.842435162644601e-05,
"loss": 9.6217,
"step": 427
},
{
"epoch": 0.5133433283358321,
"grad_norm": 0.2751438319683075,
"learning_rate": 5.821304160827371e-05,
"loss": 9.6246,
"step": 428
},
{
"epoch": 0.5145427286356822,
"grad_norm": 0.279313325881958,
"learning_rate": 5.8001580722788795e-05,
"loss": 9.6222,
"step": 429
},
{
"epoch": 0.5157421289355323,
"grad_norm": 0.27348318696022034,
"learning_rate": 5.7789972854366536e-05,
"loss": 9.6226,
"step": 430
},
{
"epoch": 0.5169415292353823,
"grad_norm": 0.2807561457157135,
"learning_rate": 5.757822189008214e-05,
"loss": 9.6246,
"step": 431
},
{
"epoch": 0.5181409295352324,
"grad_norm": 0.28267139196395874,
"learning_rate": 5.7366331719639366e-05,
"loss": 9.6234,
"step": 432
},
{
"epoch": 0.5193403298350825,
"grad_norm": 0.28372693061828613,
"learning_rate": 5.715430623529909e-05,
"loss": 9.6304,
"step": 433
},
{
"epoch": 0.5205397301349325,
"grad_norm": 0.2916060984134674,
"learning_rate": 5.6942149331807836e-05,
"loss": 9.6256,
"step": 434
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.30614909529685974,
"learning_rate": 5.6729864906326136e-05,
"loss": 9.6258,
"step": 435
},
{
"epoch": 0.5229385307346327,
"grad_norm": 0.32992836833000183,
"learning_rate": 5.651745685835707e-05,
"loss": 9.6317,
"step": 436
},
{
"epoch": 0.5241379310344828,
"grad_norm": 0.3490801751613617,
"learning_rate": 5.630492908967451e-05,
"loss": 9.6334,
"step": 437
},
{
"epoch": 0.5253373313343328,
"grad_norm": 0.2723431885242462,
"learning_rate": 5.609228550425154e-05,
"loss": 9.6261,
"step": 438
},
{
"epoch": 0.5265367316341829,
"grad_norm": 0.26900023221969604,
"learning_rate": 5.5879530008188716e-05,
"loss": 9.6217,
"step": 439
},
{
"epoch": 0.527736131934033,
"grad_norm": 0.26944032311439514,
"learning_rate": 5.566666650964228e-05,
"loss": 9.6232,
"step": 440
},
{
"epoch": 0.528935532233883,
"grad_norm": 0.2752003073692322,
"learning_rate": 5.545369891875241e-05,
"loss": 9.6213,
"step": 441
},
{
"epoch": 0.5301349325337331,
"grad_norm": 0.27395889163017273,
"learning_rate": 5.524063114757139e-05,
"loss": 9.6238,
"step": 442
},
{
"epoch": 0.5313343328335832,
"grad_norm": 0.2798815965652466,
"learning_rate": 5.5027467109991705e-05,
"loss": 9.6211,
"step": 443
},
{
"epoch": 0.5325337331334333,
"grad_norm": 0.2802503705024719,
"learning_rate": 5.481421072167423e-05,
"loss": 9.6214,
"step": 444
},
{
"epoch": 0.5337331334332833,
"grad_norm": 0.2843863368034363,
"learning_rate": 5.4600865899976225e-05,
"loss": 9.6235,
"step": 445
},
{
"epoch": 0.5349325337331334,
"grad_norm": 0.28727102279663086,
"learning_rate": 5.43874365638794e-05,
"loss": 9.6231,
"step": 446
},
{
"epoch": 0.5361319340329835,
"grad_norm": 0.2915343940258026,
"learning_rate": 5.417392663391796e-05,
"loss": 9.6246,
"step": 447
},
{
"epoch": 0.5373313343328335,
"grad_norm": 0.295317679643631,
"learning_rate": 5.3960340032106515e-05,
"loss": 9.6214,
"step": 448
},
{
"epoch": 0.5385307346326836,
"grad_norm": 0.29819992184638977,
"learning_rate": 5.374668068186809e-05,
"loss": 9.6253,
"step": 449
},
{
"epoch": 0.5397301349325337,
"grad_norm": 0.31375864148139954,
"learning_rate": 5.3532952507962066e-05,
"loss": 9.6318,
"step": 450
},
{
"epoch": 0.5409295352323839,
"grad_norm": 0.27647456526756287,
"learning_rate": 5.3319159436412046e-05,
"loss": 9.616,
"step": 451
},
{
"epoch": 0.5421289355322338,
"grad_norm": 0.27085641026496887,
"learning_rate": 5.310530539443375e-05,
"loss": 9.6163,
"step": 452
},
{
"epoch": 0.543328335832084,
"grad_norm": 0.2696115970611572,
"learning_rate": 5.28913943103629e-05,
"loss": 9.6192,
"step": 453
},
{
"epoch": 0.5445277361319341,
"grad_norm": 0.281088650226593,
"learning_rate": 5.2677430113583005e-05,
"loss": 9.6158,
"step": 454
},
{
"epoch": 0.545727136431784,
"grad_norm": 0.27259501814842224,
"learning_rate": 5.246341673445323e-05,
"loss": 9.6236,
"step": 455
},
{
"epoch": 0.5469265367316342,
"grad_norm": 0.2712749242782593,
"learning_rate": 5.22493581042362e-05,
"loss": 9.6263,
"step": 456
},
{
"epoch": 0.5481259370314843,
"grad_norm": 0.2738138437271118,
"learning_rate": 5.203525815502574e-05,
"loss": 9.6257,
"step": 457
},
{
"epoch": 0.5493253373313344,
"grad_norm": 0.2845352292060852,
"learning_rate": 5.182112081967466e-05,
"loss": 9.6207,
"step": 458
},
{
"epoch": 0.5505247376311844,
"grad_norm": 0.2933952510356903,
"learning_rate": 5.160695003172259e-05,
"loss": 9.6218,
"step": 459
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.32854798436164856,
"learning_rate": 5.13927497253236e-05,
"loss": 9.6258,
"step": 460
},
{
"epoch": 0.5529235382308846,
"grad_norm": 0.32506442070007324,
"learning_rate": 5.1178523835174e-05,
"loss": 9.6305,
"step": 461
},
{
"epoch": 0.5541229385307347,
"grad_norm": 0.31355130672454834,
"learning_rate": 5.0964276296440075e-05,
"loss": 9.6294,
"step": 462
},
{
"epoch": 0.5553223388305847,
"grad_norm": 0.27378547191619873,
"learning_rate": 5.075001104468576e-05,
"loss": 9.6173,
"step": 463
},
{
"epoch": 0.5565217391304348,
"grad_norm": 0.2689533829689026,
"learning_rate": 5.053573201580039e-05,
"loss": 9.6162,
"step": 464
},
{
"epoch": 0.5577211394302849,
"grad_norm": 0.27102944254875183,
"learning_rate": 5.032144314592633e-05,
"loss": 9.6134,
"step": 465
},
{
"epoch": 0.5589205397301349,
"grad_norm": 0.2733670473098755,
"learning_rate": 5.010714837138675e-05,
"loss": 9.6183,
"step": 466
},
{
"epoch": 0.560119940029985,
"grad_norm": 0.2813307046890259,
"learning_rate": 4.989285162861326e-05,
"loss": 9.6184,
"step": 467
},
{
"epoch": 0.5613193403298351,
"grad_norm": 0.2758241593837738,
"learning_rate": 4.967855685407368e-05,
"loss": 9.6172,
"step": 468
},
{
"epoch": 0.5625187406296852,
"grad_norm": 0.28401094675064087,
"learning_rate": 4.946426798419962e-05,
"loss": 9.616,
"step": 469
},
{
"epoch": 0.5637181409295352,
"grad_norm": 0.28821876645088196,
"learning_rate": 4.924998895531425e-05,
"loss": 9.6195,
"step": 470
},
{
"epoch": 0.5649175412293853,
"grad_norm": 0.2845361828804016,
"learning_rate": 4.903572370355993e-05,
"loss": 9.6186,
"step": 471
},
{
"epoch": 0.5661169415292354,
"grad_norm": 0.29093310236930847,
"learning_rate": 4.882147616482602e-05,
"loss": 9.619,
"step": 472
},
{
"epoch": 0.5673163418290854,
"grad_norm": 0.28855013847351074,
"learning_rate": 4.8607250274676415e-05,
"loss": 9.6224,
"step": 473
},
{
"epoch": 0.5685157421289355,
"grad_norm": 0.3042353689670563,
"learning_rate": 4.839304996827741e-05,
"loss": 9.6186,
"step": 474
},
{
"epoch": 0.5697151424287856,
"grad_norm": 0.32164472341537476,
"learning_rate": 4.817887918032535e-05,
"loss": 9.6202,
"step": 475
},
{
"epoch": 0.5709145427286357,
"grad_norm": 0.26481005549430847,
"learning_rate": 4.7964741844974275e-05,
"loss": 9.6097,
"step": 476
},
{
"epoch": 0.5721139430284857,
"grad_norm": 0.2751154601573944,
"learning_rate": 4.775064189576381e-05,
"loss": 9.6077,
"step": 477
},
{
"epoch": 0.5733133433283358,
"grad_norm": 0.26990050077438354,
"learning_rate": 4.7536583265546775e-05,
"loss": 9.609,
"step": 478
},
{
"epoch": 0.5745127436281859,
"grad_norm": 0.2816186845302582,
"learning_rate": 4.7322569886417006e-05,
"loss": 9.6101,
"step": 479
},
{
"epoch": 0.5757121439280359,
"grad_norm": 0.2793320417404175,
"learning_rate": 4.71086056896371e-05,
"loss": 9.6206,
"step": 480
},
{
"epoch": 0.576911544227886,
"grad_norm": 0.2865123748779297,
"learning_rate": 4.689469460556626e-05,
"loss": 9.6109,
"step": 481
},
{
"epoch": 0.5781109445277361,
"grad_norm": 0.2744526267051697,
"learning_rate": 4.6680840563587966e-05,
"loss": 9.6222,
"step": 482
},
{
"epoch": 0.5793103448275863,
"grad_norm": 0.30048128962516785,
"learning_rate": 4.646704749203793e-05,
"loss": 9.6182,
"step": 483
},
{
"epoch": 0.5805097451274362,
"grad_norm": 0.29160621762275696,
"learning_rate": 4.6253319318131926e-05,
"loss": 9.618,
"step": 484
},
{
"epoch": 0.5817091454272864,
"grad_norm": 0.31267857551574707,
"learning_rate": 4.60396599678935e-05,
"loss": 9.622,
"step": 485
},
{
"epoch": 0.5829085457271365,
"grad_norm": 0.3598839044570923,
"learning_rate": 4.582607336608205e-05,
"loss": 9.6176,
"step": 486
},
{
"epoch": 0.5841079460269865,
"grad_norm": 0.33458805084228516,
"learning_rate": 4.561256343612061e-05,
"loss": 9.6256,
"step": 487
},
{
"epoch": 0.5853073463268366,
"grad_norm": 0.27461162209510803,
"learning_rate": 4.539913410002378e-05,
"loss": 9.6119,
"step": 488
},
{
"epoch": 0.5865067466266867,
"grad_norm": 0.2723887264728546,
"learning_rate": 4.518578927832577e-05,
"loss": 9.6056,
"step": 489
},
{
"epoch": 0.5877061469265368,
"grad_norm": 0.2768537998199463,
"learning_rate": 4.4972532890008313e-05,
"loss": 9.6079,
"step": 490
},
{
"epoch": 0.5889055472263868,
"grad_norm": 0.2774599492549896,
"learning_rate": 4.4759368852428625e-05,
"loss": 9.6092,
"step": 491
},
{
"epoch": 0.5901049475262369,
"grad_norm": 0.27346640825271606,
"learning_rate": 4.45463010812476e-05,
"loss": 9.6143,
"step": 492
},
{
"epoch": 0.591304347826087,
"grad_norm": 0.2797171175479889,
"learning_rate": 4.433333349035773e-05,
"loss": 9.6168,
"step": 493
},
{
"epoch": 0.592503748125937,
"grad_norm": 0.2800818085670471,
"learning_rate": 4.4120469991811296e-05,
"loss": 9.6165,
"step": 494
},
{
"epoch": 0.5937031484257871,
"grad_norm": 0.280519038438797,
"learning_rate": 4.390771449574846e-05,
"loss": 9.6195,
"step": 495
},
{
"epoch": 0.5949025487256372,
"grad_norm": 0.2884778678417206,
"learning_rate": 4.369507091032551e-05,
"loss": 9.6132,
"step": 496
},
{
"epoch": 0.5961019490254873,
"grad_norm": 0.2894138693809509,
"learning_rate": 4.3482543141642943e-05,
"loss": 9.6147,
"step": 497
},
{
"epoch": 0.5973013493253373,
"grad_norm": 0.2868705093860626,
"learning_rate": 4.327013509367386e-05,
"loss": 9.6242,
"step": 498
},
{
"epoch": 0.5985007496251874,
"grad_norm": 0.2994021773338318,
"learning_rate": 4.305785066819218e-05,
"loss": 9.6189,
"step": 499
},
{
"epoch": 0.5997001499250375,
"grad_norm": 0.3168644607067108,
"learning_rate": 4.2845693764700914e-05,
"loss": 9.6247,
"step": 500
},
{
"epoch": 0.6008995502248875,
"grad_norm": 0.26666632294654846,
"learning_rate": 4.263366828036065e-05,
"loss": 9.6057,
"step": 501
},
{
"epoch": 0.6020989505247376,
"grad_norm": 0.26327091455459595,
"learning_rate": 4.242177810991789e-05,
"loss": 9.6115,
"step": 502
},
{
"epoch": 0.6032983508245877,
"grad_norm": 0.27538183331489563,
"learning_rate": 4.221002714563347e-05,
"loss": 9.6082,
"step": 503
},
{
"epoch": 0.6044977511244378,
"grad_norm": 0.27597832679748535,
"learning_rate": 4.19984192772112e-05,
"loss": 9.6075,
"step": 504
},
{
"epoch": 0.6056971514242878,
"grad_norm": 0.28365880250930786,
"learning_rate": 4.1786958391726314e-05,
"loss": 9.6136,
"step": 505
},
{
"epoch": 0.6068965517241379,
"grad_norm": 0.2802659273147583,
"learning_rate": 4.1575648373554e-05,
"loss": 9.6158,
"step": 506
},
{
"epoch": 0.608095952023988,
"grad_norm": 0.2841864228248596,
"learning_rate": 4.136449310429822e-05,
"loss": 9.6115,
"step": 507
},
{
"epoch": 0.6092953523238381,
"grad_norm": 0.2928536832332611,
"learning_rate": 4.115349646272029e-05,
"loss": 9.6156,
"step": 508
},
{
"epoch": 0.6104947526236881,
"grad_norm": 0.2854699492454529,
"learning_rate": 4.0942662324667627e-05,
"loss": 9.6137,
"step": 509
},
{
"epoch": 0.6116941529235382,
"grad_norm": 0.29192522168159485,
"learning_rate": 4.0731994563002606e-05,
"loss": 9.6136,
"step": 510
},
{
"epoch": 0.6128935532233883,
"grad_norm": 0.3441016674041748,
"learning_rate": 4.052149704753142e-05,
"loss": 9.6224,
"step": 511
},
{
"epoch": 0.6140929535232383,
"grad_norm": 0.3597991466522217,
"learning_rate": 4.03111736449329e-05,
"loss": 9.6219,
"step": 512
},
{
"epoch": 0.6152923538230884,
"grad_norm": 0.2781412899494171,
"learning_rate": 4.010102821868762e-05,
"loss": 9.6056,
"step": 513
},
{
"epoch": 0.6164917541229386,
"grad_norm": 0.27280357480049133,
"learning_rate": 3.989106462900686e-05,
"loss": 9.6063,
"step": 514
},
{
"epoch": 0.6176911544227887,
"grad_norm": 0.27366748452186584,
"learning_rate": 3.968128673276165e-05,
"loss": 9.6104,
"step": 515
},
{
"epoch": 0.6188905547226387,
"grad_norm": 0.27588704228401184,
"learning_rate": 3.947169838341202e-05,
"loss": 9.605,
"step": 516
},
{
"epoch": 0.6200899550224888,
"grad_norm": 0.27753859758377075,
"learning_rate": 3.9262303430936164e-05,
"loss": 9.6033,
"step": 517
},
{
"epoch": 0.6212893553223389,
"grad_norm": 0.27255064249038696,
"learning_rate": 3.9053105721759696e-05,
"loss": 9.6098,
"step": 518
},
{
"epoch": 0.6224887556221889,
"grad_norm": 0.2782951295375824,
"learning_rate": 3.8844109098685045e-05,
"loss": 9.6184,
"step": 519
},
{
"epoch": 0.623688155922039,
"grad_norm": 0.28660768270492554,
"learning_rate": 3.8635317400820855e-05,
"loss": 9.6113,
"step": 520
},
{
"epoch": 0.6248875562218891,
"grad_norm": 0.28494128584861755,
"learning_rate": 3.842673446351138e-05,
"loss": 9.6105,
"step": 521
},
{
"epoch": 0.6260869565217392,
"grad_norm": 0.28198301792144775,
"learning_rate": 3.82183641182662e-05,
"loss": 9.626,
"step": 522
},
{
"epoch": 0.6272863568215892,
"grad_norm": 0.2875995337963104,
"learning_rate": 3.801021019268969e-05,
"loss": 9.6176,
"step": 523
},
{
"epoch": 0.6284857571214393,
"grad_norm": 0.2956449091434479,
"learning_rate": 3.780227651041073e-05,
"loss": 9.6229,
"step": 524
},
{
"epoch": 0.6296851574212894,
"grad_norm": 0.37847524881362915,
"learning_rate": 3.7594566891012546e-05,
"loss": 9.6214,
"step": 525
},
{
"epoch": 0.6308845577211394,
"grad_norm": 0.27030348777770996,
"learning_rate": 3.7387085149962507e-05,
"loss": 9.6011,
"step": 526
},
{
"epoch": 0.6320839580209895,
"grad_norm": 0.274962455034256,
"learning_rate": 3.717983509854198e-05,
"loss": 9.6023,
"step": 527
},
{
"epoch": 0.6332833583208396,
"grad_norm": 0.27726373076438904,
"learning_rate": 3.69728205437764e-05,
"loss": 9.6102,
"step": 528
},
{
"epoch": 0.6344827586206897,
"grad_norm": 0.27569401264190674,
"learning_rate": 3.676604528836535e-05,
"loss": 9.6077,
"step": 529
},
{
"epoch": 0.6356821589205397,
"grad_norm": 0.2719118893146515,
"learning_rate": 3.6559513130612565e-05,
"loss": 9.6078,
"step": 530
},
{
"epoch": 0.6368815592203898,
"grad_norm": 0.27930060029029846,
"learning_rate": 3.635322786435635e-05,
"loss": 9.6099,
"step": 531
},
{
"epoch": 0.6380809595202399,
"grad_norm": 0.2761722505092621,
"learning_rate": 3.614719327889978e-05,
"loss": 9.6161,
"step": 532
},
{
"epoch": 0.6392803598200899,
"grad_norm": 0.2825543284416199,
"learning_rate": 3.594141315894108e-05,
"loss": 9.616,
"step": 533
},
{
"epoch": 0.64047976011994,
"grad_norm": 0.28519946336746216,
"learning_rate": 3.573589128450418e-05,
"loss": 9.6134,
"step": 534
},
{
"epoch": 0.6416791604197901,
"grad_norm": 0.2859567105770111,
"learning_rate": 3.5530631430869234e-05,
"loss": 9.6181,
"step": 535
},
{
"epoch": 0.6428785607196402,
"grad_norm": 0.293560653924942,
"learning_rate": 3.532563736850322e-05,
"loss": 9.6141,
"step": 536
},
{
"epoch": 0.6440779610194902,
"grad_norm": 0.31543228030204773,
"learning_rate": 3.512091286299081e-05,
"loss": 9.6132,
"step": 537
},
{
"epoch": 0.6452773613193403,
"grad_norm": 0.28361520171165466,
"learning_rate": 3.491646167496507e-05,
"loss": 9.5993,
"step": 538
},
{
"epoch": 0.6464767616191904,
"grad_norm": 0.2670563757419586,
"learning_rate": 3.4712287560038446e-05,
"loss": 9.6042,
"step": 539
},
{
"epoch": 0.6476761619190404,
"grad_norm": 0.2657446265220642,
"learning_rate": 3.450839426873378e-05,
"loss": 9.6106,
"step": 540
},
{
"epoch": 0.6488755622188905,
"grad_norm": 0.271816611289978,
"learning_rate": 3.4304785546415374e-05,
"loss": 9.608,
"step": 541
},
{
"epoch": 0.6500749625187406,
"grad_norm": 0.27191296219825745,
"learning_rate": 3.41014651332202e-05,
"loss": 9.6103,
"step": 542
},
{
"epoch": 0.6512743628185907,
"grad_norm": 0.27644070982933044,
"learning_rate": 3.3898436763989247e-05,
"loss": 9.6039,
"step": 543
},
{
"epoch": 0.6524737631184407,
"grad_norm": 0.27742430567741394,
"learning_rate": 3.369570416819889e-05,
"loss": 9.6053,
"step": 544
},
{
"epoch": 0.6536731634182908,
"grad_norm": 0.2793113589286804,
"learning_rate": 3.349327106989232e-05,
"loss": 9.615,
"step": 545
},
{
"epoch": 0.654872563718141,
"grad_norm": 0.28077057003974915,
"learning_rate": 3.329114118761123e-05,
"loss": 9.6101,
"step": 546
},
{
"epoch": 0.656071964017991,
"grad_norm": 0.2894865870475769,
"learning_rate": 3.308931823432744e-05,
"loss": 9.6093,
"step": 547
},
{
"epoch": 0.6572713643178411,
"grad_norm": 0.2894723415374756,
"learning_rate": 3.288780591737474e-05,
"loss": 9.6141,
"step": 548
},
{
"epoch": 0.6584707646176912,
"grad_norm": 0.3010658323764801,
"learning_rate": 3.268660793838074e-05,
"loss": 9.6249,
"step": 549
},
{
"epoch": 0.6596701649175413,
"grad_norm": 0.3542385399341583,
"learning_rate": 3.2485727993198945e-05,
"loss": 9.6182,
"step": 550
},
{
"epoch": 0.6608695652173913,
"grad_norm": 0.2821604907512665,
"learning_rate": 3.228516977184075e-05,
"loss": 9.6229,
"step": 551
},
{
"epoch": 0.6620689655172414,
"grad_norm": 0.27113085985183716,
"learning_rate": 3.2084936958407805e-05,
"loss": 9.6041,
"step": 552
},
{
"epoch": 0.6632683658170915,
"grad_norm": 0.26982516050338745,
"learning_rate": 3.188503323102425e-05,
"loss": 9.6084,
"step": 553
},
{
"epoch": 0.6644677661169416,
"grad_norm": 0.2756569981575012,
"learning_rate": 3.1685462261769105e-05,
"loss": 9.6126,
"step": 554
},
{
"epoch": 0.6656671664167916,
"grad_norm": 0.27629488706588745,
"learning_rate": 3.1486227716608946e-05,
"loss": 9.6056,
"step": 555
},
{
"epoch": 0.6668665667166417,
"grad_norm": 0.28036460280418396,
"learning_rate": 3.128733325533047e-05,
"loss": 9.6054,
"step": 556
},
{
"epoch": 0.6680659670164918,
"grad_norm": 0.27844056487083435,
"learning_rate": 3.1088782531473266e-05,
"loss": 9.6111,
"step": 557
},
{
"epoch": 0.6692653673163418,
"grad_norm": 0.2862386405467987,
"learning_rate": 3.089057919226277e-05,
"loss": 9.612,
"step": 558
},
{
"epoch": 0.6704647676161919,
"grad_norm": 0.2859496474266052,
"learning_rate": 3.069272687854322e-05,
"loss": 9.6114,
"step": 559
},
{
"epoch": 0.671664167916042,
"grad_norm": 0.28554123640060425,
"learning_rate": 3.049522922471075e-05,
"loss": 9.6105,
"step": 560
},
{
"epoch": 0.6728635682158921,
"grad_norm": 0.30089861154556274,
"learning_rate": 3.02980898586467e-05,
"loss": 9.6205,
"step": 561
},
{
"epoch": 0.6740629685157421,
"grad_norm": 0.30331140756607056,
"learning_rate": 3.0101312401650937e-05,
"loss": 9.6158,
"step": 562
},
{
"epoch": 0.6752623688155922,
"grad_norm": 0.2732248902320862,
"learning_rate": 2.9904900468375297e-05,
"loss": 9.6064,
"step": 563
},
{
"epoch": 0.6764617691154423,
"grad_norm": 0.27510005235671997,
"learning_rate": 2.9708857666757246e-05,
"loss": 9.6019,
"step": 564
},
{
"epoch": 0.6776611694152923,
"grad_norm": 0.27365824580192566,
"learning_rate": 2.9513187597953607e-05,
"loss": 9.5995,
"step": 565
},
{
"epoch": 0.6788605697151424,
"grad_norm": 0.2792357802391052,
"learning_rate": 2.931789385627433e-05,
"loss": 9.606,
"step": 566
},
{
"epoch": 0.6800599700149925,
"grad_norm": 0.2759556770324707,
"learning_rate": 2.9122980029116586e-05,
"loss": 9.6039,
"step": 567
},
{
"epoch": 0.6812593703148426,
"grad_norm": 0.2814030647277832,
"learning_rate": 2.8928449696898763e-05,
"loss": 9.602,
"step": 568
},
{
"epoch": 0.6824587706146926,
"grad_norm": 0.2769099771976471,
"learning_rate": 2.8734306432994735e-05,
"loss": 9.6079,
"step": 569
},
{
"epoch": 0.6836581709145427,
"grad_norm": 0.2809275686740875,
"learning_rate": 2.8540553803668252e-05,
"loss": 9.613,
"step": 570
},
{
"epoch": 0.6848575712143928,
"grad_norm": 0.275016725063324,
"learning_rate": 2.8347195368007418e-05,
"loss": 9.6097,
"step": 571
},
{
"epoch": 0.6860569715142428,
"grad_norm": 0.2964610755443573,
"learning_rate": 2.815423467785925e-05,
"loss": 9.6111,
"step": 572
},
{
"epoch": 0.6872563718140929,
"grad_norm": 0.2884480059146881,
"learning_rate": 2.7961675277764498e-05,
"loss": 9.6089,
"step": 573
},
{
"epoch": 0.688455772113943,
"grad_norm": 0.30310893058776855,
"learning_rate": 2.7769520704892566e-05,
"loss": 9.6102,
"step": 574
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.4733683466911316,
"learning_rate": 2.757777448897646e-05,
"loss": 9.6083,
"step": 575
},
{
"epoch": 0.6908545727136431,
"grad_norm": 0.272512823343277,
"learning_rate": 2.7386440152247933e-05,
"loss": 9.5963,
"step": 576
},
{
"epoch": 0.6920539730134933,
"grad_norm": 0.2810138165950775,
"learning_rate": 2.71955212093729e-05,
"loss": 9.6012,
"step": 577
},
{
"epoch": 0.6932533733133434,
"grad_norm": 0.2755623161792755,
"learning_rate": 2.7005021167386803e-05,
"loss": 9.6022,
"step": 578
},
{
"epoch": 0.6944527736131934,
"grad_norm": 0.2718299329280853,
"learning_rate": 2.681494352563013e-05,
"loss": 9.6096,
"step": 579
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.2746315896511078,
"learning_rate": 2.6625291775684292e-05,
"loss": 9.6124,
"step": 580
},
{
"epoch": 0.6968515742128936,
"grad_norm": 0.2844776511192322,
"learning_rate": 2.6436069401307284e-05,
"loss": 9.6054,
"step": 581
},
{
"epoch": 0.6980509745127437,
"grad_norm": 0.2785060703754425,
"learning_rate": 2.624727987836991e-05,
"loss": 9.6112,
"step": 582
},
{
"epoch": 0.6992503748125937,
"grad_norm": 0.2840147316455841,
"learning_rate": 2.6058926674791728e-05,
"loss": 9.6061,
"step": 583
},
{
"epoch": 0.7004497751124438,
"grad_norm": 0.28523436188697815,
"learning_rate": 2.5871013250477528e-05,
"loss": 9.6057,
"step": 584
},
{
"epoch": 0.7016491754122939,
"grad_norm": 0.29284006357192993,
"learning_rate": 2.56835430572536e-05,
"loss": 9.6091,
"step": 585
},
{
"epoch": 0.7028485757121439,
"grad_norm": 0.29574641585350037,
"learning_rate": 2.5496519538804486e-05,
"loss": 9.6155,
"step": 586
},
{
"epoch": 0.704047976011994,
"grad_norm": 0.3032572269439697,
"learning_rate": 2.530994613060965e-05,
"loss": 9.6162,
"step": 587
},
{
"epoch": 0.7052473763118441,
"grad_norm": 0.2718828320503235,
"learning_rate": 2.5123826259880323e-05,
"loss": 9.6001,
"step": 588
},
{
"epoch": 0.7064467766116942,
"grad_norm": 0.27074381709098816,
"learning_rate": 2.493816334549664e-05,
"loss": 9.6014,
"step": 589
},
{
"epoch": 0.7076461769115442,
"grad_norm": 0.2791549265384674,
"learning_rate": 2.4752960797944802e-05,
"loss": 9.5998,
"step": 590
},
{
"epoch": 0.7088455772113943,
"grad_norm": 0.28340011835098267,
"learning_rate": 2.4568222019254377e-05,
"loss": 9.5979,
"step": 591
},
{
"epoch": 0.7100449775112444,
"grad_norm": 0.2762751579284668,
"learning_rate": 2.43839504029359e-05,
"loss": 9.6032,
"step": 592
},
{
"epoch": 0.7112443778110945,
"grad_norm": 0.2753763198852539,
"learning_rate": 2.4200149333918487e-05,
"loss": 9.6089,
"step": 593
},
{
"epoch": 0.7124437781109445,
"grad_norm": 0.27482444047927856,
"learning_rate": 2.4016822188487603e-05,
"loss": 9.6081,
"step": 594
},
{
"epoch": 0.7136431784107946,
"grad_norm": 0.28210797905921936,
"learning_rate": 2.383397233422318e-05,
"loss": 9.6041,
"step": 595
},
{
"epoch": 0.7148425787106447,
"grad_norm": 0.2853706479072571,
"learning_rate": 2.3651603129937592e-05,
"loss": 9.6042,
"step": 596
},
{
"epoch": 0.7160419790104947,
"grad_norm": 0.3066234886646271,
"learning_rate": 2.346971792561413e-05,
"loss": 9.6053,
"step": 597
},
{
"epoch": 0.7172413793103448,
"grad_norm": 0.2879929542541504,
"learning_rate": 2.3288320062345277e-05,
"loss": 9.6069,
"step": 598
},
{
"epoch": 0.7184407796101949,
"grad_norm": 0.35332369804382324,
"learning_rate": 2.3107412872271518e-05,
"loss": 9.6162,
"step": 599
},
{
"epoch": 0.719640179910045,
"grad_norm": 0.5152252316474915,
"learning_rate": 2.2926999678519974e-05,
"loss": 9.6182,
"step": 600
},
{
"epoch": 0.720839580209895,
"grad_norm": 0.2663346230983734,
"learning_rate": 2.274708379514348e-05,
"loss": 9.5986,
"step": 601
},
{
"epoch": 0.7220389805097451,
"grad_norm": 0.27524423599243164,
"learning_rate": 2.256766852705967e-05,
"loss": 9.5986,
"step": 602
},
{
"epoch": 0.7232383808095952,
"grad_norm": 0.2814219295978546,
"learning_rate": 2.238875716999019e-05,
"loss": 9.6037,
"step": 603
},
{
"epoch": 0.7244377811094452,
"grad_norm": 0.2859136760234833,
"learning_rate": 2.221035301040027e-05,
"loss": 9.6002,
"step": 604
},
{
"epoch": 0.7256371814092953,
"grad_norm": 0.27460747957229614,
"learning_rate": 2.2032459325438336e-05,
"loss": 9.6031,
"step": 605
},
{
"epoch": 0.7268365817091454,
"grad_norm": 0.2745445966720581,
"learning_rate": 2.185507938287572e-05,
"loss": 9.6072,
"step": 606
},
{
"epoch": 0.7280359820089956,
"grad_norm": 0.2816024124622345,
"learning_rate": 2.1678216441046734e-05,
"loss": 9.6128,
"step": 607
},
{
"epoch": 0.7292353823088455,
"grad_norm": 0.28734058141708374,
"learning_rate": 2.1501873748788802e-05,
"loss": 9.6127,
"step": 608
},
{
"epoch": 0.7304347826086957,
"grad_norm": 0.28445249795913696,
"learning_rate": 2.1326054545382695e-05,
"loss": 9.6118,
"step": 609
},
{
"epoch": 0.7316341829085458,
"grad_norm": 0.2825443148612976,
"learning_rate": 2.1150762060493155e-05,
"loss": 9.6182,
"step": 610
},
{
"epoch": 0.7328335832083958,
"grad_norm": 0.29409319162368774,
"learning_rate": 2.09759995141095e-05,
"loss": 9.611,
"step": 611
},
{
"epoch": 0.7340329835082459,
"grad_norm": 0.30348506569862366,
"learning_rate": 2.0801770116486447e-05,
"loss": 9.6193,
"step": 612
},
{
"epoch": 0.735232383808096,
"grad_norm": 0.2586905360221863,
"learning_rate": 2.0628077068085173e-05,
"loss": 9.6146,
"step": 613
},
{
"epoch": 0.7364317841079461,
"grad_norm": 0.27243587374687195,
"learning_rate": 2.0454923559514595e-05,
"loss": 9.6025,
"step": 614
},
{
"epoch": 0.7376311844077961,
"grad_norm": 0.27491042017936707,
"learning_rate": 2.028231277147261e-05,
"loss": 9.6013,
"step": 615
},
{
"epoch": 0.7388305847076462,
"grad_norm": 0.279153048992157,
"learning_rate": 2.0110247874687815e-05,
"loss": 9.5937,
"step": 616
},
{
"epoch": 0.7400299850074963,
"grad_norm": 0.27780649065971375,
"learning_rate": 1.993873202986119e-05,
"loss": 9.6022,
"step": 617
},
{
"epoch": 0.7412293853073463,
"grad_norm": 0.2798539698123932,
"learning_rate": 1.976776838760801e-05,
"loss": 9.6022,
"step": 618
},
{
"epoch": 0.7424287856071964,
"grad_norm": 0.27843162417411804,
"learning_rate": 1.9597360088400052e-05,
"loss": 9.6062,
"step": 619
},
{
"epoch": 0.7436281859070465,
"grad_norm": 0.27371302247047424,
"learning_rate": 1.9427510262507864e-05,
"loss": 9.6119,
"step": 620
},
{
"epoch": 0.7448275862068966,
"grad_norm": 0.2873663604259491,
"learning_rate": 1.925822202994323e-05,
"loss": 9.6004,
"step": 621
},
{
"epoch": 0.7460269865067466,
"grad_norm": 0.2875591218471527,
"learning_rate": 1.9089498500401914e-05,
"loss": 9.6119,
"step": 622
},
{
"epoch": 0.7472263868065967,
"grad_norm": 0.2853778004646301,
"learning_rate": 1.892134277320655e-05,
"loss": 9.6091,
"step": 623
},
{
"epoch": 0.7484257871064468,
"grad_norm": 0.2952004075050354,
"learning_rate": 1.87537579372496e-05,
"loss": 9.6182,
"step": 624
},
{
"epoch": 0.7496251874062968,
"grad_norm": 0.3686712980270386,
"learning_rate": 1.858674707093675e-05,
"loss": 9.614,
"step": 625
},
{
"epoch": 0.7508245877061469,
"grad_norm": 0.2664184868335724,
"learning_rate": 1.8420313242130293e-05,
"loss": 9.6005,
"step": 626
},
{
"epoch": 0.752023988005997,
"grad_norm": 0.2688407301902771,
"learning_rate": 1.8254459508092768e-05,
"loss": 9.5988,
"step": 627
},
{
"epoch": 0.7532233883058471,
"grad_norm": 0.2794104516506195,
"learning_rate": 1.8089188915430793e-05,
"loss": 9.5987,
"step": 628
},
{
"epoch": 0.7544227886056971,
"grad_norm": 0.26486334204673767,
"learning_rate": 1.792450450003919e-05,
"loss": 9.6129,
"step": 629
},
{
"epoch": 0.7556221889055472,
"grad_norm": 0.2762359082698822,
"learning_rate": 1.7760409287045078e-05,
"loss": 9.6052,
"step": 630
},
{
"epoch": 0.7568215892053973,
"grad_norm": 0.27764591574668884,
"learning_rate": 1.7596906290752425e-05,
"loss": 9.6056,
"step": 631
},
{
"epoch": 0.7580209895052473,
"grad_norm": 0.276153028011322,
"learning_rate": 1.743399851458663e-05,
"loss": 9.609,
"step": 632
},
{
"epoch": 0.7592203898050974,
"grad_norm": 0.2780199646949768,
"learning_rate": 1.727168895103931e-05,
"loss": 9.6081,
"step": 633
},
{
"epoch": 0.7604197901049475,
"grad_norm": 0.276457816362381,
"learning_rate": 1.7109980581613417e-05,
"loss": 9.6062,
"step": 634
},
{
"epoch": 0.7616191904047976,
"grad_norm": 0.2808220088481903,
"learning_rate": 1.6948876376768418e-05,
"loss": 9.6123,
"step": 635
},
{
"epoch": 0.7628185907046476,
"grad_norm": 0.29566583037376404,
"learning_rate": 1.6788379295865704e-05,
"loss": 9.6094,
"step": 636
},
{
"epoch": 0.7640179910044977,
"grad_norm": 0.33136534690856934,
"learning_rate": 1.6628492287114296e-05,
"loss": 9.614,
"step": 637
},
{
"epoch": 0.7652173913043478,
"grad_norm": 0.27251994609832764,
"learning_rate": 1.6469218287516664e-05,
"loss": 9.6011,
"step": 638
},
{
"epoch": 0.766416791604198,
"grad_norm": 0.2670121490955353,
"learning_rate": 1.6310560222814714e-05,
"loss": 9.6037,
"step": 639
},
{
"epoch": 0.767616191904048,
"grad_norm": 0.2792399227619171,
"learning_rate": 1.6152521007436145e-05,
"loss": 9.6036,
"step": 640
},
{
"epoch": 0.7688155922038981,
"grad_norm": 0.275511234998703,
"learning_rate": 1.599510354444087e-05,
"loss": 9.5973,
"step": 641
},
{
"epoch": 0.7700149925037482,
"grad_norm": 0.2751782536506653,
"learning_rate": 1.5838310725467644e-05,
"loss": 9.6005,
"step": 642
},
{
"epoch": 0.7712143928035982,
"grad_norm": 0.28111734986305237,
"learning_rate": 1.5682145430681027e-05,
"loss": 9.6015,
"step": 643
},
{
"epoch": 0.7724137931034483,
"grad_norm": 0.2826797068119049,
"learning_rate": 1.5526610528718415e-05,
"loss": 9.6054,
"step": 644
},
{
"epoch": 0.7736131934032984,
"grad_norm": 0.28505128622055054,
"learning_rate": 1.5371708876637354e-05,
"loss": 9.6042,
"step": 645
},
{
"epoch": 0.7748125937031485,
"grad_norm": 0.28200674057006836,
"learning_rate": 1.5217443319863112e-05,
"loss": 9.6051,
"step": 646
},
{
"epoch": 0.7760119940029985,
"grad_norm": 0.2859637439250946,
"learning_rate": 1.5063816692136373e-05,
"loss": 9.6004,
"step": 647
},
{
"epoch": 0.7772113943028486,
"grad_norm": 0.28504401445388794,
"learning_rate": 1.4910831815461123e-05,
"loss": 9.6177,
"step": 648
},
{
"epoch": 0.7784107946026987,
"grad_norm": 0.2949487268924713,
"learning_rate": 1.4758491500052924e-05,
"loss": 9.6204,
"step": 649
},
{
"epoch": 0.7796101949025487,
"grad_norm": 0.3952041268348694,
"learning_rate": 1.4606798544287243e-05,
"loss": 9.62,
"step": 650
},
{
"epoch": 0.7808095952023988,
"grad_norm": 0.2684868574142456,
"learning_rate": 1.445575573464799e-05,
"loss": 9.5986,
"step": 651
},
{
"epoch": 0.7820089955022489,
"grad_norm": 0.2751760184764862,
"learning_rate": 1.4305365845676439e-05,
"loss": 9.5993,
"step": 652
},
{
"epoch": 0.783208395802099,
"grad_norm": 0.27565452456474304,
"learning_rate": 1.4155631639920209e-05,
"loss": 9.5939,
"step": 653
},
{
"epoch": 0.784407796101949,
"grad_norm": 0.27967387437820435,
"learning_rate": 1.4006555867882464e-05,
"loss": 9.6024,
"step": 654
},
{
"epoch": 0.7856071964017991,
"grad_norm": 0.28178393840789795,
"learning_rate": 1.3858141267971491e-05,
"loss": 9.6057,
"step": 655
},
{
"epoch": 0.7868065967016492,
"grad_norm": 0.27983683347702026,
"learning_rate": 1.3710390566450366e-05,
"loss": 9.6059,
"step": 656
},
{
"epoch": 0.7880059970014992,
"grad_norm": 0.286726713180542,
"learning_rate": 1.3563306477386784e-05,
"loss": 9.6032,
"step": 657
},
{
"epoch": 0.7892053973013493,
"grad_norm": 0.2814926505088806,
"learning_rate": 1.3416891702603358e-05,
"loss": 9.6077,
"step": 658
},
{
"epoch": 0.7904047976011994,
"grad_norm": 0.291660875082016,
"learning_rate": 1.3271148931627858e-05,
"loss": 9.6055,
"step": 659
},
{
"epoch": 0.7916041979010495,
"grad_norm": 0.2863795757293701,
"learning_rate": 1.3126080841643856e-05,
"loss": 9.6111,
"step": 660
},
{
"epoch": 0.7928035982008995,
"grad_norm": 0.2854698896408081,
"learning_rate": 1.2981690097441573e-05,
"loss": 9.6172,
"step": 661
},
{
"epoch": 0.7940029985007496,
"grad_norm": 0.3119170367717743,
"learning_rate": 1.2837979351368912e-05,
"loss": 9.6102,
"step": 662
},
{
"epoch": 0.7952023988005997,
"grad_norm": 0.27526015043258667,
"learning_rate": 1.2694951243282683e-05,
"loss": 9.6006,
"step": 663
},
{
"epoch": 0.7964017991004497,
"grad_norm": 0.27086350321769714,
"learning_rate": 1.2552608400500199e-05,
"loss": 9.6,
"step": 664
},
{
"epoch": 0.7976011994002998,
"grad_norm": 0.2674426734447479,
"learning_rate": 1.2410953437750966e-05,
"loss": 9.599,
"step": 665
},
{
"epoch": 0.7988005997001499,
"grad_norm": 0.26960834860801697,
"learning_rate": 1.2269988957128636e-05,
"loss": 9.6059,
"step": 666
},
{
"epoch": 0.8,
"grad_norm": 0.27745890617370605,
"learning_rate": 1.212971754804324e-05,
"loss": 9.6046,
"step": 667
},
{
"epoch": 0.80119940029985,
"grad_norm": 0.2803892493247986,
"learning_rate": 1.1990141787173648e-05,
"loss": 9.6036,
"step": 668
},
{
"epoch": 0.8023988005997001,
"grad_norm": 0.2826705574989319,
"learning_rate": 1.1851264238420135e-05,
"loss": 9.6031,
"step": 669
},
{
"epoch": 0.8035982008995503,
"grad_norm": 0.28543218970298767,
"learning_rate": 1.1713087452857408e-05,
"loss": 9.6047,
"step": 670
},
{
"epoch": 0.8047976011994002,
"grad_norm": 0.2749161124229431,
"learning_rate": 1.1575613968687682e-05,
"loss": 9.6061,
"step": 671
},
{
"epoch": 0.8059970014992504,
"grad_norm": 0.2880239486694336,
"learning_rate": 1.1438846311194024e-05,
"loss": 9.607,
"step": 672
},
{
"epoch": 0.8071964017991005,
"grad_norm": 0.2794909179210663,
"learning_rate": 1.1302786992694048e-05,
"loss": 9.6098,
"step": 673
},
{
"epoch": 0.8083958020989506,
"grad_norm": 0.3027547299861908,
"learning_rate": 1.1167438512493683e-05,
"loss": 9.6116,
"step": 674
},
{
"epoch": 0.8095952023988006,
"grad_norm": 0.34445720911026,
"learning_rate": 1.1032803356841342e-05,
"loss": 9.6171,
"step": 675
},
{
"epoch": 0.8107946026986507,
"grad_norm": 0.2722347378730774,
"learning_rate": 1.0898883998882158e-05,
"loss": 9.601,
"step": 676
},
{
"epoch": 0.8119940029985008,
"grad_norm": 0.27299922704696655,
"learning_rate": 1.0765682898612656e-05,
"loss": 9.5976,
"step": 677
},
{
"epoch": 0.8131934032983508,
"grad_norm": 0.2737182080745697,
"learning_rate": 1.0633202502835494e-05,
"loss": 9.5965,
"step": 678
},
{
"epoch": 0.8143928035982009,
"grad_norm": 0.2752780020236969,
"learning_rate": 1.0501445245114522e-05,
"loss": 9.6009,
"step": 679
},
{
"epoch": 0.815592203898051,
"grad_norm": 0.2721465826034546,
"learning_rate": 1.0370413545730118e-05,
"loss": 9.6064,
"step": 680
},
{
"epoch": 0.8167916041979011,
"grad_norm": 0.2846396267414093,
"learning_rate": 1.0240109811634712e-05,
"loss": 9.5995,
"step": 681
},
{
"epoch": 0.8179910044977511,
"grad_norm": 0.28411293029785156,
"learning_rate": 1.0110536436408535e-05,
"loss": 9.5975,
"step": 682
},
{
"epoch": 0.8191904047976012,
"grad_norm": 0.2815098762512207,
"learning_rate": 9.9816958002157e-06,
"loss": 9.6078,
"step": 683
},
{
"epoch": 0.8203898050974513,
"grad_norm": 0.278131902217865,
"learning_rate": 9.853590269760493e-06,
"loss": 9.6143,
"step": 684
},
{
"epoch": 0.8215892053973014,
"grad_norm": 0.2930939197540283,
"learning_rate": 9.726222198243806e-06,
"loss": 9.6042,
"step": 685
},
{
"epoch": 0.8227886056971514,
"grad_norm": 0.2876308560371399,
"learning_rate": 9.599593925320016e-06,
"loss": 9.6187,
"step": 686
},
{
"epoch": 0.8239880059970015,
"grad_norm": 0.3398456573486328,
"learning_rate": 9.47370777705397e-06,
"loss": 9.6115,
"step": 687
},
{
"epoch": 0.8251874062968516,
"grad_norm": 0.28324592113494873,
"learning_rate": 9.348566065878217e-06,
"loss": 9.5972,
"step": 688
},
{
"epoch": 0.8263868065967016,
"grad_norm": 0.271178662776947,
"learning_rate": 9.224171090550571e-06,
"loss": 9.6004,
"step": 689
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.26743438839912415,
"learning_rate": 9.100525136111915e-06,
"loss": 9.604,
"step": 690
},
{
"epoch": 0.8287856071964018,
"grad_norm": 0.2741158604621887,
"learning_rate": 8.97763047384414e-06,
"loss": 9.6024,
"step": 691
},
{
"epoch": 0.8299850074962519,
"grad_norm": 0.2776412069797516,
"learning_rate": 8.855489361228496e-06,
"loss": 9.5996,
"step": 692
},
{
"epoch": 0.8311844077961019,
"grad_norm": 0.2762274742126465,
"learning_rate": 8.734104041904129e-06,
"loss": 9.6041,
"step": 693
},
{
"epoch": 0.832383808095952,
"grad_norm": 0.2758176624774933,
"learning_rate": 8.61347674562677e-06,
"loss": 9.6084,
"step": 694
},
{
"epoch": 0.8335832083958021,
"grad_norm": 0.28230342268943787,
"learning_rate": 8.4936096882279e-06,
"loss": 9.6047,
"step": 695
},
{
"epoch": 0.8347826086956521,
"grad_norm": 0.28801631927490234,
"learning_rate": 8.37450507157399e-06,
"loss": 9.6084,
"step": 696
},
{
"epoch": 0.8359820089955022,
"grad_norm": 0.289760559797287,
"learning_rate": 8.256165083526019e-06,
"loss": 9.6033,
"step": 697
},
{
"epoch": 0.8371814092953523,
"grad_norm": 0.29011571407318115,
"learning_rate": 8.138591897899345e-06,
"loss": 9.6161,
"step": 698
},
{
"epoch": 0.8383808095952024,
"grad_norm": 0.3083633780479431,
"learning_rate": 8.021787674423775e-06,
"loss": 9.6152,
"step": 699
},
{
"epoch": 0.8395802098950524,
"grad_norm": 0.36470380425453186,
"learning_rate": 7.905754558703803e-06,
"loss": 9.6132,
"step": 700
},
{
"epoch": 0.8407796101949025,
"grad_norm": 0.26850220561027527,
"learning_rate": 7.790494682179317e-06,
"loss": 9.5949,
"step": 701
},
{
"epoch": 0.8419790104947527,
"grad_norm": 0.2714633643627167,
"learning_rate": 7.676010162086388e-06,
"loss": 9.604,
"step": 702
},
{
"epoch": 0.8431784107946027,
"grad_norm": 0.2753824293613434,
"learning_rate": 7.56230310141835e-06,
"loss": 9.5993,
"step": 703
},
{
"epoch": 0.8443778110944528,
"grad_norm": 0.2757047116756439,
"learning_rate": 7.449375588887203e-06,
"loss": 9.5993,
"step": 704
},
{
"epoch": 0.8455772113943029,
"grad_norm": 0.27331098914146423,
"learning_rate": 7.337229698885279e-06,
"loss": 9.6088,
"step": 705
},
{
"epoch": 0.846776611694153,
"grad_norm": 0.2818980813026428,
"learning_rate": 7.225867491447053e-06,
"loss": 9.6,
"step": 706
},
{
"epoch": 0.847976011994003,
"grad_norm": 0.2784759998321533,
"learning_rate": 7.115291012211383e-06,
"loss": 9.6056,
"step": 707
},
{
"epoch": 0.8491754122938531,
"grad_norm": 0.2809768319129944,
"learning_rate": 7.005502292383898e-06,
"loss": 9.6092,
"step": 708
},
{
"epoch": 0.8503748125937032,
"grad_norm": 0.29430076479911804,
"learning_rate": 6.896503348699657e-06,
"loss": 9.6031,
"step": 709
},
{
"epoch": 0.8515742128935532,
"grad_norm": 0.28350192308425903,
"learning_rate": 6.788296183386162e-06,
"loss": 9.6105,
"step": 710
},
{
"epoch": 0.8527736131934033,
"grad_norm": 0.29121461510658264,
"learning_rate": 6.680882784126552e-06,
"loss": 9.6108,
"step": 711
},
{
"epoch": 0.8539730134932534,
"grad_norm": 0.3215639889240265,
"learning_rate": 6.5742651240230545e-06,
"loss": 9.6104,
"step": 712
},
{
"epoch": 0.8551724137931035,
"grad_norm": 0.27074047923088074,
"learning_rate": 6.46844516156081e-06,
"loss": 9.598,
"step": 713
},
{
"epoch": 0.8563718140929535,
"grad_norm": 0.2728975713253021,
"learning_rate": 6.363424840571869e-06,
"loss": 9.5965,
"step": 714
},
{
"epoch": 0.8575712143928036,
"grad_norm": 0.2756417393684387,
"learning_rate": 6.259206090199426e-06,
"loss": 9.6021,
"step": 715
},
{
"epoch": 0.8587706146926537,
"grad_norm": 0.28334730863571167,
"learning_rate": 6.155790824862484e-06,
"loss": 9.5923,
"step": 716
},
{
"epoch": 0.8599700149925037,
"grad_norm": 0.2780725359916687,
"learning_rate": 6.053180944220627e-06,
"loss": 9.5977,
"step": 717
},
{
"epoch": 0.8611694152923538,
"grad_norm": 0.2788305878639221,
"learning_rate": 5.951378333139118e-06,
"loss": 9.6104,
"step": 718
},
{
"epoch": 0.8623688155922039,
"grad_norm": 0.28093886375427246,
"learning_rate": 5.850384861654329e-06,
"loss": 9.5993,
"step": 719
},
{
"epoch": 0.863568215892054,
"grad_norm": 0.28586381673812866,
"learning_rate": 5.750202384939313e-06,
"loss": 9.6017,
"step": 720
},
{
"epoch": 0.864767616191904,
"grad_norm": 0.2862658202648163,
"learning_rate": 5.650832743269779e-06,
"loss": 9.6105,
"step": 721
},
{
"epoch": 0.8659670164917541,
"grad_norm": 0.29242414236068726,
"learning_rate": 5.552277761990294e-06,
"loss": 9.6003,
"step": 722
},
{
"epoch": 0.8671664167916042,
"grad_norm": 0.2869936525821686,
"learning_rate": 5.454539251480739e-06,
"loss": 9.6101,
"step": 723
},
{
"epoch": 0.8683658170914542,
"grad_norm": 0.30616676807403564,
"learning_rate": 5.3576190071230106e-06,
"loss": 9.6093,
"step": 724
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.49281755089759827,
"learning_rate": 5.2615188092681176e-06,
"loss": 9.6174,
"step": 725
},
{
"epoch": 0.8707646176911544,
"grad_norm": 0.27237847447395325,
"learning_rate": 5.166240423203428e-06,
"loss": 9.5972,
"step": 726
},
{
"epoch": 0.8719640179910045,
"grad_norm": 0.26910293102264404,
"learning_rate": 5.071785599120243e-06,
"loss": 9.6002,
"step": 727
},
{
"epoch": 0.8731634182908545,
"grad_norm": 0.28134414553642273,
"learning_rate": 4.978156072081669e-06,
"loss": 9.5976,
"step": 728
},
{
"epoch": 0.8743628185907046,
"grad_norm": 0.2796195149421692,
"learning_rate": 4.885353561990752e-06,
"loss": 9.6045,
"step": 729
},
{
"epoch": 0.8755622188905547,
"grad_norm": 0.2702418863773346,
"learning_rate": 4.793379773558815e-06,
"loss": 9.611,
"step": 730
},
{
"epoch": 0.8767616191904049,
"grad_norm": 0.27752310037612915,
"learning_rate": 4.7022363962742514e-06,
"loss": 9.6131,
"step": 731
},
{
"epoch": 0.8779610194902548,
"grad_norm": 0.27505457401275635,
"learning_rate": 4.6119251043714225e-06,
"loss": 9.6062,
"step": 732
},
{
"epoch": 0.879160419790105,
"grad_norm": 0.2814129590988159,
"learning_rate": 4.522447556799875e-06,
"loss": 9.6059,
"step": 733
},
{
"epoch": 0.8803598200899551,
"grad_norm": 0.28014951944351196,
"learning_rate": 4.433805397193969e-06,
"loss": 9.6106,
"step": 734
},
{
"epoch": 0.881559220389805,
"grad_norm": 0.2873791456222534,
"learning_rate": 4.3460002538425805e-06,
"loss": 9.6109,
"step": 735
},
{
"epoch": 0.8827586206896552,
"grad_norm": 0.297184020280838,
"learning_rate": 4.2590337396592406e-06,
"loss": 9.614,
"step": 736
},
{
"epoch": 0.8839580209895053,
"grad_norm": 0.3112678527832031,
"learning_rate": 4.172907452152519e-06,
"loss": 9.6144,
"step": 737
},
{
"epoch": 0.8851574212893554,
"grad_norm": 0.27748996019363403,
"learning_rate": 4.087622973396665e-06,
"loss": 9.6036,
"step": 738
},
{
"epoch": 0.8863568215892054,
"grad_norm": 0.268793523311615,
"learning_rate": 4.0031818700025095e-06,
"loss": 9.6002,
"step": 739
},
{
"epoch": 0.8875562218890555,
"grad_norm": 0.27348071336746216,
"learning_rate": 3.919585693088751e-06,
"loss": 9.5986,
"step": 740
},
{
"epoch": 0.8887556221889056,
"grad_norm": 0.2737883925437927,
"learning_rate": 3.836835978253433e-06,
"loss": 9.5968,
"step": 741
},
{
"epoch": 0.8899550224887556,
"grad_norm": 0.2754174768924713,
"learning_rate": 3.7549342455457216e-06,
"loss": 9.6053,
"step": 742
},
{
"epoch": 0.8911544227886057,
"grad_norm": 0.27495238184928894,
"learning_rate": 3.6738819994379945e-06,
"loss": 9.607,
"step": 743
},
{
"epoch": 0.8923538230884558,
"grad_norm": 0.2809472680091858,
"learning_rate": 3.593680728798238e-06,
"loss": 9.6063,
"step": 744
},
{
"epoch": 0.8935532233883059,
"grad_norm": 0.2831871211528778,
"learning_rate": 3.5143319068626225e-06,
"loss": 9.6096,
"step": 745
},
{
"epoch": 0.8947526236881559,
"grad_norm": 0.28572776913642883,
"learning_rate": 3.435836991208524e-06,
"loss": 9.606,
"step": 746
},
{
"epoch": 0.895952023988006,
"grad_norm": 0.2837792634963989,
"learning_rate": 3.35819742372771e-06,
"loss": 9.608,
"step": 747
},
{
"epoch": 0.8971514242878561,
"grad_norm": 0.2817115783691406,
"learning_rate": 3.2814146305998107e-06,
"loss": 9.6116,
"step": 748
},
{
"epoch": 0.8983508245877061,
"grad_norm": 0.3011699914932251,
"learning_rate": 3.2054900222662276e-06,
"loss": 9.6132,
"step": 749
},
{
"epoch": 0.8995502248875562,
"grad_norm": 0.342312753200531,
"learning_rate": 3.1304249934041017e-06,
"loss": 9.61,
"step": 750
},
{
"epoch": 0.9007496251874063,
"grad_norm": 0.27040937542915344,
"learning_rate": 3.0562209229008042e-06,
"loss": 9.5958,
"step": 751
},
{
"epoch": 0.9019490254872564,
"grad_norm": 0.2634391188621521,
"learning_rate": 2.982879173828523e-06,
"loss": 9.6023,
"step": 752
},
{
"epoch": 0.9031484257871064,
"grad_norm": 0.275547057390213,
"learning_rate": 2.9104010934192794e-06,
"loss": 9.6,
"step": 753
},
{
"epoch": 0.9043478260869565,
"grad_norm": 0.2732800841331482,
"learning_rate": 2.838788013040139e-06,
"loss": 9.6007,
"step": 754
},
{
"epoch": 0.9055472263868066,
"grad_norm": 0.27795758843421936,
"learning_rate": 2.768041248168801e-06,
"loss": 9.6015,
"step": 755
},
{
"epoch": 0.9067466266866566,
"grad_norm": 0.2714845836162567,
"learning_rate": 2.6981620983694057e-06,
"loss": 9.6031,
"step": 756
},
{
"epoch": 0.9079460269865067,
"grad_norm": 0.29261884093284607,
"learning_rate": 2.6291518472686404e-06,
"loss": 9.6028,
"step": 757
},
{
"epoch": 0.9091454272863568,
"grad_norm": 0.2895471453666687,
"learning_rate": 2.5610117625322118e-06,
"loss": 9.6029,
"step": 758
},
{
"epoch": 0.9103448275862069,
"grad_norm": 0.2894986867904663,
"learning_rate": 2.4937430958415278e-06,
"loss": 9.6058,
"step": 759
},
{
"epoch": 0.9115442278860569,
"grad_norm": 0.2817150950431824,
"learning_rate": 2.427347082870701e-06,
"loss": 9.6065,
"step": 760
},
{
"epoch": 0.912743628185907,
"grad_norm": 0.2893367111682892,
"learning_rate": 2.361824943263874e-06,
"loss": 9.6136,
"step": 761
},
{
"epoch": 0.9139430284857571,
"grad_norm": 0.3113311529159546,
"learning_rate": 2.2971778806127996e-06,
"loss": 9.6116,
"step": 762
},
{
"epoch": 0.9151424287856071,
"grad_norm": 0.26471975445747375,
"learning_rate": 2.233407082434724e-06,
"loss": 9.608,
"step": 763
},
{
"epoch": 0.9163418290854572,
"grad_norm": 0.2689943015575409,
"learning_rate": 2.1705137201505965e-06,
"loss": 9.6016,
"step": 764
},
{
"epoch": 0.9175412293853074,
"grad_norm": 0.2714982032775879,
"learning_rate": 2.1084989490635255e-06,
"loss": 9.5975,
"step": 765
},
{
"epoch": 0.9187406296851575,
"grad_norm": 0.2796451151371002,
"learning_rate": 2.0473639083375795e-06,
"loss": 9.6013,
"step": 766
},
{
"epoch": 0.9199400299850075,
"grad_norm": 0.2697984278202057,
"learning_rate": 1.9871097209768375e-06,
"loss": 9.6081,
"step": 767
},
{
"epoch": 0.9211394302848576,
"grad_norm": 0.2762463092803955,
"learning_rate": 1.9277374938047988e-06,
"loss": 9.6034,
"step": 768
},
{
"epoch": 0.9223388305847077,
"grad_norm": 0.28663188219070435,
"learning_rate": 1.8692483174439946e-06,
"loss": 9.5996,
"step": 769
},
{
"epoch": 0.9235382308845578,
"grad_norm": 0.28348681330680847,
"learning_rate": 1.8116432662960037e-06,
"loss": 9.6014,
"step": 770
},
{
"epoch": 0.9247376311844078,
"grad_norm": 0.2859058976173401,
"learning_rate": 1.7549233985217074e-06,
"loss": 9.6014,
"step": 771
},
{
"epoch": 0.9259370314842579,
"grad_norm": 0.2842879295349121,
"learning_rate": 1.6990897560218211e-06,
"loss": 9.6047,
"step": 772
},
{
"epoch": 0.927136431784108,
"grad_norm": 0.289318323135376,
"learning_rate": 1.644143364417794e-06,
"loss": 9.6067,
"step": 773
},
{
"epoch": 0.928335832083958,
"grad_norm": 0.29014360904693604,
"learning_rate": 1.5900852330329563e-06,
"loss": 9.6226,
"step": 774
},
{
"epoch": 0.9295352323838081,
"grad_norm": 0.3719955384731293,
"learning_rate": 1.5369163548739462e-06,
"loss": 9.6146,
"step": 775
},
{
"epoch": 0.9307346326836582,
"grad_norm": 0.2645496428012848,
"learning_rate": 1.484637706612535e-06,
"loss": 9.6015,
"step": 776
},
{
"epoch": 0.9319340329835083,
"grad_norm": 0.27676111459732056,
"learning_rate": 1.4332502485676358e-06,
"loss": 9.6031,
"step": 777
},
{
"epoch": 0.9331334332833583,
"grad_norm": 0.26751890778541565,
"learning_rate": 1.3827549246876625e-06,
"loss": 9.6031,
"step": 778
},
{
"epoch": 0.9343328335832084,
"grad_norm": 0.26769477128982544,
"learning_rate": 1.333152662533227e-06,
"loss": 9.6071,
"step": 779
},
{
"epoch": 0.9355322338830585,
"grad_norm": 0.2727934420108795,
"learning_rate": 1.2844443732600576e-06,
"loss": 9.6092,
"step": 780
},
{
"epoch": 0.9367316341829085,
"grad_norm": 0.2800491452217102,
"learning_rate": 1.2366309516022966e-06,
"loss": 9.6069,
"step": 781
},
{
"epoch": 0.9379310344827586,
"grad_norm": 0.27642175555229187,
"learning_rate": 1.189713275856047e-06,
"loss": 9.6083,
"step": 782
},
{
"epoch": 0.9391304347826087,
"grad_norm": 0.2824404239654541,
"learning_rate": 1.1436922078632394e-06,
"loss": 9.6075,
"step": 783
},
{
"epoch": 0.9403298350824588,
"grad_norm": 0.28561386466026306,
"learning_rate": 1.0985685929958134e-06,
"loss": 9.607,
"step": 784
},
{
"epoch": 0.9415292353823088,
"grad_norm": 0.29175078868865967,
"learning_rate": 1.0543432601401615e-06,
"loss": 9.6059,
"step": 785
},
{
"epoch": 0.9427286356821589,
"grad_norm": 0.29381459951400757,
"learning_rate": 1.0110170216819316e-06,
"loss": 9.6138,
"step": 786
},
{
"epoch": 0.943928035982009,
"grad_norm": 0.30094021558761597,
"learning_rate": 9.685906734910988e-07,
"loss": 9.6111,
"step": 787
},
{
"epoch": 0.945127436281859,
"grad_norm": 0.27452078461647034,
"learning_rate": 9.270649949073229e-07,
"loss": 9.5987,
"step": 788
},
{
"epoch": 0.9463268365817091,
"grad_norm": 0.26897814869880676,
"learning_rate": 8.864407487256699e-07,
"loss": 9.6044,
"step": 789
},
{
"epoch": 0.9475262368815592,
"grad_norm": 0.27936410903930664,
"learning_rate": 8.467186811825623e-07,
"loss": 9.5946,
"step": 790
},
{
"epoch": 0.9487256371814093,
"grad_norm": 0.279367595911026,
"learning_rate": 8.07899521942096e-07,
"loss": 9.6049,
"step": 791
},
{
"epoch": 0.9499250374812593,
"grad_norm": 0.2792900502681732,
"learning_rate": 7.69983984082634e-07,
"loss": 9.6013,
"step": 792
},
{
"epoch": 0.9511244377811094,
"grad_norm": 0.27940914034843445,
"learning_rate": 7.329727640837058e-07,
"loss": 9.6057,
"step": 793
},
{
"epoch": 0.9523238380809596,
"grad_norm": 0.2867107391357422,
"learning_rate": 6.968665418131848e-07,
"loss": 9.6074,
"step": 794
},
{
"epoch": 0.9535232383808095,
"grad_norm": 0.28246691823005676,
"learning_rate": 6.616659805148695e-07,
"loss": 9.6092,
"step": 795
},
{
"epoch": 0.9547226386806597,
"grad_norm": 0.2778690755367279,
"learning_rate": 6.273717267962164e-07,
"loss": 9.612,
"step": 796
},
{
"epoch": 0.9559220389805098,
"grad_norm": 0.2856720983982086,
"learning_rate": 5.93984410616527e-07,
"loss": 9.6044,
"step": 797
},
{
"epoch": 0.9571214392803599,
"grad_norm": 0.30416610836982727,
"learning_rate": 5.615046452753403e-07,
"loss": 9.6137,
"step": 798
},
{
"epoch": 0.9583208395802099,
"grad_norm": 0.2967727780342102,
"learning_rate": 5.299330274011916e-07,
"loss": 9.6139,
"step": 799
},
{
"epoch": 0.95952023988006,
"grad_norm": 0.33488917350769043,
"learning_rate": 4.992701369406161e-07,
"loss": 9.6133,
"step": 800
},
{
"epoch": 0.9607196401799101,
"grad_norm": 0.2625320851802826,
"learning_rate": 4.695165371475463e-07,
"loss": 9.6024,
"step": 801
},
{
"epoch": 0.9619190404797601,
"grad_norm": 0.27328136563301086,
"learning_rate": 4.4067277457292556e-07,
"loss": 9.5989,
"step": 802
},
{
"epoch": 0.9631184407796102,
"grad_norm": 0.27370065450668335,
"learning_rate": 4.1273937905467185e-07,
"loss": 9.6009,
"step": 803
},
{
"epoch": 0.9643178410794603,
"grad_norm": 0.27158039808273315,
"learning_rate": 3.8571686370797443e-07,
"loss": 9.6003,
"step": 804
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.2797560691833496,
"learning_rate": 3.5960572491583466e-07,
"loss": 9.6009,
"step": 805
},
{
"epoch": 0.9667166416791604,
"grad_norm": 0.27934470772743225,
"learning_rate": 3.3440644231995664e-07,
"loss": 9.6051,
"step": 806
},
{
"epoch": 0.9679160419790105,
"grad_norm": 0.2808217406272888,
"learning_rate": 3.101194788119599e-07,
"loss": 9.606,
"step": 807
},
{
"epoch": 0.9691154422788606,
"grad_norm": 0.2805561423301697,
"learning_rate": 2.867452805248416e-07,
"loss": 9.6049,
"step": 808
},
{
"epoch": 0.9703148425787106,
"grad_norm": 0.2780965566635132,
"learning_rate": 2.642842768248055e-07,
"loss": 9.6102,
"step": 809
},
{
"epoch": 0.9715142428785607,
"grad_norm": 0.28727665543556213,
"learning_rate": 2.4273688030336805e-07,
"loss": 9.6085,
"step": 810
},
{
"epoch": 0.9727136431784108,
"grad_norm": 0.29147574305534363,
"learning_rate": 2.2210348676977023e-07,
"loss": 9.6056,
"step": 811
},
{
"epoch": 0.9739130434782609,
"grad_norm": 0.30791252851486206,
"learning_rate": 2.0238447524372205e-07,
"loss": 9.6104,
"step": 812
},
{
"epoch": 0.9751124437781109,
"grad_norm": 0.27840015292167664,
"learning_rate": 1.8358020794843056e-07,
"loss": 9.5996,
"step": 813
},
{
"epoch": 0.976311844077961,
"grad_norm": 0.26889273524284363,
"learning_rate": 1.6569103030394938e-07,
"loss": 9.6008,
"step": 814
},
{
"epoch": 0.9775112443778111,
"grad_norm": 0.2736169397830963,
"learning_rate": 1.48717270920834e-07,
"loss": 9.5996,
"step": 815
},
{
"epoch": 0.9787106446776612,
"grad_norm": 0.27559977769851685,
"learning_rate": 1.3265924159410192e-07,
"loss": 9.5988,
"step": 816
},
{
"epoch": 0.9799100449775112,
"grad_norm": 0.27969279885292053,
"learning_rate": 1.1751723729750974e-07,
"loss": 9.5987,
"step": 817
},
{
"epoch": 0.9811094452773613,
"grad_norm": 0.28211551904678345,
"learning_rate": 1.0329153617812947e-07,
"loss": 9.5975,
"step": 818
},
{
"epoch": 0.9823088455772114,
"grad_norm": 0.27674898505210876,
"learning_rate": 8.998239955124721e-08,
"loss": 9.6071,
"step": 819
},
{
"epoch": 0.9835082458770614,
"grad_norm": 0.27468326687812805,
"learning_rate": 7.759007189555579e-08,
"loss": 9.6077,
"step": 820
},
{
"epoch": 0.9847076461769115,
"grad_norm": 0.2778529226779938,
"learning_rate": 6.611478084866951e-08,
"loss": 9.6102,
"step": 821
},
{
"epoch": 0.9859070464767616,
"grad_norm": 0.27971234917640686,
"learning_rate": 5.555673720292753e-08,
"loss": 9.6097,
"step": 822
},
{
"epoch": 0.9871064467766117,
"grad_norm": 0.29513809084892273,
"learning_rate": 4.5916134901552443e-08,
"loss": 9.6067,
"step": 823
},
{
"epoch": 0.9883058470764617,
"grad_norm": 0.2978692352771759,
"learning_rate": 3.7193151035047616e-08,
"loss": 9.6096,
"step": 824
},
{
"epoch": 0.9895052473763118,
"grad_norm": 0.46041354537010193,
"learning_rate": 2.93879458379831e-08,
"loss": 9.615,
"step": 825
},
{
"epoch": 0.990704647676162,
"grad_norm": 0.270831823348999,
"learning_rate": 2.2500662686025797e-08,
"loss": 9.5977,
"step": 826
},
{
"epoch": 0.991904047976012,
"grad_norm": 0.2735693156719208,
"learning_rate": 1.653142809331376e-08,
"loss": 9.6035,
"step": 827
},
{
"epoch": 0.993103448275862,
"grad_norm": 0.27037888765335083,
"learning_rate": 1.148035171014139e-08,
"loss": 9.6053,
"step": 828
},
{
"epoch": 0.9943028485757122,
"grad_norm": 0.27673718333244324,
"learning_rate": 7.347526320927723e-09,
"loss": 9.6065,
"step": 829
},
{
"epoch": 0.9955022488755623,
"grad_norm": 0.29478275775909424,
"learning_rate": 4.133027842517789e-09,
"loss": 9.605,
"step": 830
},
{
"epoch": 0.9967016491754123,
"grad_norm": 0.2775980830192566,
"learning_rate": 1.8369153228114944e-09,
"loss": 9.6047,
"step": 831
},
{
"epoch": 0.9979010494752624,
"grad_norm": 0.27957436442375183,
"learning_rate": 4.5923093963118335e-10,
"loss": 9.6086,
"step": 832
},
{
"epoch": 0.9991004497751125,
"grad_norm": 0.2953091561794281,
"learning_rate": 0.0,
"loss": 9.6163,
"step": 833
}
],
"logging_steps": 1,
"max_steps": 833,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 239,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 691668038713344.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}