{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.15600928255231186,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00015600928255231187,
"grad_norm": 0.5937075614929199,
"learning_rate": 2.666666666666667e-06,
"loss": 3.7681,
"step": 1
},
{
"epoch": 0.00031201856510462375,
"grad_norm": 0.5836828947067261,
"learning_rate": 5.333333333333334e-06,
"loss": 3.8601,
"step": 2
},
{
"epoch": 0.00046802784765693557,
"grad_norm": 0.6111788749694824,
"learning_rate": 8.000000000000001e-06,
"loss": 3.643,
"step": 3
},
{
"epoch": 0.0006240371302092475,
"grad_norm": 0.5807424783706665,
"learning_rate": 1.0666666666666667e-05,
"loss": 3.724,
"step": 4
},
{
"epoch": 0.0007800464127615594,
"grad_norm": 0.5708947777748108,
"learning_rate": 1.3333333333333333e-05,
"loss": 3.9727,
"step": 5
},
{
"epoch": 0.0009360556953138711,
"grad_norm": 0.5662252902984619,
"learning_rate": 1.6000000000000003e-05,
"loss": 3.6801,
"step": 6
},
{
"epoch": 0.001092064977866183,
"grad_norm": 0.5653729438781738,
"learning_rate": 1.866666666666667e-05,
"loss": 3.6898,
"step": 7
},
{
"epoch": 0.001248074260418495,
"grad_norm": 0.5451233983039856,
"learning_rate": 2.1333333333333335e-05,
"loss": 3.5484,
"step": 8
},
{
"epoch": 0.0014040835429708068,
"grad_norm": 0.5682435035705566,
"learning_rate": 2.4e-05,
"loss": 3.538,
"step": 9
},
{
"epoch": 0.0015600928255231187,
"grad_norm": 0.6180667877197266,
"learning_rate": 2.6666666666666667e-05,
"loss": 3.6757,
"step": 10
},
{
"epoch": 0.0017161021080754305,
"grad_norm": 0.6358373165130615,
"learning_rate": 2.9333333333333336e-05,
"loss": 3.6489,
"step": 11
},
{
"epoch": 0.0018721113906277423,
"grad_norm": 0.6643233895301819,
"learning_rate": 3.2000000000000005e-05,
"loss": 3.4313,
"step": 12
},
{
"epoch": 0.0020281206731800542,
"grad_norm": 0.6591399908065796,
"learning_rate": 3.466666666666667e-05,
"loss": 3.472,
"step": 13
},
{
"epoch": 0.002184129955732366,
"grad_norm": 0.8929205536842346,
"learning_rate": 3.733333333333334e-05,
"loss": 3.3531,
"step": 14
},
{
"epoch": 0.0023401392382846778,
"grad_norm": 1.2845464944839478,
"learning_rate": 4e-05,
"loss": 3.143,
"step": 15
},
{
"epoch": 0.00249614852083699,
"grad_norm": 1.216373085975647,
"learning_rate": 4.266666666666667e-05,
"loss": 3.1297,
"step": 16
},
{
"epoch": 0.0026521578033893017,
"grad_norm": 0.9192391633987427,
"learning_rate": 4.5333333333333335e-05,
"loss": 2.9826,
"step": 17
},
{
"epoch": 0.0028081670859416135,
"grad_norm": 0.8917486667633057,
"learning_rate": 4.8e-05,
"loss": 2.9068,
"step": 18
},
{
"epoch": 0.0029641763684939253,
"grad_norm": 0.7141512632369995,
"learning_rate": 5.0666666666666674e-05,
"loss": 2.7797,
"step": 19
},
{
"epoch": 0.0031201856510462375,
"grad_norm": 0.8795380592346191,
"learning_rate": 5.333333333333333e-05,
"loss": 2.2265,
"step": 20
},
{
"epoch": 0.0032761949335985492,
"grad_norm": 1.047784447669983,
"learning_rate": 5.6000000000000006e-05,
"loss": 2.6089,
"step": 21
},
{
"epoch": 0.003432204216150861,
"grad_norm": 1.0959978103637695,
"learning_rate": 5.866666666666667e-05,
"loss": 2.3416,
"step": 22
},
{
"epoch": 0.0035882134987031728,
"grad_norm": 1.283445954322815,
"learning_rate": 6.133333333333334e-05,
"loss": 2.0565,
"step": 23
},
{
"epoch": 0.0037442227812554845,
"grad_norm": 1.655569314956665,
"learning_rate": 6.400000000000001e-05,
"loss": 1.6648,
"step": 24
},
{
"epoch": 0.0039002320638077967,
"grad_norm": 1.4048818349838257,
"learning_rate": 6.666666666666667e-05,
"loss": 1.7566,
"step": 25
},
{
"epoch": 0.0040562413463601085,
"grad_norm": 1.0755441188812256,
"learning_rate": 6.933333333333334e-05,
"loss": 1.6591,
"step": 26
},
{
"epoch": 0.00421225062891242,
"grad_norm": 0.7240940928459167,
"learning_rate": 7.2e-05,
"loss": 2.21,
"step": 27
},
{
"epoch": 0.004368259911464732,
"grad_norm": 0.48980680108070374,
"learning_rate": 7.466666666666667e-05,
"loss": 1.8157,
"step": 28
},
{
"epoch": 0.004524269194017044,
"grad_norm": 0.4145239591598511,
"learning_rate": 7.733333333333333e-05,
"loss": 1.8679,
"step": 29
},
{
"epoch": 0.0046802784765693555,
"grad_norm": 0.3905705213546753,
"learning_rate": 8e-05,
"loss": 1.5733,
"step": 30
},
{
"epoch": 0.004836287759121667,
"grad_norm": 0.40969792008399963,
"learning_rate": 8.266666666666667e-05,
"loss": 1.531,
"step": 31
},
{
"epoch": 0.00499229704167398,
"grad_norm": 0.4269125759601593,
"learning_rate": 8.533333333333334e-05,
"loss": 1.3705,
"step": 32
},
{
"epoch": 0.005148306324226292,
"grad_norm": 0.5876020789146423,
"learning_rate": 8.800000000000001e-05,
"loss": 1.4055,
"step": 33
},
{
"epoch": 0.0053043156067786035,
"grad_norm": 0.39753037691116333,
"learning_rate": 9.066666666666667e-05,
"loss": 1.5403,
"step": 34
},
{
"epoch": 0.005460324889330915,
"grad_norm": 0.4157419800758362,
"learning_rate": 9.333333333333334e-05,
"loss": 1.5698,
"step": 35
},
{
"epoch": 0.005616334171883227,
"grad_norm": 0.4430864155292511,
"learning_rate": 9.6e-05,
"loss": 1.5466,
"step": 36
},
{
"epoch": 0.005772343454435539,
"grad_norm": 0.5259338021278381,
"learning_rate": 9.866666666666668e-05,
"loss": 1.5878,
"step": 37
},
{
"epoch": 0.0059283527369878505,
"grad_norm": 0.4409235417842865,
"learning_rate": 0.00010133333333333335,
"loss": 1.4203,
"step": 38
},
{
"epoch": 0.006084362019540162,
"grad_norm": 0.5432307124137878,
"learning_rate": 0.00010400000000000001,
"loss": 1.3843,
"step": 39
},
{
"epoch": 0.006240371302092475,
"grad_norm": 0.5789123177528381,
"learning_rate": 0.00010666666666666667,
"loss": 1.4649,
"step": 40
},
{
"epoch": 0.006396380584644787,
"grad_norm": 0.5596875548362732,
"learning_rate": 0.00010933333333333333,
"loss": 1.3569,
"step": 41
},
{
"epoch": 0.0065523898671970985,
"grad_norm": 0.6517161726951599,
"learning_rate": 0.00011200000000000001,
"loss": 1.4306,
"step": 42
},
{
"epoch": 0.00670839914974941,
"grad_norm": 0.7665486335754395,
"learning_rate": 0.00011466666666666667,
"loss": 1.5369,
"step": 43
},
{
"epoch": 0.006864408432301722,
"grad_norm": 0.8421632647514343,
"learning_rate": 0.00011733333333333334,
"loss": 1.5651,
"step": 44
},
{
"epoch": 0.007020417714854034,
"grad_norm": 0.8437005877494812,
"learning_rate": 0.00012,
"loss": 1.33,
"step": 45
},
{
"epoch": 0.0071764269974063455,
"grad_norm": 0.3544560670852661,
"learning_rate": 0.00012266666666666668,
"loss": 1.5436,
"step": 46
},
{
"epoch": 0.007332436279958657,
"grad_norm": 0.35725343227386475,
"learning_rate": 0.00012533333333333334,
"loss": 1.5792,
"step": 47
},
{
"epoch": 0.007488445562510969,
"grad_norm": 0.7203790545463562,
"learning_rate": 0.00012800000000000002,
"loss": 1.6707,
"step": 48
},
{
"epoch": 0.007644454845063282,
"grad_norm": 0.352791428565979,
"learning_rate": 0.00013066666666666668,
"loss": 1.5251,
"step": 49
},
{
"epoch": 0.0078004641276155934,
"grad_norm": 0.49014368653297424,
"learning_rate": 0.00013333333333333334,
"loss": 1.3478,
"step": 50
},
{
"epoch": 0.007956473410167905,
"grad_norm": 0.29890525341033936,
"learning_rate": 0.00013600000000000003,
"loss": 1.3164,
"step": 51
},
{
"epoch": 0.008112482692720217,
"grad_norm": 0.34632885456085205,
"learning_rate": 0.00013866666666666669,
"loss": 1.3785,
"step": 52
},
{
"epoch": 0.008268491975272529,
"grad_norm": 0.3631187677383423,
"learning_rate": 0.00014133333333333334,
"loss": 1.7873,
"step": 53
},
{
"epoch": 0.00842450125782484,
"grad_norm": 0.290487140417099,
"learning_rate": 0.000144,
"loss": 1.1681,
"step": 54
},
{
"epoch": 0.008580510540377152,
"grad_norm": 0.3136501610279083,
"learning_rate": 0.00014666666666666666,
"loss": 1.1332,
"step": 55
},
{
"epoch": 0.008736519822929464,
"grad_norm": 0.3708946406841278,
"learning_rate": 0.00014933333333333335,
"loss": 1.4316,
"step": 56
},
{
"epoch": 0.008892529105481776,
"grad_norm": 0.3645316958427429,
"learning_rate": 0.000152,
"loss": 1.3522,
"step": 57
},
{
"epoch": 0.009048538388034088,
"grad_norm": 0.4074520170688629,
"learning_rate": 0.00015466666666666667,
"loss": 1.5344,
"step": 58
},
{
"epoch": 0.0092045476705864,
"grad_norm": 0.3106740713119507,
"learning_rate": 0.00015733333333333333,
"loss": 1.2959,
"step": 59
},
{
"epoch": 0.009360556953138711,
"grad_norm": 0.32623976469039917,
"learning_rate": 0.00016,
"loss": 1.6472,
"step": 60
},
{
"epoch": 0.009516566235691023,
"grad_norm": 0.35396724939346313,
"learning_rate": 0.00016266666666666667,
"loss": 1.2655,
"step": 61
},
{
"epoch": 0.009672575518243335,
"grad_norm": 0.3455830216407776,
"learning_rate": 0.00016533333333333333,
"loss": 1.2153,
"step": 62
},
{
"epoch": 0.009828584800795648,
"grad_norm": 0.3116808235645294,
"learning_rate": 0.000168,
"loss": 1.0851,
"step": 63
},
{
"epoch": 0.00998459408334796,
"grad_norm": 0.3416989743709564,
"learning_rate": 0.00017066666666666668,
"loss": 1.5828,
"step": 64
},
{
"epoch": 0.010140603365900272,
"grad_norm": 0.3509654104709625,
"learning_rate": 0.00017333333333333334,
"loss": 1.4832,
"step": 65
},
{
"epoch": 0.010296612648452583,
"grad_norm": 0.3034147322177887,
"learning_rate": 0.00017600000000000002,
"loss": 1.4326,
"step": 66
},
{
"epoch": 0.010452621931004895,
"grad_norm": 0.3084355890750885,
"learning_rate": 0.00017866666666666668,
"loss": 1.2452,
"step": 67
},
{
"epoch": 0.010608631213557207,
"grad_norm": 0.3001956343650818,
"learning_rate": 0.00018133333333333334,
"loss": 1.2484,
"step": 68
},
{
"epoch": 0.010764640496109519,
"grad_norm": 0.30605360865592957,
"learning_rate": 0.00018400000000000003,
"loss": 1.2137,
"step": 69
},
{
"epoch": 0.01092064977866183,
"grad_norm": 0.32967764139175415,
"learning_rate": 0.0001866666666666667,
"loss": 1.4,
"step": 70
},
{
"epoch": 0.011076659061214142,
"grad_norm": 0.3161776661872864,
"learning_rate": 0.00018933333333333335,
"loss": 1.3203,
"step": 71
},
{
"epoch": 0.011232668343766454,
"grad_norm": 0.28808867931365967,
"learning_rate": 0.000192,
"loss": 1.3034,
"step": 72
},
{
"epoch": 0.011388677626318766,
"grad_norm": 0.2804367244243622,
"learning_rate": 0.0001946666666666667,
"loss": 1.2753,
"step": 73
},
{
"epoch": 0.011544686908871078,
"grad_norm": 0.30980467796325684,
"learning_rate": 0.00019733333333333335,
"loss": 1.3733,
"step": 74
},
{
"epoch": 0.01170069619142339,
"grad_norm": 0.31240588426589966,
"learning_rate": 0.0002,
"loss": 1.1602,
"step": 75
},
{
"epoch": 0.011856705473975701,
"grad_norm": 0.28906041383743286,
"learning_rate": 0.00019999991608372393,
"loss": 1.3243,
"step": 76
},
{
"epoch": 0.012012714756528013,
"grad_norm": 0.2740985155105591,
"learning_rate": 0.00019999966433503652,
"loss": 1.1853,
"step": 77
},
{
"epoch": 0.012168724039080325,
"grad_norm": 0.30425482988357544,
"learning_rate": 0.0001999992447543603,
"loss": 1.3282,
"step": 78
},
{
"epoch": 0.012324733321632636,
"grad_norm": 0.3216018080711365,
"learning_rate": 0.00019999865734239946,
"loss": 1.3696,
"step": 79
},
{
"epoch": 0.01248074260418495,
"grad_norm": 0.34770438075065613,
"learning_rate": 0.00019999790210013988,
"loss": 1.261,
"step": 80
},
{
"epoch": 0.012636751886737262,
"grad_norm": 0.3883892297744751,
"learning_rate": 0.0001999969790288491,
"loss": 1.5873,
"step": 81
},
{
"epoch": 0.012792761169289573,
"grad_norm": 0.3061410188674927,
"learning_rate": 0.00019999588813007633,
"loss": 1.5559,
"step": 82
},
{
"epoch": 0.012948770451841885,
"grad_norm": 0.3044775128364563,
"learning_rate": 0.00019999462940565243,
"loss": 1.2439,
"step": 83
},
{
"epoch": 0.013104779734394197,
"grad_norm": 0.3562803864479065,
"learning_rate": 0.00019999320285769,
"loss": 1.4121,
"step": 84
},
{
"epoch": 0.013260789016946509,
"grad_norm": 0.3367731273174286,
"learning_rate": 0.0001999916084885832,
"loss": 1.1937,
"step": 85
},
{
"epoch": 0.01341679829949882,
"grad_norm": 0.3613661527633667,
"learning_rate": 0.00019998984630100792,
"loss": 1.4267,
"step": 86
},
{
"epoch": 0.013572807582051132,
"grad_norm": 0.30924999713897705,
"learning_rate": 0.0001999879162979217,
"loss": 1.3358,
"step": 87
},
{
"epoch": 0.013728816864603444,
"grad_norm": 0.34925562143325806,
"learning_rate": 0.0001999858184825637,
"loss": 1.3758,
"step": 88
},
{
"epoch": 0.013884826147155756,
"grad_norm": 0.3182036280632019,
"learning_rate": 0.00019998355285845475,
"loss": 1.3151,
"step": 89
},
{
"epoch": 0.014040835429708068,
"grad_norm": 0.6028950810432434,
"learning_rate": 0.0001999811194293973,
"loss": 1.3797,
"step": 90
},
{
"epoch": 0.01419684471226038,
"grad_norm": 0.3221015930175781,
"learning_rate": 0.00019997851819947537,
"loss": 1.3293,
"step": 91
},
{
"epoch": 0.014352853994812691,
"grad_norm": 0.3003532290458679,
"learning_rate": 0.00019997574917305478,
"loss": 1.5671,
"step": 92
},
{
"epoch": 0.014508863277365003,
"grad_norm": 0.32144418358802795,
"learning_rate": 0.00019997281235478278,
"loss": 1.3733,
"step": 93
},
{
"epoch": 0.014664872559917315,
"grad_norm": 1.3427015542984009,
"learning_rate": 0.00019996970774958836,
"loss": 1.246,
"step": 94
},
{
"epoch": 0.014820881842469626,
"grad_norm": 0.3254302144050598,
"learning_rate": 0.00019996643536268204,
"loss": 1.3829,
"step": 95
},
{
"epoch": 0.014976891125021938,
"grad_norm": 0.2829325795173645,
"learning_rate": 0.0001999629951995559,
"loss": 1.2176,
"step": 96
},
{
"epoch": 0.01513290040757425,
"grad_norm": 0.2943004071712494,
"learning_rate": 0.00019995938726598373,
"loss": 1.4021,
"step": 97
},
{
"epoch": 0.015288909690126563,
"grad_norm": 0.2698727548122406,
"learning_rate": 0.00019995561156802079,
"loss": 1.2897,
"step": 98
},
{
"epoch": 0.015444918972678875,
"grad_norm": 0.32416194677352905,
"learning_rate": 0.0001999516681120039,
"loss": 1.218,
"step": 99
},
{
"epoch": 0.015600928255231187,
"grad_norm": 0.3309131860733032,
"learning_rate": 0.00019994755690455152,
"loss": 1.4658,
"step": 100
},
{
"epoch": 0.015756937537783497,
"grad_norm": 0.31126394867897034,
"learning_rate": 0.0001999432779525635,
"loss": 1.5518,
"step": 101
},
{
"epoch": 0.01591294682033581,
"grad_norm": 0.28427934646606445,
"learning_rate": 0.0001999388312632214,
"loss": 1.1435,
"step": 102
},
{
"epoch": 0.01606895610288812,
"grad_norm": 0.28065958619117737,
"learning_rate": 0.00019993421684398824,
"loss": 1.3537,
"step": 103
},
{
"epoch": 0.016224965385440434,
"grad_norm": 0.3787417411804199,
"learning_rate": 0.00019992943470260844,
"loss": 1.2151,
"step": 104
},
{
"epoch": 0.016380974667992744,
"grad_norm": 0.32704487442970276,
"learning_rate": 0.00019992448484710797,
"loss": 1.1383,
"step": 105
},
{
"epoch": 0.016536983950545057,
"grad_norm": 0.34436190128326416,
"learning_rate": 0.00019991936728579437,
"loss": 1.3949,
"step": 106
},
{
"epoch": 0.01669299323309737,
"grad_norm": 0.29938092827796936,
"learning_rate": 0.00019991408202725655,
"loss": 1.2821,
"step": 107
},
{
"epoch": 0.01684900251564968,
"grad_norm": 0.3192508816719055,
"learning_rate": 0.0001999086290803649,
"loss": 1.3655,
"step": 108
},
{
"epoch": 0.017005011798201995,
"grad_norm": 0.2626635730266571,
"learning_rate": 0.00019990300845427125,
"loss": 1.2366,
"step": 109
},
{
"epoch": 0.017161021080754305,
"grad_norm": 0.288725882768631,
"learning_rate": 0.0001998972201584088,
"loss": 1.0589,
"step": 110
},
{
"epoch": 0.017317030363306618,
"grad_norm": 0.3340204358100891,
"learning_rate": 0.00019989126420249221,
"loss": 1.7077,
"step": 111
},
{
"epoch": 0.017473039645858928,
"grad_norm": 0.27165043354034424,
"learning_rate": 0.00019988514059651752,
"loss": 1.3596,
"step": 112
},
{
"epoch": 0.01762904892841124,
"grad_norm": 0.2751217186450958,
"learning_rate": 0.00019987884935076213,
"loss": 1.281,
"step": 113
},
{
"epoch": 0.01778505821096355,
"grad_norm": 0.2712443172931671,
"learning_rate": 0.00019987239047578482,
"loss": 1.2686,
"step": 114
},
{
"epoch": 0.017941067493515865,
"grad_norm": 0.2898474931716919,
"learning_rate": 0.00019986576398242566,
"loss": 1.2425,
"step": 115
},
{
"epoch": 0.018097076776068175,
"grad_norm": 0.29883307218551636,
"learning_rate": 0.00019985896988180605,
"loss": 1.6326,
"step": 116
},
{
"epoch": 0.01825308605862049,
"grad_norm": 0.2548903524875641,
"learning_rate": 0.00019985200818532875,
"loss": 1.317,
"step": 117
},
{
"epoch": 0.0184090953411728,
"grad_norm": 0.260768860578537,
"learning_rate": 0.0001998448789046777,
"loss": 1.4137,
"step": 118
},
{
"epoch": 0.018565104623725112,
"grad_norm": 0.27813923358917236,
"learning_rate": 0.00019983758205181822,
"loss": 1.1758,
"step": 119
},
{
"epoch": 0.018721113906277422,
"grad_norm": 0.29539602994918823,
"learning_rate": 0.00019983011763899673,
"loss": 1.2805,
"step": 120
},
{
"epoch": 0.018877123188829736,
"grad_norm": 0.2691763937473297,
"learning_rate": 0.00019982248567874098,
"loss": 1.3098,
"step": 121
},
{
"epoch": 0.019033132471382046,
"grad_norm": 0.2895521819591522,
"learning_rate": 0.00019981468618385988,
"loss": 1.1475,
"step": 122
},
{
"epoch": 0.01918914175393436,
"grad_norm": 0.24555402994155884,
"learning_rate": 0.00019980671916744352,
"loss": 1.075,
"step": 123
},
{
"epoch": 0.01934515103648667,
"grad_norm": 0.29935726523399353,
"learning_rate": 0.00019979858464286317,
"loss": 1.278,
"step": 124
},
{
"epoch": 0.019501160319038983,
"grad_norm": 0.3469449579715729,
"learning_rate": 0.00019979028262377118,
"loss": 1.602,
"step": 125
},
{
"epoch": 0.019657169601591296,
"grad_norm": 0.2707567811012268,
"learning_rate": 0.00019978181312410104,
"loss": 1.3181,
"step": 126
},
{
"epoch": 0.019813178884143606,
"grad_norm": 0.32349273562431335,
"learning_rate": 0.00019977317615806737,
"loss": 1.4862,
"step": 127
},
{
"epoch": 0.01996918816669592,
"grad_norm": 0.24527911841869354,
"learning_rate": 0.00019976437174016573,
"loss": 1.169,
"step": 128
},
{
"epoch": 0.02012519744924823,
"grad_norm": 0.2882062494754791,
"learning_rate": 0.00019975539988517288,
"loss": 1.275,
"step": 129
},
{
"epoch": 0.020281206731800543,
"grad_norm": 0.3206437826156616,
"learning_rate": 0.00019974626060814647,
"loss": 1.682,
"step": 130
},
{
"epoch": 0.020437216014352853,
"grad_norm": 0.3423447012901306,
"learning_rate": 0.0001997369539244252,
"loss": 1.2018,
"step": 131
},
{
"epoch": 0.020593225296905167,
"grad_norm": 0.29081955552101135,
"learning_rate": 0.0001997274798496287,
"loss": 1.5849,
"step": 132
},
{
"epoch": 0.020749234579457477,
"grad_norm": 0.2659798860549927,
"learning_rate": 0.00019971783839965756,
"loss": 1.1371,
"step": 133
},
{
"epoch": 0.02090524386200979,
"grad_norm": 0.3395417034626007,
"learning_rate": 0.00019970802959069328,
"loss": 1.5046,
"step": 134
},
{
"epoch": 0.0210612531445621,
"grad_norm": 0.22527103126049042,
"learning_rate": 0.00019969805343919821,
"loss": 1.0543,
"step": 135
},
{
"epoch": 0.021217262427114414,
"grad_norm": 0.30680522322654724,
"learning_rate": 0.0001996879099619156,
"loss": 1.5067,
"step": 136
},
{
"epoch": 0.021373271709666724,
"grad_norm": 0.22828875482082367,
"learning_rate": 0.00019967759917586953,
"loss": 1.1201,
"step": 137
},
{
"epoch": 0.021529280992219037,
"grad_norm": 0.2578384280204773,
"learning_rate": 0.00019966712109836476,
"loss": 1.104,
"step": 138
},
{
"epoch": 0.021685290274771347,
"grad_norm": 0.23175813257694244,
"learning_rate": 0.000199656475746987,
"loss": 0.9706,
"step": 139
},
{
"epoch": 0.02184129955732366,
"grad_norm": 0.29308339953422546,
"learning_rate": 0.00019964566313960264,
"loss": 1.4769,
"step": 140
},
{
"epoch": 0.02199730883987597,
"grad_norm": 0.3059382438659668,
"learning_rate": 0.0001996346832943587,
"loss": 1.4555,
"step": 141
},
{
"epoch": 0.022153318122428284,
"grad_norm": 0.2929370701313019,
"learning_rate": 0.00019962353622968295,
"loss": 1.4051,
"step": 142
},
{
"epoch": 0.022309327404980598,
"grad_norm": 0.24365079402923584,
"learning_rate": 0.00019961222196428378,
"loss": 1.189,
"step": 143
},
{
"epoch": 0.022465336687532908,
"grad_norm": 0.27418485283851624,
"learning_rate": 0.0001996007405171502,
"loss": 1.206,
"step": 144
},
{
"epoch": 0.02262134597008522,
"grad_norm": 0.2554856836795807,
"learning_rate": 0.00019958909190755187,
"loss": 1.4053,
"step": 145
},
{
"epoch": 0.02277735525263753,
"grad_norm": 0.2674770951271057,
"learning_rate": 0.00019957727615503888,
"loss": 1.2412,
"step": 146
},
{
"epoch": 0.022933364535189845,
"grad_norm": 0.3177204728126526,
"learning_rate": 0.00019956529327944198,
"loss": 1.4231,
"step": 147
},
{
"epoch": 0.023089373817742155,
"grad_norm": 0.2678688168525696,
"learning_rate": 0.00019955314330087225,
"loss": 1.2494,
"step": 148
},
{
"epoch": 0.02324538310029447,
"grad_norm": 0.28164568543434143,
"learning_rate": 0.00019954082623972142,
"loss": 1.2008,
"step": 149
},
{
"epoch": 0.02340139238284678,
"grad_norm": 0.2897564172744751,
"learning_rate": 0.0001995283421166614,
"loss": 1.463,
"step": 150
},
{
"epoch": 0.023557401665399092,
"grad_norm": 0.276509165763855,
"learning_rate": 0.00019951569095264473,
"loss": 1.4891,
"step": 151
},
{
"epoch": 0.023713410947951402,
"grad_norm": 0.2585453689098358,
"learning_rate": 0.0001995028727689041,
"loss": 1.1551,
"step": 152
},
{
"epoch": 0.023869420230503716,
"grad_norm": 0.25659292936325073,
"learning_rate": 0.00019948988758695263,
"loss": 1.1622,
"step": 153
},
{
"epoch": 0.024025429513056026,
"grad_norm": 0.27132928371429443,
"learning_rate": 0.00019947673542858367,
"loss": 1.2015,
"step": 154
},
{
"epoch": 0.02418143879560834,
"grad_norm": 0.2951599955558777,
"learning_rate": 0.00019946341631587087,
"loss": 1.1842,
"step": 155
},
{
"epoch": 0.02433744807816065,
"grad_norm": 0.3114786148071289,
"learning_rate": 0.00019944993027116797,
"loss": 1.4509,
"step": 156
},
{
"epoch": 0.024493457360712963,
"grad_norm": 0.25183674693107605,
"learning_rate": 0.00019943627731710897,
"loss": 1.1474,
"step": 157
},
{
"epoch": 0.024649466643265273,
"grad_norm": 0.2717629075050354,
"learning_rate": 0.00019942245747660796,
"loss": 1.2899,
"step": 158
},
{
"epoch": 0.024805475925817586,
"grad_norm": 0.2532605826854706,
"learning_rate": 0.00019940847077285916,
"loss": 1.0811,
"step": 159
},
{
"epoch": 0.0249614852083699,
"grad_norm": 0.2951716482639313,
"learning_rate": 0.0001993943172293368,
"loss": 1.6252,
"step": 160
},
{
"epoch": 0.02511749449092221,
"grad_norm": 0.29894542694091797,
"learning_rate": 0.0001993799968697951,
"loss": 1.3754,
"step": 161
},
{
"epoch": 0.025273503773474523,
"grad_norm": 0.28648853302001953,
"learning_rate": 0.00019936550971826834,
"loss": 1.2769,
"step": 162
},
{
"epoch": 0.025429513056026833,
"grad_norm": 0.2540144920349121,
"learning_rate": 0.00019935085579907063,
"loss": 1.281,
"step": 163
},
{
"epoch": 0.025585522338579147,
"grad_norm": 0.30044910311698914,
"learning_rate": 0.00019933603513679605,
"loss": 1.1689,
"step": 164
},
{
"epoch": 0.025741531621131457,
"grad_norm": 0.31799909472465515,
"learning_rate": 0.00019932104775631846,
"loss": 1.287,
"step": 165
},
{
"epoch": 0.02589754090368377,
"grad_norm": 0.290565550327301,
"learning_rate": 0.0001993058936827916,
"loss": 1.4751,
"step": 166
},
{
"epoch": 0.02605355018623608,
"grad_norm": 0.28967443108558655,
"learning_rate": 0.00019929057294164893,
"loss": 1.2459,
"step": 167
},
{
"epoch": 0.026209559468788394,
"grad_norm": 0.25141966342926025,
"learning_rate": 0.0001992750855586036,
"loss": 1.1215,
"step": 168
},
{
"epoch": 0.026365568751340704,
"grad_norm": 0.2819644808769226,
"learning_rate": 0.00019925943155964856,
"loss": 1.5238,
"step": 169
},
{
"epoch": 0.026521578033893017,
"grad_norm": 0.2336016446352005,
"learning_rate": 0.00019924361097105623,
"loss": 1.2218,
"step": 170
},
{
"epoch": 0.026677587316445327,
"grad_norm": 0.23773479461669922,
"learning_rate": 0.00019922762381937878,
"loss": 1.0842,
"step": 171
},
{
"epoch": 0.02683359659899764,
"grad_norm": 0.266222208738327,
"learning_rate": 0.0001992114701314478,
"loss": 1.2076,
"step": 172
},
{
"epoch": 0.02698960588154995,
"grad_norm": 0.29275181889533997,
"learning_rate": 0.00019919514993437445,
"loss": 1.3901,
"step": 173
},
{
"epoch": 0.027145615164102264,
"grad_norm": 0.2334383726119995,
"learning_rate": 0.00019917866325554938,
"loss": 1.2012,
"step": 174
},
{
"epoch": 0.027301624446654574,
"grad_norm": 0.293888121843338,
"learning_rate": 0.00019916201012264254,
"loss": 1.6131,
"step": 175
},
{
"epoch": 0.027457633729206888,
"grad_norm": 0.3042750954627991,
"learning_rate": 0.0001991451905636033,
"loss": 1.3144,
"step": 176
},
{
"epoch": 0.027613643011759198,
"grad_norm": 0.2652626633644104,
"learning_rate": 0.00019912820460666044,
"loss": 1.4368,
"step": 177
},
{
"epoch": 0.02776965229431151,
"grad_norm": 0.28741374611854553,
"learning_rate": 0.00019911105228032186,
"loss": 1.4643,
"step": 178
},
{
"epoch": 0.027925661576863825,
"grad_norm": 0.2808038890361786,
"learning_rate": 0.00019909373361337476,
"loss": 1.3013,
"step": 179
},
{
"epoch": 0.028081670859416135,
"grad_norm": 0.22930848598480225,
"learning_rate": 0.0001990762486348855,
"loss": 1.0587,
"step": 180
},
{
"epoch": 0.02823768014196845,
"grad_norm": 0.24289073050022125,
"learning_rate": 0.00019905859737419956,
"loss": 1.1174,
"step": 181
},
{
"epoch": 0.02839368942452076,
"grad_norm": 0.2626672685146332,
"learning_rate": 0.00019904077986094152,
"loss": 1.1746,
"step": 182
},
{
"epoch": 0.028549698707073072,
"grad_norm": 0.3174870014190674,
"learning_rate": 0.00019902279612501493,
"loss": 1.4464,
"step": 183
},
{
"epoch": 0.028705707989625382,
"grad_norm": 0.2851637303829193,
"learning_rate": 0.0001990046461966024,
"loss": 1.3527,
"step": 184
},
{
"epoch": 0.028861717272177696,
"grad_norm": 0.2576538622379303,
"learning_rate": 0.00019898633010616542,
"loss": 1.2546,
"step": 185
},
{
"epoch": 0.029017726554730006,
"grad_norm": 0.2922312319278717,
"learning_rate": 0.0001989678478844443,
"loss": 1.1445,
"step": 186
},
{
"epoch": 0.02917373583728232,
"grad_norm": 0.25312724709510803,
"learning_rate": 0.00019894919956245824,
"loss": 1.0533,
"step": 187
},
{
"epoch": 0.02932974511983463,
"grad_norm": 0.3193413019180298,
"learning_rate": 0.00019893038517150525,
"loss": 1.655,
"step": 188
},
{
"epoch": 0.029485754402386943,
"grad_norm": 0.26104092597961426,
"learning_rate": 0.00019891140474316194,
"loss": 1.5094,
"step": 189
},
{
"epoch": 0.029641763684939253,
"grad_norm": 0.2679871916770935,
"learning_rate": 0.00019889225830928365,
"loss": 1.3535,
"step": 190
},
{
"epoch": 0.029797772967491566,
"grad_norm": 0.2835332751274109,
"learning_rate": 0.00019887294590200435,
"loss": 1.647,
"step": 191
},
{
"epoch": 0.029953782250043876,
"grad_norm": 0.2309991866350174,
"learning_rate": 0.00019885346755373656,
"loss": 1.1869,
"step": 192
},
{
"epoch": 0.03010979153259619,
"grad_norm": 0.28801408410072327,
"learning_rate": 0.00019883382329717128,
"loss": 1.4037,
"step": 193
},
{
"epoch": 0.0302658008151485,
"grad_norm": 0.309851735830307,
"learning_rate": 0.00019881401316527793,
"loss": 1.2832,
"step": 194
},
{
"epoch": 0.030421810097700813,
"grad_norm": 0.27529048919677734,
"learning_rate": 0.0001987940371913044,
"loss": 1.5466,
"step": 195
},
{
"epoch": 0.030577819380253127,
"grad_norm": 0.25759854912757874,
"learning_rate": 0.00019877389540877687,
"loss": 1.2432,
"step": 196
},
{
"epoch": 0.030733828662805437,
"grad_norm": 0.27557173371315,
"learning_rate": 0.0001987535878514998,
"loss": 1.5681,
"step": 197
},
{
"epoch": 0.03088983794535775,
"grad_norm": 0.25760918855667114,
"learning_rate": 0.0001987331145535559,
"loss": 1.3067,
"step": 198
},
{
"epoch": 0.03104584722791006,
"grad_norm": 0.299180269241333,
"learning_rate": 0.000198712475549306,
"loss": 1.4642,
"step": 199
},
{
"epoch": 0.031201856510462374,
"grad_norm": 0.2398681640625,
"learning_rate": 0.00019869167087338907,
"loss": 1.0748,
"step": 200
},
{
"epoch": 0.03135786579301469,
"grad_norm": 0.2560211420059204,
"learning_rate": 0.00019867070056072214,
"loss": 1.2508,
"step": 201
},
{
"epoch": 0.031513875075566994,
"grad_norm": 0.25509408116340637,
"learning_rate": 0.00019864956464650025,
"loss": 1.4073,
"step": 202
},
{
"epoch": 0.03166988435811931,
"grad_norm": 0.27500587701797485,
"learning_rate": 0.00019862826316619628,
"loss": 1.3473,
"step": 203
},
{
"epoch": 0.03182589364067162,
"grad_norm": 0.2923906445503235,
"learning_rate": 0.0001986067961555611,
"loss": 1.4293,
"step": 204
},
{
"epoch": 0.031981902923223934,
"grad_norm": 0.24456267058849335,
"learning_rate": 0.00019858516365062334,
"loss": 1.2196,
"step": 205
},
{
"epoch": 0.03213791220577624,
"grad_norm": 0.3021962344646454,
"learning_rate": 0.00019856336568768935,
"loss": 1.5066,
"step": 206
},
{
"epoch": 0.032293921488328554,
"grad_norm": 0.2485729455947876,
"learning_rate": 0.00019854140230334322,
"loss": 1.2002,
"step": 207
},
{
"epoch": 0.03244993077088087,
"grad_norm": 0.26055216789245605,
"learning_rate": 0.0001985192735344467,
"loss": 1.3207,
"step": 208
},
{
"epoch": 0.03260594005343318,
"grad_norm": 0.2658592760562897,
"learning_rate": 0.00019849697941813898,
"loss": 0.9025,
"step": 209
},
{
"epoch": 0.03276194933598549,
"grad_norm": 0.30481112003326416,
"learning_rate": 0.00019847451999183694,
"loss": 1.5238,
"step": 210
},
{
"epoch": 0.0329179586185378,
"grad_norm": 0.28382736444473267,
"learning_rate": 0.00019845189529323475,
"loss": 1.3224,
"step": 211
},
{
"epoch": 0.033073967901090115,
"grad_norm": 0.2757686972618103,
"learning_rate": 0.00019842910536030403,
"loss": 1.3672,
"step": 212
},
{
"epoch": 0.03322997718364243,
"grad_norm": 0.2743508219718933,
"learning_rate": 0.00019840615023129372,
"loss": 1.3628,
"step": 213
},
{
"epoch": 0.03338598646619474,
"grad_norm": 0.26412197947502136,
"learning_rate": 0.00019838302994472997,
"loss": 1.141,
"step": 214
},
{
"epoch": 0.03354199574874705,
"grad_norm": 0.2859683632850647,
"learning_rate": 0.0001983597445394162,
"loss": 1.1566,
"step": 215
},
{
"epoch": 0.03369800503129936,
"grad_norm": 0.24881964921951294,
"learning_rate": 0.00019833629405443284,
"loss": 1.2038,
"step": 216
},
{
"epoch": 0.033854014313851676,
"grad_norm": 0.25597479939460754,
"learning_rate": 0.0001983126785291375,
"loss": 0.9913,
"step": 217
},
{
"epoch": 0.03401002359640399,
"grad_norm": 0.26771095395088196,
"learning_rate": 0.00019828889800316466,
"loss": 1.5417,
"step": 218
},
{
"epoch": 0.034166032878956296,
"grad_norm": 0.2678371071815491,
"learning_rate": 0.00019826495251642578,
"loss": 1.208,
"step": 219
},
{
"epoch": 0.03432204216150861,
"grad_norm": 0.2947763204574585,
"learning_rate": 0.00019824084210910925,
"loss": 1.3908,
"step": 220
},
{
"epoch": 0.03447805144406092,
"grad_norm": 0.2821643650531769,
"learning_rate": 0.00019821656682168012,
"loss": 1.6573,
"step": 221
},
{
"epoch": 0.034634060726613236,
"grad_norm": 0.24507346749305725,
"learning_rate": 0.00019819212669488026,
"loss": 1.0647,
"step": 222
},
{
"epoch": 0.03479007000916554,
"grad_norm": 0.2718466520309448,
"learning_rate": 0.00019816752176972813,
"loss": 1.3013,
"step": 223
},
{
"epoch": 0.034946079291717856,
"grad_norm": 0.2902746796607971,
"learning_rate": 0.0001981427520875188,
"loss": 1.2212,
"step": 224
},
{
"epoch": 0.03510208857427017,
"grad_norm": 0.25822389125823975,
"learning_rate": 0.0001981178176898239,
"loss": 1.4543,
"step": 225
},
{
"epoch": 0.03525809785682248,
"grad_norm": 0.3506292700767517,
"learning_rate": 0.00019809271861849145,
"loss": 1.8549,
"step": 226
},
{
"epoch": 0.03541410713937479,
"grad_norm": 0.2610777020454407,
"learning_rate": 0.00019806745491564586,
"loss": 1.3161,
"step": 227
},
{
"epoch": 0.0355701164219271,
"grad_norm": 0.29803603887557983,
"learning_rate": 0.0001980420266236878,
"loss": 1.2983,
"step": 228
},
{
"epoch": 0.03572612570447942,
"grad_norm": 0.24572676420211792,
"learning_rate": 0.0001980164337852943,
"loss": 1.291,
"step": 229
},
{
"epoch": 0.03588213498703173,
"grad_norm": 0.25573092699050903,
"learning_rate": 0.00019799067644341844,
"loss": 1.3207,
"step": 230
},
{
"epoch": 0.036038144269584044,
"grad_norm": 0.28766271471977234,
"learning_rate": 0.00019796475464128942,
"loss": 1.4527,
"step": 231
},
{
"epoch": 0.03619415355213635,
"grad_norm": 0.2636454701423645,
"learning_rate": 0.00019793866842241243,
"loss": 1.3899,
"step": 232
},
{
"epoch": 0.036350162834688664,
"grad_norm": 0.3094368577003479,
"learning_rate": 0.00019791241783056874,
"loss": 1.2935,
"step": 233
},
{
"epoch": 0.03650617211724098,
"grad_norm": 0.2588469088077545,
"learning_rate": 0.00019788600290981525,
"loss": 1.2457,
"step": 234
},
{
"epoch": 0.03666218139979329,
"grad_norm": 0.26457706093788147,
"learning_rate": 0.0001978594237044849,
"loss": 1.1753,
"step": 235
},
{
"epoch": 0.0368181906823456,
"grad_norm": 0.2559141516685486,
"learning_rate": 0.0001978326802591862,
"loss": 1.2004,
"step": 236
},
{
"epoch": 0.03697419996489791,
"grad_norm": 0.2815738320350647,
"learning_rate": 0.00019780577261880336,
"loss": 1.3706,
"step": 237
},
{
"epoch": 0.037130209247450224,
"grad_norm": 0.2584588825702667,
"learning_rate": 0.0001977787008284962,
"loss": 1.4192,
"step": 238
},
{
"epoch": 0.03728621853000254,
"grad_norm": 0.290865421295166,
"learning_rate": 0.00019775146493369994,
"loss": 1.2308,
"step": 239
},
{
"epoch": 0.037442227812554844,
"grad_norm": 0.2788088023662567,
"learning_rate": 0.0001977240649801253,
"loss": 1.2095,
"step": 240
},
{
"epoch": 0.03759823709510716,
"grad_norm": 0.28903988003730774,
"learning_rate": 0.00019769650101375837,
"loss": 1.5138,
"step": 241
},
{
"epoch": 0.03775424637765947,
"grad_norm": 0.29985305666923523,
"learning_rate": 0.00019766877308086036,
"loss": 1.4594,
"step": 242
},
{
"epoch": 0.037910255660211785,
"grad_norm": 0.3033303916454315,
"learning_rate": 0.00019764088122796783,
"loss": 1.6108,
"step": 243
},
{
"epoch": 0.03806626494276409,
"grad_norm": 0.2854767143726349,
"learning_rate": 0.0001976128255018924,
"loss": 1.377,
"step": 244
},
{
"epoch": 0.038222274225316405,
"grad_norm": 0.30725011229515076,
"learning_rate": 0.00019758460594972068,
"loss": 1.2651,
"step": 245
},
{
"epoch": 0.03837828350786872,
"grad_norm": 0.28218191862106323,
"learning_rate": 0.00019755622261881427,
"loss": 1.4354,
"step": 246
},
{
"epoch": 0.03853429279042103,
"grad_norm": 0.2794611155986786,
"learning_rate": 0.00019752767555680968,
"loss": 1.4666,
"step": 247
},
{
"epoch": 0.03869030207297334,
"grad_norm": 0.2824796736240387,
"learning_rate": 0.00019749896481161808,
"loss": 1.3645,
"step": 248
},
{
"epoch": 0.03884631135552565,
"grad_norm": 0.26165372133255005,
"learning_rate": 0.00019747009043142555,
"loss": 1.3445,
"step": 249
},
{
"epoch": 0.039002320638077966,
"grad_norm": 0.29985979199409485,
"learning_rate": 0.00019744105246469263,
"loss": 1.4558,
"step": 250
},
{
"epoch": 0.03915832992063028,
"grad_norm": 0.25439903140068054,
"learning_rate": 0.00019741185096015448,
"loss": 1.1075,
"step": 251
},
{
"epoch": 0.03931433920318259,
"grad_norm": 0.2533755898475647,
"learning_rate": 0.00019738248596682078,
"loss": 1.0891,
"step": 252
},
{
"epoch": 0.0394703484857349,
"grad_norm": 0.27487608790397644,
"learning_rate": 0.0001973529575339755,
"loss": 1.3128,
"step": 253
},
{
"epoch": 0.03962635776828721,
"grad_norm": 0.27824172377586365,
"learning_rate": 0.00019732326571117703,
"loss": 1.4045,
"step": 254
},
{
"epoch": 0.039782367050839526,
"grad_norm": 0.27959418296813965,
"learning_rate": 0.00019729341054825782,
"loss": 1.2169,
"step": 255
},
{
"epoch": 0.03993837633339184,
"grad_norm": 0.3103275001049042,
"learning_rate": 0.00019726339209532462,
"loss": 1.3043,
"step": 256
},
{
"epoch": 0.040094385615944146,
"grad_norm": 0.2712806463241577,
"learning_rate": 0.00019723321040275815,
"loss": 1.1747,
"step": 257
},
{
"epoch": 0.04025039489849646,
"grad_norm": 0.2961602210998535,
"learning_rate": 0.0001972028655212131,
"loss": 1.5744,
"step": 258
},
{
"epoch": 0.04040640418104877,
"grad_norm": 0.2686194181442261,
"learning_rate": 0.00019717235750161806,
"loss": 1.2442,
"step": 259
},
{
"epoch": 0.04056241346360109,
"grad_norm": 0.2742723822593689,
"learning_rate": 0.00019714168639517544,
"loss": 1.3225,
"step": 260
},
{
"epoch": 0.04071842274615339,
"grad_norm": 0.28742754459381104,
"learning_rate": 0.00019711085225336132,
"loss": 1.3711,
"step": 261
},
{
"epoch": 0.04087443202870571,
"grad_norm": 0.30374589562416077,
"learning_rate": 0.00019707985512792543,
"loss": 1.215,
"step": 262
},
{
"epoch": 0.04103044131125802,
"grad_norm": 0.2738686800003052,
"learning_rate": 0.00019704869507089105,
"loss": 1.4628,
"step": 263
},
{
"epoch": 0.041186450593810334,
"grad_norm": 0.2695278823375702,
"learning_rate": 0.0001970173721345549,
"loss": 1.4632,
"step": 264
},
{
"epoch": 0.04134245987636264,
"grad_norm": 0.2954547107219696,
"learning_rate": 0.00019698588637148703,
"loss": 1.2785,
"step": 265
},
{
"epoch": 0.041498469158914954,
"grad_norm": 0.2756305932998657,
"learning_rate": 0.00019695423783453088,
"loss": 1.4258,
"step": 266
},
{
"epoch": 0.04165447844146727,
"grad_norm": 0.2642769515514374,
"learning_rate": 0.00019692242657680286,
"loss": 1.3034,
"step": 267
},
{
"epoch": 0.04181048772401958,
"grad_norm": 0.2760365307331085,
"learning_rate": 0.00019689045265169273,
"loss": 1.5845,
"step": 268
},
{
"epoch": 0.041966497006571894,
"grad_norm": 0.23845522105693817,
"learning_rate": 0.0001968583161128631,
"loss": 1.113,
"step": 269
},
{
"epoch": 0.0421225062891242,
"grad_norm": 0.2855961322784424,
"learning_rate": 0.0001968260170142496,
"loss": 1.4019,
"step": 270
},
{
"epoch": 0.042278515571676514,
"grad_norm": 0.26462671160697937,
"learning_rate": 0.00019679355541006054,
"loss": 1.2425,
"step": 271
},
{
"epoch": 0.04243452485422883,
"grad_norm": 0.28468820452690125,
"learning_rate": 0.00019676093135477713,
"loss": 1.6525,
"step": 272
},
{
"epoch": 0.04259053413678114,
"grad_norm": 0.3233076333999634,
"learning_rate": 0.0001967281449031531,
"loss": 1.2168,
"step": 273
},
{
"epoch": 0.04274654341933345,
"grad_norm": 0.2688952684402466,
"learning_rate": 0.00019669519611021486,
"loss": 1.3948,
"step": 274
},
{
"epoch": 0.04290255270188576,
"grad_norm": 0.25911059975624084,
"learning_rate": 0.00019666208503126112,
"loss": 1.2875,
"step": 275
},
{
"epoch": 0.043058561984438075,
"grad_norm": 0.2789272964000702,
"learning_rate": 0.00019662881172186313,
"loss": 1.257,
"step": 276
},
{
"epoch": 0.04321457126699039,
"grad_norm": 0.26854726672172546,
"learning_rate": 0.00019659537623786428,
"loss": 1.4554,
"step": 277
},
{
"epoch": 0.043370580549542695,
"grad_norm": 0.31813284754753113,
"learning_rate": 0.00019656177863538026,
"loss": 1.667,
"step": 278
},
{
"epoch": 0.04352658983209501,
"grad_norm": 0.2801772356033325,
"learning_rate": 0.00019652801897079869,
"loss": 1.4555,
"step": 279
},
{
"epoch": 0.04368259911464732,
"grad_norm": 0.30256757140159607,
"learning_rate": 0.00019649409730077935,
"loss": 1.2628,
"step": 280
},
{
"epoch": 0.043838608397199635,
"grad_norm": 0.2807087302207947,
"learning_rate": 0.00019646001368225382,
"loss": 1.5143,
"step": 281
},
{
"epoch": 0.04399461767975194,
"grad_norm": 0.27217531204223633,
"learning_rate": 0.0001964257681724255,
"loss": 1.5372,
"step": 282
},
{
"epoch": 0.044150626962304255,
"grad_norm": 0.2996511459350586,
"learning_rate": 0.00019639136082876953,
"loss": 1.2692,
"step": 283
},
{
"epoch": 0.04430663624485657,
"grad_norm": 0.263231098651886,
"learning_rate": 0.00019635679170903258,
"loss": 1.2328,
"step": 284
},
{
"epoch": 0.04446264552740888,
"grad_norm": 0.3060413897037506,
"learning_rate": 0.00019632206087123296,
"loss": 1.5173,
"step": 285
},
{
"epoch": 0.044618654809961196,
"grad_norm": 0.25136467814445496,
"learning_rate": 0.00019628716837366027,
"loss": 1.1781,
"step": 286
},
{
"epoch": 0.0447746640925135,
"grad_norm": 0.27105534076690674,
"learning_rate": 0.00019625211427487548,
"loss": 1.4542,
"step": 287
},
{
"epoch": 0.044930673375065816,
"grad_norm": 0.27552956342697144,
"learning_rate": 0.00019621689863371083,
"loss": 1.3352,
"step": 288
},
{
"epoch": 0.04508668265761813,
"grad_norm": 0.26462072134017944,
"learning_rate": 0.00019618152150926955,
"loss": 1.2531,
"step": 289
},
{
"epoch": 0.04524269194017044,
"grad_norm": 0.2736480236053467,
"learning_rate": 0.000196145982960926,
"loss": 1.402,
"step": 290
},
{
"epoch": 0.04539870122272275,
"grad_norm": 0.2739974856376648,
"learning_rate": 0.00019611028304832546,
"loss": 1.4881,
"step": 291
},
{
"epoch": 0.04555471050527506,
"grad_norm": 0.25353673100471497,
"learning_rate": 0.000196074421831384,
"loss": 1.3935,
"step": 292
},
{
"epoch": 0.04571071978782738,
"grad_norm": 0.2595098614692688,
"learning_rate": 0.00019603839937028838,
"loss": 1.3306,
"step": 293
},
{
"epoch": 0.04586672907037969,
"grad_norm": 0.27779051661491394,
"learning_rate": 0.00019600221572549606,
"loss": 1.5111,
"step": 294
},
{
"epoch": 0.046022738352932,
"grad_norm": 0.26458942890167236,
"learning_rate": 0.00019596587095773495,
"loss": 1.1354,
"step": 295
},
{
"epoch": 0.04617874763548431,
"grad_norm": 0.3711000084877014,
"learning_rate": 0.00019592936512800342,
"loss": 1.387,
"step": 296
},
{
"epoch": 0.046334756918036624,
"grad_norm": 0.26172423362731934,
"learning_rate": 0.00019589269829757008,
"loss": 1.1995,
"step": 297
},
{
"epoch": 0.04649076620058894,
"grad_norm": 0.30684447288513184,
"learning_rate": 0.00019585587052797389,
"loss": 1.2853,
"step": 298
},
{
"epoch": 0.046646775483141244,
"grad_norm": 0.27383920550346375,
"learning_rate": 0.00019581888188102375,
"loss": 1.1397,
"step": 299
},
{
"epoch": 0.04680278476569356,
"grad_norm": 0.28926682472229004,
"learning_rate": 0.00019578173241879872,
"loss": 1.2977,
"step": 300
},
{
"epoch": 0.04695879404824587,
"grad_norm": 0.2573678195476532,
"learning_rate": 0.00019574442220364767,
"loss": 1.315,
"step": 301
},
{
"epoch": 0.047114803330798184,
"grad_norm": 0.286785751581192,
"learning_rate": 0.00019570695129818926,
"loss": 1.196,
"step": 302
},
{
"epoch": 0.0472708126133505,
"grad_norm": 0.26392433047294617,
"learning_rate": 0.0001956693197653119,
"loss": 1.067,
"step": 303
},
{
"epoch": 0.047426821895902804,
"grad_norm": 0.29351645708084106,
"learning_rate": 0.00019563152766817354,
"loss": 1.2977,
"step": 304
},
{
"epoch": 0.04758283117845512,
"grad_norm": 0.3556276857852936,
"learning_rate": 0.00019559357507020162,
"loss": 1.1268,
"step": 305
},
{
"epoch": 0.04773884046100743,
"grad_norm": 0.3044413924217224,
"learning_rate": 0.00019555546203509297,
"loss": 1.3528,
"step": 306
},
{
"epoch": 0.047894849743559745,
"grad_norm": 0.25455671548843384,
"learning_rate": 0.00019551718862681364,
"loss": 1.2099,
"step": 307
},
{
"epoch": 0.04805085902611205,
"grad_norm": 0.2863021492958069,
"learning_rate": 0.00019547875490959885,
"loss": 1.514,
"step": 308
},
{
"epoch": 0.048206868308664365,
"grad_norm": 0.2713131010532379,
"learning_rate": 0.00019544016094795295,
"loss": 1.2479,
"step": 309
},
{
"epoch": 0.04836287759121668,
"grad_norm": 0.28673309087753296,
"learning_rate": 0.00019540140680664913,
"loss": 1.4822,
"step": 310
},
{
"epoch": 0.04851888687376899,
"grad_norm": 0.28506314754486084,
"learning_rate": 0.00019536249255072948,
"loss": 1.1714,
"step": 311
},
{
"epoch": 0.0486748961563213,
"grad_norm": 0.2814370393753052,
"learning_rate": 0.00019532341824550479,
"loss": 1.3045,
"step": 312
},
{
"epoch": 0.04883090543887361,
"grad_norm": 0.2505611181259155,
"learning_rate": 0.0001952841839565544,
"loss": 1.1565,
"step": 313
},
{
"epoch": 0.048986914721425925,
"grad_norm": 0.27159830927848816,
"learning_rate": 0.0001952447897497263,
"loss": 1.0939,
"step": 314
},
{
"epoch": 0.04914292400397824,
"grad_norm": 0.27552008628845215,
"learning_rate": 0.00019520523569113677,
"loss": 1.4382,
"step": 315
},
{
"epoch": 0.049298933286530545,
"grad_norm": 0.2567708492279053,
"learning_rate": 0.00019516552184717037,
"loss": 1.2241,
"step": 316
},
{
"epoch": 0.04945494256908286,
"grad_norm": 0.27663713693618774,
"learning_rate": 0.00019512564828447988,
"loss": 1.2449,
"step": 317
},
{
"epoch": 0.04961095185163517,
"grad_norm": 0.2683660089969635,
"learning_rate": 0.0001950856150699861,
"loss": 1.1652,
"step": 318
},
{
"epoch": 0.049766961134187486,
"grad_norm": 0.25226572155952454,
"learning_rate": 0.0001950454222708778,
"loss": 1.1307,
"step": 319
},
{
"epoch": 0.0499229704167398,
"grad_norm": 0.23380513489246368,
"learning_rate": 0.0001950050699546116,
"loss": 1.1257,
"step": 320
},
{
"epoch": 0.050078979699292106,
"grad_norm": 0.2385280281305313,
"learning_rate": 0.0001949645581889118,
"loss": 0.9917,
"step": 321
},
{
"epoch": 0.05023498898184442,
"grad_norm": 0.23746567964553833,
"learning_rate": 0.00019492388704177036,
"loss": 1.1364,
"step": 322
},
{
"epoch": 0.05039099826439673,
"grad_norm": 0.27820831537246704,
"learning_rate": 0.00019488305658144667,
"loss": 1.3707,
"step": 323
},
{
"epoch": 0.050547007546949047,
"grad_norm": 0.2663419544696808,
"learning_rate": 0.00019484206687646753,
"loss": 1.3662,
"step": 324
},
{
"epoch": 0.05070301682950135,
"grad_norm": 0.27196773886680603,
"learning_rate": 0.00019480091799562704,
"loss": 1.2766,
"step": 325
},
{
"epoch": 0.05085902611205367,
"grad_norm": 0.296779602766037,
"learning_rate": 0.00019475961000798645,
"loss": 1.5789,
"step": 326
},
{
"epoch": 0.05101503539460598,
"grad_norm": 0.3267677128314972,
"learning_rate": 0.0001947181429828739,
"loss": 1.2782,
"step": 327
},
{
"epoch": 0.051171044677158294,
"grad_norm": 0.2852894067764282,
"learning_rate": 0.00019467651698988462,
"loss": 1.1466,
"step": 328
},
{
"epoch": 0.0513270539597106,
"grad_norm": 0.2959722876548767,
"learning_rate": 0.0001946347320988806,
"loss": 1.1929,
"step": 329
},
{
"epoch": 0.051483063242262914,
"grad_norm": 0.25998443365097046,
"learning_rate": 0.00019459278837999046,
"loss": 1.4104,
"step": 330
},
{
"epoch": 0.05163907252481523,
"grad_norm": 0.27319809794425964,
"learning_rate": 0.00019455068590360942,
"loss": 1.417,
"step": 331
},
{
"epoch": 0.05179508180736754,
"grad_norm": 0.22395959496498108,
"learning_rate": 0.00019450842474039913,
"loss": 1.2159,
"step": 332
},
{
"epoch": 0.05195109108991985,
"grad_norm": 0.24947980046272278,
"learning_rate": 0.00019446600496128758,
"loss": 1.1063,
"step": 333
},
{
"epoch": 0.05210710037247216,
"grad_norm": 0.235429584980011,
"learning_rate": 0.00019442342663746902,
"loss": 1.2234,
"step": 334
},
{
"epoch": 0.052263109655024474,
"grad_norm": 0.27443963289260864,
"learning_rate": 0.00019438068984040365,
"loss": 1.2038,
"step": 335
},
{
"epoch": 0.05241911893757679,
"grad_norm": 0.26688772439956665,
"learning_rate": 0.00019433779464181778,
"loss": 1.2956,
"step": 336
},
{
"epoch": 0.052575128220129094,
"grad_norm": 0.23804551362991333,
"learning_rate": 0.00019429474111370352,
"loss": 0.9525,
"step": 337
},
{
"epoch": 0.05273113750268141,
"grad_norm": 0.262890487909317,
"learning_rate": 0.0001942515293283187,
"loss": 1.2713,
"step": 338
},
{
"epoch": 0.05288714678523372,
"grad_norm": 0.29796820878982544,
"learning_rate": 0.00019420815935818672,
"loss": 1.5058,
"step": 339
},
{
"epoch": 0.053043156067786035,
"grad_norm": 0.275143563747406,
"learning_rate": 0.00019416463127609656,
"loss": 1.2604,
"step": 340
},
{
"epoch": 0.05319916535033835,
"grad_norm": 0.27801284193992615,
"learning_rate": 0.00019412094515510248,
"loss": 1.2588,
"step": 341
},
{
"epoch": 0.053355174632890655,
"grad_norm": 0.2604374885559082,
"learning_rate": 0.00019407710106852404,
"loss": 1.1432,
"step": 342
},
{
"epoch": 0.05351118391544297,
"grad_norm": 0.2863079011440277,
"learning_rate": 0.00019403309908994586,
"loss": 1.4854,
"step": 343
},
{
"epoch": 0.05366719319799528,
"grad_norm": 0.2515758275985718,
"learning_rate": 0.00019398893929321761,
"loss": 1.1682,
"step": 344
},
{
"epoch": 0.053823202480547595,
"grad_norm": 0.27037686109542847,
"learning_rate": 0.00019394462175245381,
"loss": 1.3679,
"step": 345
},
{
"epoch": 0.0539792117630999,
"grad_norm": 0.2368054836988449,
"learning_rate": 0.00019390014654203369,
"loss": 1.1406,
"step": 346
},
{
"epoch": 0.054135221045652215,
"grad_norm": 0.27759966254234314,
"learning_rate": 0.0001938555137366011,
"loss": 1.1669,
"step": 347
},
{
"epoch": 0.05429123032820453,
"grad_norm": 0.3004835546016693,
"learning_rate": 0.00019381072341106452,
"loss": 1.4811,
"step": 348
},
{
"epoch": 0.05444723961075684,
"grad_norm": 0.30656251311302185,
"learning_rate": 0.0001937657756405966,
"loss": 1.515,
"step": 349
},
{
"epoch": 0.05460324889330915,
"grad_norm": 0.31442925333976746,
"learning_rate": 0.00019372067050063438,
"loss": 1.4848,
"step": 350
},
{
"epoch": 0.05475925817586146,
"grad_norm": 0.2230207473039627,
"learning_rate": 0.00019367540806687893,
"loss": 0.9535,
"step": 351
},
{
"epoch": 0.054915267458413776,
"grad_norm": 0.2552795708179474,
"learning_rate": 0.0001936299884152954,
"loss": 1.2254,
"step": 352
},
{
"epoch": 0.05507127674096609,
"grad_norm": 0.29775241017341614,
"learning_rate": 0.0001935844116221127,
"loss": 1.3821,
"step": 353
},
{
"epoch": 0.055227286023518396,
"grad_norm": 0.24480530619621277,
"learning_rate": 0.00019353867776382354,
"loss": 1.1073,
"step": 354
},
{
"epoch": 0.05538329530607071,
"grad_norm": 0.2612270414829254,
"learning_rate": 0.00019349278691718427,
"loss": 1.3114,
"step": 355
},
{
"epoch": 0.05553930458862302,
"grad_norm": 0.307085245847702,
"learning_rate": 0.0001934467391592146,
"loss": 1.3602,
"step": 356
},
{
"epoch": 0.055695313871175336,
"grad_norm": 0.2688599228858948,
"learning_rate": 0.00019340053456719768,
"loss": 1.4347,
"step": 357
},
{
"epoch": 0.05585132315372765,
"grad_norm": 0.25372791290283203,
"learning_rate": 0.00019335417321867987,
"loss": 1.3468,
"step": 358
},
{
"epoch": 0.05600733243627996,
"grad_norm": 0.2706502377986908,
"learning_rate": 0.0001933076551914706,
"loss": 1.4489,
"step": 359
},
{
"epoch": 0.05616334171883227,
"grad_norm": 0.22997525334358215,
"learning_rate": 0.00019326098056364222,
"loss": 1.1305,
"step": 360
},
{
"epoch": 0.056319351001384584,
"grad_norm": 0.30573347210884094,
"learning_rate": 0.00019321414941353003,
"loss": 1.4231,
"step": 361
},
{
"epoch": 0.0564753602839369,
"grad_norm": 0.30873847007751465,
"learning_rate": 0.00019316716181973188,
"loss": 1.3478,
"step": 362
},
{
"epoch": 0.056631369566489204,
"grad_norm": 0.2514902651309967,
"learning_rate": 0.00019312001786110828,
"loss": 1.2094,
"step": 363
},
{
"epoch": 0.05678737884904152,
"grad_norm": 0.26067742705345154,
"learning_rate": 0.00019307271761678213,
"loss": 1.5841,
"step": 364
},
{
"epoch": 0.05694338813159383,
"grad_norm": 0.23508694767951965,
"learning_rate": 0.00019302526116613864,
"loss": 1.103,
"step": 365
},
{
"epoch": 0.057099397414146144,
"grad_norm": 0.24878567457199097,
"learning_rate": 0.00019297764858882514,
"loss": 1.0968,
"step": 366
},
{
"epoch": 0.05725540669669845,
"grad_norm": 0.23707476258277893,
"learning_rate": 0.00019292987996475113,
"loss": 1.0831,
"step": 367
},
{
"epoch": 0.057411415979250764,
"grad_norm": 0.2691617012023926,
"learning_rate": 0.0001928819553740878,
"loss": 1.2254,
"step": 368
},
{
"epoch": 0.05756742526180308,
"grad_norm": 0.26831138134002686,
"learning_rate": 0.00019283387489726827,
"loss": 1.3084,
"step": 369
},
{
"epoch": 0.05772343454435539,
"grad_norm": 0.281770259141922,
"learning_rate": 0.00019278563861498723,
"loss": 1.3377,
"step": 370
},
{
"epoch": 0.0578794438269077,
"grad_norm": 0.2634589970111847,
"learning_rate": 0.00019273724660820088,
"loss": 1.2648,
"step": 371
},
{
"epoch": 0.05803545310946001,
"grad_norm": 0.27592259645462036,
"learning_rate": 0.00019268869895812672,
"loss": 1.2751,
"step": 372
},
{
"epoch": 0.058191462392012325,
"grad_norm": 0.23107245564460754,
"learning_rate": 0.00019263999574624355,
"loss": 1.2651,
"step": 373
},
{
"epoch": 0.05834747167456464,
"grad_norm": 0.2582552134990692,
"learning_rate": 0.0001925911370542912,
"loss": 1.4914,
"step": 374
},
{
"epoch": 0.05850348095711695,
"grad_norm": 0.27152058482170105,
"learning_rate": 0.00019254212296427044,
"loss": 1.2227,
"step": 375
},
{
"epoch": 0.05865949023966926,
"grad_norm": 0.23554329574108124,
"learning_rate": 0.00019249295355844285,
"loss": 1.4113,
"step": 376
},
{
"epoch": 0.05881549952222157,
"grad_norm": 0.2793971300125122,
"learning_rate": 0.00019244362891933077,
"loss": 1.3325,
"step": 377
},
{
"epoch": 0.058971508804773885,
"grad_norm": 0.2800885736942291,
"learning_rate": 0.00019239414912971696,
"loss": 1.358,
"step": 378
},
{
"epoch": 0.0591275180873262,
"grad_norm": 0.27139201760292053,
"learning_rate": 0.0001923445142726446,
"loss": 1.2269,
"step": 379
},
{
"epoch": 0.059283527369878505,
"grad_norm": 0.276579886674881,
"learning_rate": 0.0001922947244314172,
"loss": 1.1521,
"step": 380
},
{
"epoch": 0.05943953665243082,
"grad_norm": 0.28917452692985535,
"learning_rate": 0.0001922447796895982,
"loss": 1.2803,
"step": 381
},
{
"epoch": 0.05959554593498313,
"grad_norm": 0.28668197989463806,
"learning_rate": 0.00019219468013101124,
"loss": 1.4025,
"step": 382
},
{
"epoch": 0.059751555217535446,
"grad_norm": 0.2973851263523102,
"learning_rate": 0.00019214442583973966,
"loss": 1.5472,
"step": 383
},
{
"epoch": 0.05990756450008775,
"grad_norm": 0.25934460759162903,
"learning_rate": 0.00019209401690012653,
"loss": 1.2496,
"step": 384
},
{
"epoch": 0.060063573782640066,
"grad_norm": 0.22885724902153015,
"learning_rate": 0.00019204345339677442,
"loss": 1.2088,
"step": 385
},
{
"epoch": 0.06021958306519238,
"grad_norm": 0.28346025943756104,
"learning_rate": 0.00019199273541454538,
"loss": 1.1561,
"step": 386
},
{
"epoch": 0.06037559234774469,
"grad_norm": 0.2574789822101593,
"learning_rate": 0.00019194186303856067,
"loss": 1.3209,
"step": 387
},
{
"epoch": 0.060531601630297,
"grad_norm": 0.26535728573799133,
"learning_rate": 0.00019189083635420075,
"loss": 1.3022,
"step": 388
},
{
"epoch": 0.06068761091284931,
"grad_norm": 0.2844642698764801,
"learning_rate": 0.00019183965544710495,
"loss": 1.3881,
"step": 389
},
{
"epoch": 0.060843620195401626,
"grad_norm": 0.24562187492847443,
"learning_rate": 0.00019178832040317155,
"loss": 1.159,
"step": 390
},
{
"epoch": 0.06099962947795394,
"grad_norm": 0.25778669118881226,
"learning_rate": 0.0001917368313085574,
"loss": 1.5154,
"step": 391
},
{
"epoch": 0.061155638760506253,
"grad_norm": 0.22877171635627747,
"learning_rate": 0.00019168518824967795,
"loss": 1.201,
"step": 392
},
{
"epoch": 0.06131164804305856,
"grad_norm": 0.2764502465724945,
"learning_rate": 0.00019163339131320718,
"loss": 1.4165,
"step": 393
},
{
"epoch": 0.061467657325610874,
"grad_norm": 0.23493847250938416,
"learning_rate": 0.00019158144058607708,
"loss": 1.1334,
"step": 394
},
{
"epoch": 0.06162366660816319,
"grad_norm": 0.2605098783969879,
"learning_rate": 0.00019152933615547798,
"loss": 1.1613,
"step": 395
},
{
"epoch": 0.0617796758907155,
"grad_norm": 0.23720701038837433,
"learning_rate": 0.000191477078108858,
"loss": 1.1966,
"step": 396
},
{
"epoch": 0.06193568517326781,
"grad_norm": 0.27043676376342773,
"learning_rate": 0.00019142466653392318,
"loss": 1.2793,
"step": 397
},
{
"epoch": 0.06209169445582012,
"grad_norm": 0.27630025148391724,
"learning_rate": 0.0001913721015186372,
"loss": 1.3858,
"step": 398
},
{
"epoch": 0.062247703738372434,
"grad_norm": 0.29454129934310913,
"learning_rate": 0.0001913193831512213,
"loss": 1.5234,
"step": 399
},
{
"epoch": 0.06240371302092475,
"grad_norm": 0.26943233609199524,
"learning_rate": 0.00019126651152015403,
"loss": 1.3181,
"step": 400
},
{
"epoch": 0.06255972230347706,
"grad_norm": 0.28831520676612854,
"learning_rate": 0.0001912134867141712,
"loss": 1.46,
"step": 401
},
{
"epoch": 0.06271573158602937,
"grad_norm": 0.26342567801475525,
"learning_rate": 0.0001911603088222657,
"loss": 1.4073,
"step": 402
},
{
"epoch": 0.06287174086858167,
"grad_norm": 0.2623300552368164,
"learning_rate": 0.0001911069779336873,
"loss": 1.3473,
"step": 403
},
{
"epoch": 0.06302775015113399,
"grad_norm": 0.25125861167907715,
"learning_rate": 0.00019105349413794272,
"loss": 1.0346,
"step": 404
},
{
"epoch": 0.0631837594336863,
"grad_norm": 0.30890092253685,
"learning_rate": 0.00019099985752479506,
"loss": 1.5751,
"step": 405
},
{
"epoch": 0.06333976871623861,
"grad_norm": 0.31404733657836914,
"learning_rate": 0.00019094606818426403,
"loss": 1.5458,
"step": 406
},
{
"epoch": 0.06349577799879093,
"grad_norm": 0.2684463858604431,
"learning_rate": 0.00019089212620662568,
"loss": 1.2342,
"step": 407
},
{
"epoch": 0.06365178728134324,
"grad_norm": 0.2748461365699768,
"learning_rate": 0.00019083803168241223,
"loss": 1.3353,
"step": 408
},
{
"epoch": 0.06380779656389556,
"grad_norm": 0.3061840832233429,
"learning_rate": 0.00019078378470241183,
"loss": 1.3197,
"step": 409
},
{
"epoch": 0.06396380584644787,
"grad_norm": 0.25601011514663696,
"learning_rate": 0.00019072938535766865,
"loss": 1.3904,
"step": 410
},
{
"epoch": 0.06411981512900018,
"grad_norm": 0.2844060957431793,
"learning_rate": 0.00019067483373948243,
"loss": 1.42,
"step": 411
},
{
"epoch": 0.06427582441155248,
"grad_norm": 0.2969295382499695,
"learning_rate": 0.00019062012993940859,
"loss": 1.4255,
"step": 412
},
{
"epoch": 0.0644318336941048,
"grad_norm": 0.2655050456523895,
"learning_rate": 0.00019056527404925789,
"loss": 1.1618,
"step": 413
},
{
"epoch": 0.06458784297665711,
"grad_norm": 0.2571544349193573,
"learning_rate": 0.00019051026616109638,
"loss": 1.2064,
"step": 414
},
{
"epoch": 0.06474385225920942,
"grad_norm": 0.29847028851509094,
"learning_rate": 0.0001904551063672452,
"loss": 1.2847,
"step": 415
},
{
"epoch": 0.06489986154176174,
"grad_norm": 0.24265627562999725,
"learning_rate": 0.00019039979476028043,
"loss": 1.2745,
"step": 416
},
{
"epoch": 0.06505587082431405,
"grad_norm": 0.24038730561733246,
"learning_rate": 0.000190344331433033,
"loss": 1.2761,
"step": 417
},
{
"epoch": 0.06521188010686636,
"grad_norm": 0.26194193959236145,
"learning_rate": 0.00019028871647858834,
"loss": 1.5021,
"step": 418
},
{
"epoch": 0.06536788938941868,
"grad_norm": 0.2636980712413788,
"learning_rate": 0.00019023294999028653,
"loss": 1.5029,
"step": 419
},
{
"epoch": 0.06552389867197098,
"grad_norm": 0.26995277404785156,
"learning_rate": 0.00019017703206172185,
"loss": 1.3068,
"step": 420
},
{
"epoch": 0.06567990795452329,
"grad_norm": 0.26835623383522034,
"learning_rate": 0.0001901209627867428,
"loss": 1.2868,
"step": 421
},
{
"epoch": 0.0658359172370756,
"grad_norm": 0.24785400927066803,
"learning_rate": 0.0001900647422594519,
"loss": 1.1875,
"step": 422
},
{
"epoch": 0.06599192651962792,
"grad_norm": 0.3184250593185425,
"learning_rate": 0.0001900083705742054,
"loss": 1.3802,
"step": 423
},
{
"epoch": 0.06614793580218023,
"grad_norm": 0.2850029766559601,
"learning_rate": 0.00018995184782561345,
"loss": 1.3043,
"step": 424
},
{
"epoch": 0.06630394508473254,
"grad_norm": 0.2940841317176819,
"learning_rate": 0.00018989517410853955,
"loss": 1.287,
"step": 425
},
{
"epoch": 0.06645995436728486,
"grad_norm": 0.2668844163417816,
"learning_rate": 0.0001898383495181007,
"loss": 1.3723,
"step": 426
},
{
"epoch": 0.06661596364983717,
"grad_norm": 0.2814147472381592,
"learning_rate": 0.00018978137414966698,
"loss": 1.2339,
"step": 427
},
{
"epoch": 0.06677197293238948,
"grad_norm": 0.3722403049468994,
"learning_rate": 0.0001897242480988617,
"loss": 1.2755,
"step": 428
},
{
"epoch": 0.06692798221494178,
"grad_norm": 0.2689428925514221,
"learning_rate": 0.00018966697146156092,
"loss": 1.4238,
"step": 429
},
{
"epoch": 0.0670839914974941,
"grad_norm": 0.29616808891296387,
"learning_rate": 0.00018960954433389345,
"loss": 1.3167,
"step": 430
},
{
"epoch": 0.06724000078004641,
"grad_norm": 0.2477925419807434,
"learning_rate": 0.0001895519668122408,
"loss": 1.1773,
"step": 431
},
{
"epoch": 0.06739601006259872,
"grad_norm": 0.23961544036865234,
"learning_rate": 0.0001894942389932367,
"loss": 1.1387,
"step": 432
},
{
"epoch": 0.06755201934515104,
"grad_norm": 0.26128751039505005,
"learning_rate": 0.00018943636097376726,
"loss": 1.0468,
"step": 433
},
{
"epoch": 0.06770802862770335,
"grad_norm": 0.33279022574424744,
"learning_rate": 0.00018937833285097066,
"loss": 1.8791,
"step": 434
},
{
"epoch": 0.06786403791025566,
"grad_norm": 0.2876769006252289,
"learning_rate": 0.00018932015472223693,
"loss": 1.3633,
"step": 435
},
{
"epoch": 0.06802004719280798,
"grad_norm": 0.24108922481536865,
"learning_rate": 0.00018926182668520792,
"loss": 1.2012,
"step": 436
},
{
"epoch": 0.06817605647536028,
"grad_norm": 0.29062169790267944,
"learning_rate": 0.0001892033488377771,
"loss": 1.3658,
"step": 437
},
{
"epoch": 0.06833206575791259,
"grad_norm": 0.26536259055137634,
"learning_rate": 0.0001891447212780893,
"loss": 1.2464,
"step": 438
},
{
"epoch": 0.0684880750404649,
"grad_norm": 0.2940811514854431,
"learning_rate": 0.0001890859441045407,
"loss": 1.4609,
"step": 439
},
{
"epoch": 0.06864408432301722,
"grad_norm": 0.27625903487205505,
"learning_rate": 0.0001890270174157784,
"loss": 1.4098,
"step": 440
},
{
"epoch": 0.06880009360556953,
"grad_norm": 0.2586573362350464,
"learning_rate": 0.00018896794131070073,
"loss": 1.3857,
"step": 441
},
{
"epoch": 0.06895610288812185,
"grad_norm": 0.28287774324417114,
"learning_rate": 0.0001889087158884565,
"loss": 1.2967,
"step": 442
},
{
"epoch": 0.06911211217067416,
"grad_norm": 0.2692122459411621,
"learning_rate": 0.00018884934124844532,
"loss": 1.5216,
"step": 443
},
{
"epoch": 0.06926812145322647,
"grad_norm": 0.3004090189933777,
"learning_rate": 0.00018878981749031716,
"loss": 1.1913,
"step": 444
},
{
"epoch": 0.06942413073577879,
"grad_norm": 0.253542423248291,
"learning_rate": 0.00018873014471397224,
"loss": 1.1299,
"step": 445
},
{
"epoch": 0.06958014001833109,
"grad_norm": 0.3034575283527374,
"learning_rate": 0.00018867032301956088,
"loss": 1.3577,
"step": 446
},
{
"epoch": 0.0697361493008834,
"grad_norm": 0.31302767992019653,
"learning_rate": 0.00018861035250748343,
"loss": 1.6029,
"step": 447
},
{
"epoch": 0.06989215858343571,
"grad_norm": 0.26993393898010254,
"learning_rate": 0.00018855023327838983,
"loss": 1.2035,
"step": 448
},
{
"epoch": 0.07004816786598803,
"grad_norm": 0.27148422598838806,
"learning_rate": 0.00018848996543317982,
"loss": 1.5843,
"step": 449
},
{
"epoch": 0.07020417714854034,
"grad_norm": 0.2631765305995941,
"learning_rate": 0.00018842954907300236,
"loss": 1.2641,
"step": 450
},
{
"epoch": 0.07036018643109265,
"grad_norm": 0.2621013820171356,
"learning_rate": 0.00018836898429925585,
"loss": 1.2167,
"step": 451
},
{
"epoch": 0.07051619571364497,
"grad_norm": 0.25064215064048767,
"learning_rate": 0.0001883082712135877,
"loss": 1.2631,
"step": 452
},
{
"epoch": 0.07067220499619728,
"grad_norm": 0.2558056712150574,
"learning_rate": 0.00018824740991789415,
"loss": 0.9964,
"step": 453
},
{
"epoch": 0.07082821427874958,
"grad_norm": 0.2675093412399292,
"learning_rate": 0.00018818640051432035,
"loss": 1.4953,
"step": 454
},
{
"epoch": 0.07098422356130189,
"grad_norm": 0.2550821006298065,
"learning_rate": 0.0001881252431052599,
"loss": 1.1283,
"step": 455
},
{
"epoch": 0.0711402328438542,
"grad_norm": 0.24893717467784882,
"learning_rate": 0.00018806393779335483,
"loss": 1.1725,
"step": 456
},
{
"epoch": 0.07129624212640652,
"grad_norm": 0.24471914768218994,
"learning_rate": 0.00018800248468149543,
"loss": 1.19,
"step": 457
},
{
"epoch": 0.07145225140895883,
"grad_norm": 0.27745166420936584,
"learning_rate": 0.00018794088387282,
"loss": 1.6347,
"step": 458
},
{
"epoch": 0.07160826069151115,
"grad_norm": 0.2930917739868164,
"learning_rate": 0.00018787913547071484,
"loss": 1.5139,
"step": 459
},
{
"epoch": 0.07176426997406346,
"grad_norm": 0.2656380534172058,
"learning_rate": 0.00018781723957881372,
"loss": 1.1726,
"step": 460
},
{
"epoch": 0.07192027925661577,
"grad_norm": 0.27983731031417847,
"learning_rate": 0.0001877551963009982,
"loss": 1.3818,
"step": 461
},
{
"epoch": 0.07207628853916809,
"grad_norm": 0.2744976580142975,
"learning_rate": 0.0001876930057413971,
"loss": 1.2756,
"step": 462
},
{
"epoch": 0.07223229782172039,
"grad_norm": 0.2684760093688965,
"learning_rate": 0.00018763066800438636,
"loss": 1.2302,
"step": 463
},
{
"epoch": 0.0723883071042727,
"grad_norm": 0.25079357624053955,
"learning_rate": 0.00018756818319458907,
"loss": 1.1575,
"step": 464
},
{
"epoch": 0.07254431638682501,
"grad_norm": 0.2802796959877014,
"learning_rate": 0.000187505551416875,
"loss": 1.3711,
"step": 465
},
{
"epoch": 0.07270032566937733,
"grad_norm": 0.7640414237976074,
"learning_rate": 0.0001874427727763607,
"loss": 1.3431,
"step": 466
},
{
"epoch": 0.07285633495192964,
"grad_norm": 0.265717089176178,
"learning_rate": 0.0001873798473784092,
"loss": 1.1778,
"step": 467
},
{
"epoch": 0.07301234423448195,
"grad_norm": 0.23273074626922607,
"learning_rate": 0.00018731677532862976,
"loss": 1.02,
"step": 468
},
{
"epoch": 0.07316835351703427,
"grad_norm": 0.248812735080719,
"learning_rate": 0.00018725355673287778,
"loss": 1.1423,
"step": 469
},
{
"epoch": 0.07332436279958658,
"grad_norm": 0.24919858574867249,
"learning_rate": 0.00018719019169725472,
"loss": 1.2377,
"step": 470
},
{
"epoch": 0.07348037208213888,
"grad_norm": 0.25503799319267273,
"learning_rate": 0.00018712668032810768,
"loss": 1.3236,
"step": 471
},
{
"epoch": 0.0736363813646912,
"grad_norm": 0.28893566131591797,
"learning_rate": 0.00018706302273202943,
"loss": 1.4662,
"step": 472
},
{
"epoch": 0.07379239064724351,
"grad_norm": 0.2384706735610962,
"learning_rate": 0.00018699921901585813,
"loss": 1.2817,
"step": 473
},
{
"epoch": 0.07394839992979582,
"grad_norm": 0.2527397572994232,
"learning_rate": 0.0001869352692866772,
"loss": 1.1766,
"step": 474
},
{
"epoch": 0.07410440921234814,
"grad_norm": 0.25340378284454346,
"learning_rate": 0.00018687117365181512,
"loss": 1.1876,
"step": 475
},
{
"epoch": 0.07426041849490045,
"grad_norm": 0.2570219039916992,
"learning_rate": 0.00018680693221884517,
"loss": 1.3472,
"step": 476
},
{
"epoch": 0.07441642777745276,
"grad_norm": 0.25267085433006287,
"learning_rate": 0.00018674254509558544,
"loss": 1.5048,
"step": 477
},
{
"epoch": 0.07457243706000508,
"grad_norm": 0.24603790044784546,
"learning_rate": 0.00018667801239009846,
"loss": 1.276,
"step": 478
},
{
"epoch": 0.07472844634255738,
"grad_norm": 0.2434520423412323,
"learning_rate": 0.00018661333421069113,
"loss": 1.3999,
"step": 479
},
{
"epoch": 0.07488445562510969,
"grad_norm": 0.27032792568206787,
"learning_rate": 0.00018654851066591448,
"loss": 1.3909,
"step": 480
},
{
"epoch": 0.075040464907662,
"grad_norm": 0.26559844613075256,
"learning_rate": 0.00018648354186456348,
"loss": 1.2931,
"step": 481
},
{
"epoch": 0.07519647419021432,
"grad_norm": 0.2563202679157257,
"learning_rate": 0.000186418427915677,
"loss": 1.2773,
"step": 482
},
{
"epoch": 0.07535248347276663,
"grad_norm": 0.2463751882314682,
"learning_rate": 0.00018635316892853741,
"loss": 1.4017,
"step": 483
},
{
"epoch": 0.07550849275531894,
"grad_norm": 0.26452189683914185,
"learning_rate": 0.00018628776501267052,
"loss": 1.2236,
"step": 484
},
{
"epoch": 0.07566450203787126,
"grad_norm": 0.48540955781936646,
"learning_rate": 0.0001862222162778454,
"loss": 1.1676,
"step": 485
},
{
"epoch": 0.07582051132042357,
"grad_norm": 0.2931404411792755,
"learning_rate": 0.0001861565228340742,
"loss": 1.3877,
"step": 486
},
{
"epoch": 0.07597652060297588,
"grad_norm": 0.2707270383834839,
"learning_rate": 0.00018609068479161182,
"loss": 1.2828,
"step": 487
},
{
"epoch": 0.07613252988552818,
"grad_norm": 0.25902295112609863,
"learning_rate": 0.00018602470226095603,
"loss": 1.2393,
"step": 488
},
{
"epoch": 0.0762885391680805,
"grad_norm": 0.27907291054725647,
"learning_rate": 0.00018595857535284692,
"loss": 1.1944,
"step": 489
},
{
"epoch": 0.07644454845063281,
"grad_norm": 0.3079850375652313,
"learning_rate": 0.00018589230417826697,
"loss": 1.3686,
"step": 490
},
{
"epoch": 0.07660055773318512,
"grad_norm": 0.250303715467453,
"learning_rate": 0.00018582588884844084,
"loss": 1.2497,
"step": 491
},
{
"epoch": 0.07675656701573744,
"grad_norm": 0.260257750749588,
"learning_rate": 0.00018575932947483502,
"loss": 1.4186,
"step": 492
},
{
"epoch": 0.07691257629828975,
"grad_norm": 0.2537723481655121,
"learning_rate": 0.00018569262616915784,
"loss": 1.28,
"step": 493
},
{
"epoch": 0.07706858558084206,
"grad_norm": 0.21861004829406738,
"learning_rate": 0.00018562577904335912,
"loss": 0.9705,
"step": 494
},
{
"epoch": 0.07722459486339438,
"grad_norm": 0.322566956281662,
"learning_rate": 0.00018555878820963013,
"loss": 1.4941,
"step": 495
},
{
"epoch": 0.07738060414594668,
"grad_norm": 0.24904873967170715,
"learning_rate": 0.00018549165378040327,
"loss": 1.2277,
"step": 496
},
{
"epoch": 0.07753661342849899,
"grad_norm": 0.2692057490348816,
"learning_rate": 0.00018542437586835202,
"loss": 1.3786,
"step": 497
},
{
"epoch": 0.0776926227110513,
"grad_norm": 0.27876508235931396,
"learning_rate": 0.00018535695458639056,
"loss": 1.3822,
"step": 498
},
{
"epoch": 0.07784863199360362,
"grad_norm": 0.2497495859861374,
"learning_rate": 0.00018528939004767376,
"loss": 1.1872,
"step": 499
},
{
"epoch": 0.07800464127615593,
"grad_norm": 0.28155678510665894,
"learning_rate": 0.00018522168236559695,
"loss": 1.2253,
"step": 500
},
{
"epoch": 0.07800464127615593,
"eval_loss": 1.3168833255767822,
"eval_runtime": 110.9584,
"eval_samples_per_second": 38.51,
"eval_steps_per_second": 4.822,
"step": 500
},
{
"epoch": 0.07816065055870824,
"grad_norm": 0.25162461400032043,
"learning_rate": 0.0001851538316537956,
"loss": 1.2308,
"step": 501
},
{
"epoch": 0.07831665984126056,
"grad_norm": 0.33541133999824524,
"learning_rate": 0.0001850858380261453,
"loss": 1.2788,
"step": 502
},
{
"epoch": 0.07847266912381287,
"grad_norm": 0.29069721698760986,
"learning_rate": 0.00018501770159676156,
"loss": 1.4186,
"step": 503
},
{
"epoch": 0.07862867840636519,
"grad_norm": 0.24337412416934967,
"learning_rate": 0.0001849494224799994,
"loss": 1.2268,
"step": 504
},
{
"epoch": 0.07878468768891748,
"grad_norm": 0.2503622770309448,
"learning_rate": 0.00018488100079045344,
"loss": 1.1121,
"step": 505
},
{
"epoch": 0.0789406969714698,
"grad_norm": 0.3061240017414093,
"learning_rate": 0.0001848124366429576,
"loss": 1.4207,
"step": 506
},
{
"epoch": 0.07909670625402211,
"grad_norm": 0.3209320902824402,
"learning_rate": 0.00018474373015258473,
"loss": 1.3531,
"step": 507
},
{
"epoch": 0.07925271553657443,
"grad_norm": 0.26510298252105713,
"learning_rate": 0.0001846748814346468,
"loss": 1.1614,
"step": 508
},
{
"epoch": 0.07940872481912674,
"grad_norm": 0.24753335118293762,
"learning_rate": 0.00018460589060469425,
"loss": 1.2711,
"step": 509
},
{
"epoch": 0.07956473410167905,
"grad_norm": 0.2837298512458801,
"learning_rate": 0.00018453675777851627,
"loss": 1.2325,
"step": 510
},
{
"epoch": 0.07972074338423137,
"grad_norm": 0.30447372794151306,
"learning_rate": 0.00018446748307214019,
"loss": 1.2425,
"step": 511
},
{
"epoch": 0.07987675266678368,
"grad_norm": 0.27281391620635986,
"learning_rate": 0.0001843980666018315,
"loss": 1.3095,
"step": 512
},
{
"epoch": 0.08003276194933598,
"grad_norm": 0.27750325202941895,
"learning_rate": 0.00018432850848409363,
"loss": 1.5124,
"step": 513
},
{
"epoch": 0.08018877123188829,
"grad_norm": 0.32551145553588867,
"learning_rate": 0.00018425880883566782,
"loss": 1.5727,
"step": 514
},
{
"epoch": 0.0803447805144406,
"grad_norm": 0.29455453157424927,
"learning_rate": 0.0001841889677735327,
"loss": 1.1937,
"step": 515
},
{
"epoch": 0.08050078979699292,
"grad_norm": 0.271435022354126,
"learning_rate": 0.00018411898541490434,
"loss": 1.3523,
"step": 516
},
{
"epoch": 0.08065679907954523,
"grad_norm": 0.28192776441574097,
"learning_rate": 0.0001840488618772359,
"loss": 1.4196,
"step": 517
},
{
"epoch": 0.08081280836209755,
"grad_norm": 0.32622769474983215,
"learning_rate": 0.00018397859727821748,
"loss": 1.3939,
"step": 518
},
{
"epoch": 0.08096881764464986,
"grad_norm": 0.26916465163230896,
"learning_rate": 0.00018390819173577598,
"loss": 1.315,
"step": 519
},
{
"epoch": 0.08112482692720217,
"grad_norm": 0.2807716429233551,
"learning_rate": 0.00018383764536807485,
"loss": 1.4009,
"step": 520
},
{
"epoch": 0.08128083620975449,
"grad_norm": 0.2609405517578125,
"learning_rate": 0.00018376695829351377,
"loss": 0.9599,
"step": 521
},
{
"epoch": 0.08143684549230679,
"grad_norm": 0.27300071716308594,
"learning_rate": 0.00018369613063072874,
"loss": 1.2349,
"step": 522
},
{
"epoch": 0.0815928547748591,
"grad_norm": 0.26670917868614197,
"learning_rate": 0.00018362516249859163,
"loss": 1.2895,
"step": 523
},
{
"epoch": 0.08174886405741141,
"grad_norm": 0.2805304527282715,
"learning_rate": 0.00018355405401621001,
"loss": 1.3661,
"step": 524
},
{
"epoch": 0.08190487333996373,
"grad_norm": 0.25124502182006836,
"learning_rate": 0.00018348280530292713,
"loss": 1.2215,
"step": 525
},
{
"epoch": 0.08206088262251604,
"grad_norm": 0.2374117225408554,
"learning_rate": 0.00018341141647832147,
"loss": 1.1662,
"step": 526
},
{
"epoch": 0.08221689190506835,
"grad_norm": 0.2681942582130432,
"learning_rate": 0.00018333988766220676,
"loss": 1.3256,
"step": 527
},
{
"epoch": 0.08237290118762067,
"grad_norm": 0.26264506578445435,
"learning_rate": 0.0001832682189746316,
"loss": 1.1417,
"step": 528
},
{
"epoch": 0.08252891047017298,
"grad_norm": 0.2661115527153015,
"learning_rate": 0.00018319641053587938,
"loss": 1.2202,
"step": 529
},
{
"epoch": 0.08268491975272528,
"grad_norm": 0.23459146916866302,
"learning_rate": 0.0001831244624664681,
"loss": 1.0511,
"step": 530
},
{
"epoch": 0.0828409290352776,
"grad_norm": 0.31903690099716187,
"learning_rate": 0.00018305237488714995,
"loss": 1.565,
"step": 531
},
{
"epoch": 0.08299693831782991,
"grad_norm": 0.28528186678886414,
"learning_rate": 0.00018298014791891137,
"loss": 1.5023,
"step": 532
},
{
"epoch": 0.08315294760038222,
"grad_norm": 0.2572003901004791,
"learning_rate": 0.00018290778168297277,
"loss": 1.1518,
"step": 533
},
{
"epoch": 0.08330895688293453,
"grad_norm": 0.27797260880470276,
"learning_rate": 0.00018283527630078825,
"loss": 1.344,
"step": 534
},
{
"epoch": 0.08346496616548685,
"grad_norm": 0.3142591416835785,
"learning_rate": 0.0001827626318940454,
"loss": 1.4126,
"step": 535
},
{
"epoch": 0.08362097544803916,
"grad_norm": 0.2703491151332855,
"learning_rate": 0.00018268984858466522,
"loss": 1.2156,
"step": 536
},
{
"epoch": 0.08377698473059147,
"grad_norm": 0.29505112767219543,
"learning_rate": 0.00018261692649480175,
"loss": 1.421,
"step": 537
},
{
"epoch": 0.08393299401314379,
"grad_norm": 0.2756875157356262,
"learning_rate": 0.00018254386574684204,
"loss": 1.4858,
"step": 538
},
{
"epoch": 0.08408900329569609,
"grad_norm": 0.2744990885257721,
"learning_rate": 0.0001824706664634058,
"loss": 1.3441,
"step": 539
},
{
"epoch": 0.0842450125782484,
"grad_norm": 0.2834165096282959,
"learning_rate": 0.00018239732876734527,
"loss": 1.4142,
"step": 540
},
{
"epoch": 0.08440102186080072,
"grad_norm": 0.2717669904232025,
"learning_rate": 0.0001823238527817449,
"loss": 1.3199,
"step": 541
},
{
"epoch": 0.08455703114335303,
"grad_norm": 0.26433441042900085,
"learning_rate": 0.00018225023862992142,
"loss": 1.3197,
"step": 542
},
{
"epoch": 0.08471304042590534,
"grad_norm": 0.27460265159606934,
"learning_rate": 0.00018217648643542323,
"loss": 1.216,
"step": 543
},
{
"epoch": 0.08486904970845766,
"grad_norm": 0.26642194390296936,
"learning_rate": 0.0001821025963220306,
"loss": 1.1716,
"step": 544
},
{
"epoch": 0.08502505899100997,
"grad_norm": 0.2999640703201294,
"learning_rate": 0.00018202856841375518,
"loss": 1.394,
"step": 545
},
{
"epoch": 0.08518106827356228,
"grad_norm": 0.2676008641719818,
"learning_rate": 0.00018195440283483988,
"loss": 1.2725,
"step": 546
},
{
"epoch": 0.08533707755611458,
"grad_norm": 0.26116111874580383,
"learning_rate": 0.0001818800997097587,
"loss": 1.329,
"step": 547
},
{
"epoch": 0.0854930868386669,
"grad_norm": 0.26923874020576477,
"learning_rate": 0.00018180565916321647,
"loss": 1.2228,
"step": 548
},
{
"epoch": 0.08564909612121921,
"grad_norm": 0.2784603536128998,
"learning_rate": 0.0001817310813201486,
"loss": 1.1249,
"step": 549
},
{
"epoch": 0.08580510540377152,
"grad_norm": 0.27981552481651306,
"learning_rate": 0.0001816563663057211,
"loss": 1.2778,
"step": 550
},
{
"epoch": 0.08596111468632384,
"grad_norm": 0.2464422732591629,
"learning_rate": 0.00018158151424533002,
"loss": 1.0316,
"step": 551
},
{
"epoch": 0.08611712396887615,
"grad_norm": 0.23159442842006683,
"learning_rate": 0.00018150652526460146,
"loss": 0.9794,
"step": 552
},
{
"epoch": 0.08627313325142846,
"grad_norm": 0.28374752402305603,
"learning_rate": 0.00018143139948939137,
"loss": 1.0572,
"step": 553
},
{
"epoch": 0.08642914253398078,
"grad_norm": 0.28464943170547485,
"learning_rate": 0.00018135613704578526,
"loss": 1.024,
"step": 554
},
{
"epoch": 0.08658515181653309,
"grad_norm": 0.23248714208602905,
"learning_rate": 0.000181280738060098,
"loss": 0.9151,
"step": 555
},
{
"epoch": 0.08674116109908539,
"grad_norm": 0.2613517940044403,
"learning_rate": 0.00018120520265887363,
"loss": 1.2155,
"step": 556
},
{
"epoch": 0.0868971703816377,
"grad_norm": 0.2925867438316345,
"learning_rate": 0.00018112953096888516,
"loss": 1.2136,
"step": 557
},
{
"epoch": 0.08705317966419002,
"grad_norm": 0.3145943582057953,
"learning_rate": 0.00018105372311713432,
"loss": 1.4368,
"step": 558
},
{
"epoch": 0.08720918894674233,
"grad_norm": 0.29513052105903625,
"learning_rate": 0.0001809777792308513,
"loss": 1.4516,
"step": 559
},
{
"epoch": 0.08736519822929464,
"grad_norm": 0.22099293768405914,
"learning_rate": 0.00018090169943749476,
"loss": 1.0234,
"step": 560
},
{
"epoch": 0.08752120751184696,
"grad_norm": 0.24346297979354858,
"learning_rate": 0.0001808254838647513,
"loss": 1.3492,
"step": 561
},
{
"epoch": 0.08767721679439927,
"grad_norm": 0.2770818769931793,
"learning_rate": 0.00018074913264053545,
"loss": 1.4692,
"step": 562
},
{
"epoch": 0.08783322607695158,
"grad_norm": 0.2789641320705414,
"learning_rate": 0.00018067264589298945,
"loss": 1.3942,
"step": 563
},
{
"epoch": 0.08798923535950388,
"grad_norm": 0.2892186939716339,
"learning_rate": 0.00018059602375048293,
"loss": 1.3621,
"step": 564
},
{
"epoch": 0.0881452446420562,
"grad_norm": 0.28431588411331177,
"learning_rate": 0.00018051926634161282,
"loss": 1.3073,
"step": 565
},
{
"epoch": 0.08830125392460851,
"grad_norm": 0.3204723000526428,
"learning_rate": 0.00018044237379520305,
"loss": 1.8154,
"step": 566
},
{
"epoch": 0.08845726320716082,
"grad_norm": 0.2658674716949463,
"learning_rate": 0.0001803653462403043,
"loss": 1.1807,
"step": 567
},
{
"epoch": 0.08861327248971314,
"grad_norm": 0.2409079521894455,
"learning_rate": 0.0001802881838061939,
"loss": 1.2165,
"step": 568
},
{
"epoch": 0.08876928177226545,
"grad_norm": 0.25896573066711426,
"learning_rate": 0.00018021088662237552,
"loss": 1.1993,
"step": 569
},
{
"epoch": 0.08892529105481776,
"grad_norm": 0.27663204073905945,
"learning_rate": 0.00018013345481857903,
"loss": 1.1241,
"step": 570
},
{
"epoch": 0.08908130033737008,
"grad_norm": 0.2892790734767914,
"learning_rate": 0.00018005588852476015,
"loss": 1.6163,
"step": 571
},
{
"epoch": 0.08923730961992239,
"grad_norm": 0.30898550152778625,
"learning_rate": 0.00017997818787110042,
"loss": 1.2483,
"step": 572
},
{
"epoch": 0.08939331890247469,
"grad_norm": 0.23732271790504456,
"learning_rate": 0.0001799003529880068,
"loss": 1.1204,
"step": 573
},
{
"epoch": 0.089549328185027,
"grad_norm": 0.2597337067127228,
"learning_rate": 0.0001798223840061116,
"loss": 1.258,
"step": 574
},
{
"epoch": 0.08970533746757932,
"grad_norm": 0.31342512369155884,
"learning_rate": 0.00017974428105627208,
"loss": 1.4074,
"step": 575
},
{
"epoch": 0.08986134675013163,
"grad_norm": 0.30252331495285034,
"learning_rate": 0.00017966604426957047,
"loss": 1.2059,
"step": 576
},
{
"epoch": 0.09001735603268395,
"grad_norm": 0.29326415061950684,
"learning_rate": 0.00017958767377731358,
"loss": 1.4294,
"step": 577
},
{
"epoch": 0.09017336531523626,
"grad_norm": 0.2915484607219696,
"learning_rate": 0.00017950916971103259,
"loss": 1.3728,
"step": 578
},
{
"epoch": 0.09032937459778857,
"grad_norm": 0.2966526746749878,
"learning_rate": 0.00017943053220248283,
"loss": 1.5332,
"step": 579
},
{
"epoch": 0.09048538388034089,
"grad_norm": 0.24311012029647827,
"learning_rate": 0.0001793517613836437,
"loss": 1.1254,
"step": 580
},
{
"epoch": 0.09064139316289319,
"grad_norm": 0.2950594127178192,
"learning_rate": 0.00017927285738671825,
"loss": 1.7255,
"step": 581
},
{
"epoch": 0.0907974024454455,
"grad_norm": 0.24679097533226013,
"learning_rate": 0.00017919382034413305,
"loss": 1.2781,
"step": 582
},
{
"epoch": 0.09095341172799781,
"grad_norm": 0.2747292220592499,
"learning_rate": 0.00017911465038853805,
"loss": 1.3434,
"step": 583
},
{
"epoch": 0.09110942101055013,
"grad_norm": 0.30099523067474365,
"learning_rate": 0.00017903534765280614,
"loss": 1.4518,
"step": 584
},
{
"epoch": 0.09126543029310244,
"grad_norm": 0.2866073548793793,
"learning_rate": 0.00017895591227003315,
"loss": 1.1706,
"step": 585
},
{
"epoch": 0.09142143957565475,
"grad_norm": 0.28832805156707764,
"learning_rate": 0.00017887634437353754,
"loss": 1.2271,
"step": 586
},
{
"epoch": 0.09157744885820707,
"grad_norm": 0.3714962601661682,
"learning_rate": 0.00017879664409686008,
"loss": 1.4474,
"step": 587
},
{
"epoch": 0.09173345814075938,
"grad_norm": 0.30591243505477905,
"learning_rate": 0.00017871681157376383,
"loss": 1.0327,
"step": 588
},
{
"epoch": 0.0918894674233117,
"grad_norm": 0.3032775819301605,
"learning_rate": 0.00017863684693823374,
"loss": 1.6824,
"step": 589
},
{
"epoch": 0.092045476705864,
"grad_norm": 0.26961666345596313,
"learning_rate": 0.00017855675032447648,
"loss": 1.1249,
"step": 590
},
{
"epoch": 0.0922014859884163,
"grad_norm": 0.2679152488708496,
"learning_rate": 0.00017847652186692026,
"loss": 1.2182,
"step": 591
},
{
"epoch": 0.09235749527096862,
"grad_norm": 0.24089114367961884,
"learning_rate": 0.00017839616170021452,
"loss": 1.1095,
"step": 592
},
{
"epoch": 0.09251350455352093,
"grad_norm": 0.25100457668304443,
"learning_rate": 0.00017831566995922985,
"loss": 1.1441,
"step": 593
},
{
"epoch": 0.09266951383607325,
"grad_norm": 0.2766099274158478,
"learning_rate": 0.0001782350467790575,
"loss": 1.1893,
"step": 594
},
{
"epoch": 0.09282552311862556,
"grad_norm": 0.2666013240814209,
"learning_rate": 0.00017815429229500946,
"loss": 1.1802,
"step": 595
},
{
"epoch": 0.09298153240117787,
"grad_norm": 0.28148403763771057,
"learning_rate": 0.00017807340664261802,
"loss": 1.3232,
"step": 596
},
{
"epoch": 0.09313754168373019,
"grad_norm": 0.23684674501419067,
"learning_rate": 0.00017799238995763568,
"loss": 1.1869,
"step": 597
},
{
"epoch": 0.09329355096628249,
"grad_norm": 0.2614571154117584,
"learning_rate": 0.00017791124237603477,
"loss": 1.4023,
"step": 598
},
{
"epoch": 0.0934495602488348,
"grad_norm": 0.3051559329032898,
"learning_rate": 0.00017782996403400736,
"loss": 1.407,
"step": 599
},
{
"epoch": 0.09360556953138711,
"grad_norm": 0.2745681405067444,
"learning_rate": 0.00017774855506796496,
"loss": 1.3265,
"step": 600
},
{
"epoch": 0.09376157881393943,
"grad_norm": 0.2689257860183716,
"learning_rate": 0.0001776670156145383,
"loss": 1.3046,
"step": 601
},
{
"epoch": 0.09391758809649174,
"grad_norm": 0.29333195090293884,
"learning_rate": 0.00017758534581057718,
"loss": 1.2624,
"step": 602
},
{
"epoch": 0.09407359737904405,
"grad_norm": 0.30287420749664307,
"learning_rate": 0.00017750354579315004,
"loss": 1.28,
"step": 603
},
{
"epoch": 0.09422960666159637,
"grad_norm": 0.27796801924705505,
"learning_rate": 0.00017742161569954398,
"loss": 1.3305,
"step": 604
},
{
"epoch": 0.09438561594414868,
"grad_norm": 0.2703540325164795,
"learning_rate": 0.0001773395556672644,
"loss": 1.4356,
"step": 605
},
{
"epoch": 0.094541625226701,
"grad_norm": 0.26395589113235474,
"learning_rate": 0.0001772573658340347,
"loss": 1.1984,
"step": 606
},
{
"epoch": 0.0946976345092533,
"grad_norm": 0.2784560024738312,
"learning_rate": 0.0001771750463377962,
"loss": 1.3625,
"step": 607
},
{
"epoch": 0.09485364379180561,
"grad_norm": 0.31962451338768005,
"learning_rate": 0.00017709259731670774,
"loss": 1.3956,
"step": 608
},
{
"epoch": 0.09500965307435792,
"grad_norm": 0.274460107088089,
"learning_rate": 0.00017701001890914572,
"loss": 1.3071,
"step": 609
},
{
"epoch": 0.09516566235691024,
"grad_norm": 0.25924167037010193,
"learning_rate": 0.00017692731125370354,
"loss": 1.034,
"step": 610
},
{
"epoch": 0.09532167163946255,
"grad_norm": 0.3091680705547333,
"learning_rate": 0.00017684447448919154,
"loss": 1.4134,
"step": 611
},
{
"epoch": 0.09547768092201486,
"grad_norm": 0.25753480195999146,
"learning_rate": 0.00017676150875463686,
"loss": 1.2074,
"step": 612
},
{
"epoch": 0.09563369020456718,
"grad_norm": 0.27256032824516296,
"learning_rate": 0.0001766784141892829,
"loss": 1.3758,
"step": 613
},
{
"epoch": 0.09578969948711949,
"grad_norm": 0.24764277040958405,
"learning_rate": 0.0001765951909325895,
"loss": 1.0436,
"step": 614
},
{
"epoch": 0.09594570876967179,
"grad_norm": 0.2722652554512024,
"learning_rate": 0.00017651183912423228,
"loss": 1.3623,
"step": 615
},
{
"epoch": 0.0961017180522241,
"grad_norm": 0.27056217193603516,
"learning_rate": 0.0001764283589041028,
"loss": 1.2525,
"step": 616
},
{
"epoch": 0.09625772733477642,
"grad_norm": 0.27987945079803467,
"learning_rate": 0.00017634475041230797,
"loss": 1.5075,
"step": 617
},
{
"epoch": 0.09641373661732873,
"grad_norm": 0.29397958517074585,
"learning_rate": 0.00017626101378917004,
"loss": 1.3681,
"step": 618
},
{
"epoch": 0.09656974589988104,
"grad_norm": 0.2876337766647339,
"learning_rate": 0.0001761771491752264,
"loss": 1.5848,
"step": 619
},
{
"epoch": 0.09672575518243336,
"grad_norm": 0.237448051571846,
"learning_rate": 0.0001760931567112291,
"loss": 1.0918,
"step": 620
},
{
"epoch": 0.09688176446498567,
"grad_norm": 0.29513096809387207,
"learning_rate": 0.0001760090365381449,
"loss": 1.3236,
"step": 621
},
{
"epoch": 0.09703777374753798,
"grad_norm": 0.263920396566391,
"learning_rate": 0.0001759247887971548,
"loss": 1.4573,
"step": 622
},
{
"epoch": 0.0971937830300903,
"grad_norm": 0.31876271963119507,
"learning_rate": 0.00017584041362965396,
"loss": 1.3874,
"step": 623
},
{
"epoch": 0.0973497923126426,
"grad_norm": 0.30635690689086914,
"learning_rate": 0.0001757559111772513,
"loss": 1.2355,
"step": 624
},
{
"epoch": 0.09750580159519491,
"grad_norm": 0.25926241278648376,
"learning_rate": 0.00017567128158176953,
"loss": 1.2641,
"step": 625
},
{
"epoch": 0.09766181087774722,
"grad_norm": 0.2862091660499573,
"learning_rate": 0.0001755865249852446,
"loss": 1.3818,
"step": 626
},
{
"epoch": 0.09781782016029954,
"grad_norm": 0.2540535628795624,
"learning_rate": 0.00017550164152992573,
"loss": 1.3807,
"step": 627
},
{
"epoch": 0.09797382944285185,
"grad_norm": 0.30917900800704956,
"learning_rate": 0.00017541663135827492,
"loss": 1.1053,
"step": 628
},
{
"epoch": 0.09812983872540416,
"grad_norm": 0.30465036630630493,
"learning_rate": 0.000175331494612967,
"loss": 1.4489,
"step": 629
},
{
"epoch": 0.09828584800795648,
"grad_norm": 0.3043782711029053,
"learning_rate": 0.00017524623143688902,
"loss": 1.4544,
"step": 630
},
{
"epoch": 0.09844185729050879,
"grad_norm": 0.2681322991847992,
"learning_rate": 0.00017516084197314046,
"loss": 1.1926,
"step": 631
},
{
"epoch": 0.09859786657306109,
"grad_norm": 0.33450305461883545,
"learning_rate": 0.00017507532636503256,
"loss": 1.4383,
"step": 632
},
{
"epoch": 0.0987538758556134,
"grad_norm": 0.2626807987689972,
"learning_rate": 0.00017498968475608838,
"loss": 1.1565,
"step": 633
},
{
"epoch": 0.09890988513816572,
"grad_norm": 0.2553156912326813,
"learning_rate": 0.00017490391729004244,
"loss": 1.1327,
"step": 634
},
{
"epoch": 0.09906589442071803,
"grad_norm": 0.23390045762062073,
"learning_rate": 0.00017481802411084042,
"loss": 0.9856,
"step": 635
},
{
"epoch": 0.09922190370327034,
"grad_norm": 0.29881760478019714,
"learning_rate": 0.00017473200536263905,
"loss": 1.362,
"step": 636
},
{
"epoch": 0.09937791298582266,
"grad_norm": 0.2904150187969208,
"learning_rate": 0.0001746458611898058,
"loss": 1.242,
"step": 637
},
{
"epoch": 0.09953392226837497,
"grad_norm": 0.24842409789562225,
"learning_rate": 0.00017455959173691863,
"loss": 1.2694,
"step": 638
},
{
"epoch": 0.09968993155092729,
"grad_norm": 0.3337212800979614,
"learning_rate": 0.00017447319714876579,
"loss": 1.2554,
"step": 639
},
{
"epoch": 0.0998459408334796,
"grad_norm": 0.24105407297611237,
"learning_rate": 0.00017438667757034546,
"loss": 1.0582,
"step": 640
},
{
"epoch": 0.1000019501160319,
"grad_norm": 0.24266989529132843,
"learning_rate": 0.00017430003314686569,
"loss": 1.2125,
"step": 641
},
{
"epoch": 0.10015795939858421,
"grad_norm": 0.2654808461666107,
"learning_rate": 0.00017421326402374405,
"loss": 1.3229,
"step": 642
},
{
"epoch": 0.10031396868113653,
"grad_norm": 0.21931445598602295,
"learning_rate": 0.00017412637034660734,
"loss": 1.1168,
"step": 643
},
{
"epoch": 0.10046997796368884,
"grad_norm": 0.28860512375831604,
"learning_rate": 0.0001740393522612915,
"loss": 1.3681,
"step": 644
},
{
"epoch": 0.10062598724624115,
"grad_norm": 0.2736460566520691,
"learning_rate": 0.0001739522099138411,
"loss": 1.4054,
"step": 645
},
{
"epoch": 0.10078199652879347,
"grad_norm": 0.23222267627716064,
"learning_rate": 0.00017386494345050942,
"loss": 1.0973,
"step": 646
},
{
"epoch": 0.10093800581134578,
"grad_norm": 0.2684474587440491,
"learning_rate": 0.000173777553017758,
"loss": 1.0637,
"step": 647
},
{
"epoch": 0.10109401509389809,
"grad_norm": 0.2648880183696747,
"learning_rate": 0.00017369003876225642,
"loss": 1.5162,
"step": 648
},
{
"epoch": 0.10125002437645039,
"grad_norm": 0.26263687014579773,
"learning_rate": 0.00017360240083088213,
"loss": 1.3613,
"step": 649
},
{
"epoch": 0.1014060336590027,
"grad_norm": 0.2455459088087082,
"learning_rate": 0.00017351463937072004,
"loss": 1.3927,
"step": 650
},
{
"epoch": 0.10156204294155502,
"grad_norm": 0.273078590631485,
"learning_rate": 0.00017342675452906248,
"loss": 1.2485,
"step": 651
},
{
"epoch": 0.10171805222410733,
"grad_norm": 0.24480541050434113,
"learning_rate": 0.00017333874645340884,
"loss": 1.0656,
"step": 652
},
{
"epoch": 0.10187406150665965,
"grad_norm": 0.24994470179080963,
"learning_rate": 0.0001732506152914653,
"loss": 1.3653,
"step": 653
},
{
"epoch": 0.10203007078921196,
"grad_norm": 0.26110485196113586,
"learning_rate": 0.00017316236119114463,
"loss": 1.392,
"step": 654
},
{
"epoch": 0.10218608007176427,
"grad_norm": 0.30197709798812866,
"learning_rate": 0.00017307398430056593,
"loss": 1.5184,
"step": 655
},
{
"epoch": 0.10234208935431659,
"grad_norm": 0.26577743887901306,
"learning_rate": 0.00017298548476805446,
"loss": 1.4611,
"step": 656
},
{
"epoch": 0.10249809863686889,
"grad_norm": 0.2677333950996399,
"learning_rate": 0.00017289686274214118,
"loss": 1.3282,
"step": 657
},
{
"epoch": 0.1026541079194212,
"grad_norm": 0.2508523762226105,
"learning_rate": 0.00017280811837156268,
"loss": 1.1331,
"step": 658
},
{
"epoch": 0.10281011720197351,
"grad_norm": 0.24873429536819458,
"learning_rate": 0.00017271925180526094,
"loss": 1.1351,
"step": 659
},
{
"epoch": 0.10296612648452583,
"grad_norm": 0.2559413015842438,
"learning_rate": 0.00017263026319238301,
"loss": 1.245,
"step": 660
},
{
"epoch": 0.10312213576707814,
"grad_norm": 0.29988738894462585,
"learning_rate": 0.0001725411526822807,
"loss": 1.4004,
"step": 661
},
{
"epoch": 0.10327814504963045,
"grad_norm": 0.29719191789627075,
"learning_rate": 0.0001724519204245105,
"loss": 1.5687,
"step": 662
},
{
"epoch": 0.10343415433218277,
"grad_norm": 0.30810216069221497,
"learning_rate": 0.0001723625665688331,
"loss": 1.3712,
"step": 663
},
{
"epoch": 0.10359016361473508,
"grad_norm": 0.2754259407520294,
"learning_rate": 0.00017227309126521348,
"loss": 1.2083,
"step": 664
},
{
"epoch": 0.1037461728972874,
"grad_norm": 0.26548734307289124,
"learning_rate": 0.00017218349466382023,
"loss": 1.2657,
"step": 665
},
{
"epoch": 0.1039021821798397,
"grad_norm": 0.26369354128837585,
"learning_rate": 0.00017209377691502565,
"loss": 1.3359,
"step": 666
},
{
"epoch": 0.10405819146239201,
"grad_norm": 0.2526211440563202,
"learning_rate": 0.0001720039381694053,
"loss": 1.0633,
"step": 667
},
{
"epoch": 0.10421420074494432,
"grad_norm": 0.2874252498149872,
"learning_rate": 0.00017191397857773788,
"loss": 1.2833,
"step": 668
},
{
"epoch": 0.10437021002749663,
"grad_norm": 0.26982390880584717,
"learning_rate": 0.00017182389829100485,
"loss": 1.1843,
"step": 669
},
{
"epoch": 0.10452621931004895,
"grad_norm": 0.29615074396133423,
"learning_rate": 0.00017173369746039025,
"loss": 1.2992,
"step": 670
},
{
"epoch": 0.10468222859260126,
"grad_norm": 0.29073938727378845,
"learning_rate": 0.00017164337623728045,
"loss": 1.5432,
"step": 671
},
{
"epoch": 0.10483823787515358,
"grad_norm": 0.2858506143093109,
"learning_rate": 0.00017155293477326384,
"loss": 1.4446,
"step": 672
},
{
"epoch": 0.10499424715770589,
"grad_norm": 0.2399512678384781,
"learning_rate": 0.00017146237322013068,
"loss": 1.1643,
"step": 673
},
{
"epoch": 0.10515025644025819,
"grad_norm": 0.2796498239040375,
"learning_rate": 0.00017137169172987268,
"loss": 1.3158,
"step": 674
},
{
"epoch": 0.1053062657228105,
"grad_norm": 0.26859599351882935,
"learning_rate": 0.00017128089045468294,
"loss": 1.1761,
"step": 675
},
{
"epoch": 0.10546227500536282,
"grad_norm": 0.2749616503715515,
"learning_rate": 0.00017118996954695553,
"loss": 1.0586,
"step": 676
},
{
"epoch": 0.10561828428791513,
"grad_norm": 0.27312207221984863,
"learning_rate": 0.00017109892915928535,
"loss": 1.1367,
"step": 677
},
{
"epoch": 0.10577429357046744,
"grad_norm": 0.29626578092575073,
"learning_rate": 0.00017100776944446781,
"loss": 1.4223,
"step": 678
},
{
"epoch": 0.10593030285301976,
"grad_norm": 0.24335867166519165,
"learning_rate": 0.00017091649055549855,
"loss": 1.1041,
"step": 679
},
{
"epoch": 0.10608631213557207,
"grad_norm": 0.3017411530017853,
"learning_rate": 0.0001708250926455733,
"loss": 1.2854,
"step": 680
},
{
"epoch": 0.10624232141812438,
"grad_norm": 0.2864495515823364,
"learning_rate": 0.00017073357586808752,
"loss": 1.2539,
"step": 681
},
{
"epoch": 0.1063983307006767,
"grad_norm": 0.27407294511795044,
"learning_rate": 0.0001706419403766361,
"loss": 1.3136,
"step": 682
},
{
"epoch": 0.106554339983229,
"grad_norm": 0.3100734055042267,
"learning_rate": 0.00017055018632501325,
"loss": 1.3231,
"step": 683
},
{
"epoch": 0.10671034926578131,
"grad_norm": 0.3091520071029663,
"learning_rate": 0.00017045831386721213,
"loss": 1.3513,
"step": 684
},
{
"epoch": 0.10686635854833362,
"grad_norm": 0.2930145561695099,
"learning_rate": 0.00017036632315742462,
"loss": 1.3292,
"step": 685
},
{
"epoch": 0.10702236783088594,
"grad_norm": 0.30808883905410767,
"learning_rate": 0.00017027421435004112,
"loss": 1.6094,
"step": 686
},
{
"epoch": 0.10717837711343825,
"grad_norm": 0.2715398073196411,
"learning_rate": 0.00017018198759965016,
"loss": 1.3641,
"step": 687
},
{
"epoch": 0.10733438639599056,
"grad_norm": 0.2844456732273102,
"learning_rate": 0.00017008964306103823,
"loss": 1.3933,
"step": 688
},
{
"epoch": 0.10749039567854288,
"grad_norm": 0.258504718542099,
"learning_rate": 0.00016999718088918955,
"loss": 1.0621,
"step": 689
},
{
"epoch": 0.10764640496109519,
"grad_norm": 0.28674831986427307,
"learning_rate": 0.00016990460123928575,
"loss": 1.2759,
"step": 690
},
{
"epoch": 0.10780241424364749,
"grad_norm": 0.3062899708747864,
"learning_rate": 0.0001698119042667056,
"loss": 1.1537,
"step": 691
},
{
"epoch": 0.1079584235261998,
"grad_norm": 0.2539708614349365,
"learning_rate": 0.00016971909012702483,
"loss": 1.1463,
"step": 692
},
{
"epoch": 0.10811443280875212,
"grad_norm": 0.30207210779190063,
"learning_rate": 0.00016962615897601573,
"loss": 1.4219,
"step": 693
},
{
"epoch": 0.10827044209130443,
"grad_norm": 0.28675806522369385,
"learning_rate": 0.00016953311096964705,
"loss": 1.1476,
"step": 694
},
{
"epoch": 0.10842645137385674,
"grad_norm": 0.33274316787719727,
"learning_rate": 0.00016943994626408363,
"loss": 1.3351,
"step": 695
},
{
"epoch": 0.10858246065640906,
"grad_norm": 0.2725004553794861,
"learning_rate": 0.00016934666501568617,
"loss": 1.1795,
"step": 696
},
{
"epoch": 0.10873846993896137,
"grad_norm": 0.29064077138900757,
"learning_rate": 0.00016925326738101098,
"loss": 1.4255,
"step": 697
},
{
"epoch": 0.10889447922151368,
"grad_norm": 0.3007811903953552,
"learning_rate": 0.00016915975351680968,
"loss": 1.1951,
"step": 698
},
{
"epoch": 0.109050488504066,
"grad_norm": 0.26098549365997314,
"learning_rate": 0.000169066123580029,
"loss": 1.0585,
"step": 699
},
{
"epoch": 0.1092064977866183,
"grad_norm": 0.36355966329574585,
"learning_rate": 0.00016897237772781044,
"loss": 1.2911,
"step": 700
},
{
"epoch": 0.10936250706917061,
"grad_norm": 0.2830749750137329,
"learning_rate": 0.00016887851611749005,
"loss": 1.4469,
"step": 701
},
{
"epoch": 0.10951851635172292,
"grad_norm": 0.3175537884235382,
"learning_rate": 0.00016878453890659814,
"loss": 1.4589,
"step": 702
},
{
"epoch": 0.10967452563427524,
"grad_norm": 0.2898159623146057,
"learning_rate": 0.0001686904462528591,
"loss": 1.4318,
"step": 703
},
{
"epoch": 0.10983053491682755,
"grad_norm": 0.28991106152534485,
"learning_rate": 0.000168596238314191,
"loss": 1.3293,
"step": 704
},
{
"epoch": 0.10998654419937987,
"grad_norm": 0.27654772996902466,
"learning_rate": 0.00016850191524870546,
"loss": 1.4909,
"step": 705
},
{
"epoch": 0.11014255348193218,
"grad_norm": 0.29537513852119446,
"learning_rate": 0.00016840747721470731,
"loss": 1.4512,
"step": 706
},
{
"epoch": 0.11029856276448449,
"grad_norm": 0.2656291723251343,
"learning_rate": 0.00016831292437069427,
"loss": 1.0375,
"step": 707
},
{
"epoch": 0.11045457204703679,
"grad_norm": 0.3286688029766083,
"learning_rate": 0.00016821825687535674,
"loss": 1.3478,
"step": 708
},
{
"epoch": 0.1106105813295891,
"grad_norm": 0.2618601322174072,
"learning_rate": 0.00016812347488757772,
"loss": 1.3448,
"step": 709
},
{
"epoch": 0.11076659061214142,
"grad_norm": 0.29108762741088867,
"learning_rate": 0.00016802857856643215,
"loss": 1.3479,
"step": 710
},
{
"epoch": 0.11092259989469373,
"grad_norm": 0.3029685914516449,
"learning_rate": 0.00016793356807118695,
"loss": 1.2162,
"step": 711
},
{
"epoch": 0.11107860917724605,
"grad_norm": 0.2573980689048767,
"learning_rate": 0.00016783844356130071,
"loss": 1.0927,
"step": 712
},
{
"epoch": 0.11123461845979836,
"grad_norm": 0.2836451828479767,
"learning_rate": 0.0001677432051964233,
"loss": 1.2136,
"step": 713
},
{
"epoch": 0.11139062774235067,
"grad_norm": 0.2437037229537964,
"learning_rate": 0.0001676478531363957,
"loss": 1.0671,
"step": 714
},
{
"epoch": 0.11154663702490299,
"grad_norm": 0.2603608965873718,
"learning_rate": 0.00016755238754124965,
"loss": 1.2128,
"step": 715
},
{
"epoch": 0.1117026463074553,
"grad_norm": 0.2617943286895752,
"learning_rate": 0.00016745680857120757,
"loss": 1.3305,
"step": 716
},
{
"epoch": 0.1118586555900076,
"grad_norm": 0.27264609932899475,
"learning_rate": 0.00016736111638668204,
"loss": 1.3456,
"step": 717
},
{
"epoch": 0.11201466487255991,
"grad_norm": 0.33472567796707153,
"learning_rate": 0.00016726531114827573,
"loss": 1.2517,
"step": 718
},
{
"epoch": 0.11217067415511223,
"grad_norm": 0.2825791835784912,
"learning_rate": 0.00016716939301678098,
"loss": 1.3156,
"step": 719
},
{
"epoch": 0.11232668343766454,
"grad_norm": 0.2815983295440674,
"learning_rate": 0.00016707336215317968,
"loss": 1.2376,
"step": 720
},
{
"epoch": 0.11248269272021685,
"grad_norm": 0.3158409595489502,
"learning_rate": 0.00016697721871864284,
"loss": 1.5252,
"step": 721
},
{
"epoch": 0.11263870200276917,
"grad_norm": 0.27121129631996155,
"learning_rate": 0.00016688096287453046,
"loss": 1.3603,
"step": 722
},
{
"epoch": 0.11279471128532148,
"grad_norm": 0.2568758428096771,
"learning_rate": 0.00016678459478239118,
"loss": 1.1337,
"step": 723
},
{
"epoch": 0.1129507205678738,
"grad_norm": 0.26672929525375366,
"learning_rate": 0.00016668811460396202,
"loss": 1.1728,
"step": 724
},
{
"epoch": 0.1131067298504261,
"grad_norm": 0.2683919370174408,
"learning_rate": 0.00016659152250116812,
"loss": 1.2833,
"step": 725
},
{
"epoch": 0.11326273913297841,
"grad_norm": 0.2757527232170105,
"learning_rate": 0.00016649481863612248,
"loss": 1.0544,
"step": 726
},
{
"epoch": 0.11341874841553072,
"grad_norm": 0.2571371793746948,
"learning_rate": 0.0001663980031711257,
"loss": 1.1212,
"step": 727
},
{
"epoch": 0.11357475769808303,
"grad_norm": 0.2757047116756439,
"learning_rate": 0.00016630107626866558,
"loss": 1.1771,
"step": 728
},
{
"epoch": 0.11373076698063535,
"grad_norm": 0.262979120016098,
"learning_rate": 0.00016620403809141705,
"loss": 0.9962,
"step": 729
},
{
"epoch": 0.11388677626318766,
"grad_norm": 0.26567909121513367,
"learning_rate": 0.00016610688880224178,
"loss": 1.3037,
"step": 730
},
{
"epoch": 0.11404278554573997,
"grad_norm": 0.27931660413742065,
"learning_rate": 0.00016600962856418782,
"loss": 1.1863,
"step": 731
},
{
"epoch": 0.11419879482829229,
"grad_norm": 0.25071558356285095,
"learning_rate": 0.00016591225754048963,
"loss": 1.1437,
"step": 732
},
{
"epoch": 0.1143548041108446,
"grad_norm": 0.2775113880634308,
"learning_rate": 0.00016581477589456734,
"loss": 1.2152,
"step": 733
},
{
"epoch": 0.1145108133933969,
"grad_norm": 0.25055718421936035,
"learning_rate": 0.00016571718379002705,
"loss": 1.1479,
"step": 734
},
{
"epoch": 0.11466682267594921,
"grad_norm": 0.25468993186950684,
"learning_rate": 0.00016561948139065996,
"loss": 1.148,
"step": 735
},
{
"epoch": 0.11482283195850153,
"grad_norm": 0.26385918259620667,
"learning_rate": 0.00016552166886044253,
"loss": 1.3473,
"step": 736
},
{
"epoch": 0.11497884124105384,
"grad_norm": 0.27051180601119995,
"learning_rate": 0.00016542374636353604,
"loss": 1.196,
"step": 737
},
{
"epoch": 0.11513485052360616,
"grad_norm": 0.32731276750564575,
"learning_rate": 0.0001653257140642863,
"loss": 1.4514,
"step": 738
},
{
"epoch": 0.11529085980615847,
"grad_norm": 0.26046180725097656,
"learning_rate": 0.00016522757212722344,
"loss": 1.2186,
"step": 739
},
{
"epoch": 0.11544686908871078,
"grad_norm": 0.2661746144294739,
"learning_rate": 0.00016512932071706152,
"loss": 1.123,
"step": 740
},
{
"epoch": 0.1156028783712631,
"grad_norm": 0.25739923119544983,
"learning_rate": 0.0001650309599986985,
"loss": 1.1832,
"step": 741
},
{
"epoch": 0.1157588876538154,
"grad_norm": 0.30230990052223206,
"learning_rate": 0.00016493249013721558,
"loss": 1.5064,
"step": 742
},
{
"epoch": 0.11591489693636771,
"grad_norm": 0.25831449031829834,
"learning_rate": 0.00016483391129787727,
"loss": 1.1212,
"step": 743
},
{
"epoch": 0.11607090621892002,
"grad_norm": 0.24019654095172882,
"learning_rate": 0.000164735223646131,
"loss": 1.1555,
"step": 744
},
{
"epoch": 0.11622691550147234,
"grad_norm": 0.28396427631378174,
"learning_rate": 0.0001646364273476067,
"loss": 1.4754,
"step": 745
},
{
"epoch": 0.11638292478402465,
"grad_norm": 0.28211066126823425,
"learning_rate": 0.00016453752256811674,
"loss": 1.526,
"step": 746
},
{
"epoch": 0.11653893406657696,
"grad_norm": 0.2596474289894104,
"learning_rate": 0.00016443850947365558,
"loss": 1.2072,
"step": 747
},
{
"epoch": 0.11669494334912928,
"grad_norm": 0.25947293639183044,
"learning_rate": 0.0001643393882303994,
"loss": 1.3467,
"step": 748
},
{
"epoch": 0.11685095263168159,
"grad_norm": 0.30946600437164307,
"learning_rate": 0.00016424015900470587,
"loss": 1.3948,
"step": 749
},
{
"epoch": 0.1170069619142339,
"grad_norm": 0.3172161281108856,
"learning_rate": 0.000164140821963114,
"loss": 1.745,
"step": 750
},
{
"epoch": 0.1171629711967862,
"grad_norm": 0.26674196124076843,
"learning_rate": 0.00016404137727234365,
"loss": 1.5021,
"step": 751
},
{
"epoch": 0.11731898047933852,
"grad_norm": 0.26941999793052673,
"learning_rate": 0.00016394182509929536,
"loss": 1.2651,
"step": 752
},
{
"epoch": 0.11747498976189083,
"grad_norm": 0.29353249073028564,
"learning_rate": 0.00016384216561105014,
"loss": 1.2397,
"step": 753
},
{
"epoch": 0.11763099904444314,
"grad_norm": 0.2547638416290283,
"learning_rate": 0.000163742398974869,
"loss": 1.1032,
"step": 754
},
{
"epoch": 0.11778700832699546,
"grad_norm": 0.25621354579925537,
"learning_rate": 0.00016364252535819282,
"loss": 1.0842,
"step": 755
},
{
"epoch": 0.11794301760954777,
"grad_norm": 0.25465261936187744,
"learning_rate": 0.00016354254492864211,
"loss": 0.9941,
"step": 756
},
{
"epoch": 0.11809902689210008,
"grad_norm": 0.25726544857025146,
"learning_rate": 0.00016344245785401653,
"loss": 1.2613,
"step": 757
},
{
"epoch": 0.1182550361746524,
"grad_norm": 0.2696760594844818,
"learning_rate": 0.00016334226430229475,
"loss": 1.1349,
"step": 758
},
{
"epoch": 0.1184110454572047,
"grad_norm": 0.29465997219085693,
"learning_rate": 0.00016324196444163423,
"loss": 1.3099,
"step": 759
},
{
"epoch": 0.11856705473975701,
"grad_norm": 0.2854841351509094,
"learning_rate": 0.00016314155844037074,
"loss": 1.1648,
"step": 760
},
{
"epoch": 0.11872306402230932,
"grad_norm": 0.28557366132736206,
"learning_rate": 0.0001630410464670182,
"loss": 1.4045,
"step": 761
},
{
"epoch": 0.11887907330486164,
"grad_norm": 0.337882936000824,
"learning_rate": 0.00016294042869026851,
"loss": 1.4391,
"step": 762
},
{
"epoch": 0.11903508258741395,
"grad_norm": 0.25410857796669006,
"learning_rate": 0.000162839705278991,
"loss": 1.025,
"step": 763
},
{
"epoch": 0.11919109186996626,
"grad_norm": 0.2944369614124298,
"learning_rate": 0.0001627388764022323,
"loss": 1.3339,
"step": 764
},
{
"epoch": 0.11934710115251858,
"grad_norm": 0.30941835045814514,
"learning_rate": 0.0001626379422292162,
"loss": 1.5238,
"step": 765
},
{
"epoch": 0.11950311043507089,
"grad_norm": 0.2796765863895416,
"learning_rate": 0.000162536902929343,
"loss": 1.1711,
"step": 766
},
{
"epoch": 0.1196591197176232,
"grad_norm": 0.2882195711135864,
"learning_rate": 0.00016243575867218958,
"loss": 1.2852,
"step": 767
},
{
"epoch": 0.1198151290001755,
"grad_norm": 0.29050207138061523,
"learning_rate": 0.00016233450962750893,
"loss": 1.2789,
"step": 768
},
{
"epoch": 0.11997113828272782,
"grad_norm": 0.2745670974254608,
"learning_rate": 0.00016223315596522987,
"loss": 1.2741,
"step": 769
},
{
"epoch": 0.12012714756528013,
"grad_norm": 0.29764166474342346,
"learning_rate": 0.0001621316978554569,
"loss": 1.3636,
"step": 770
},
{
"epoch": 0.12028315684783245,
"grad_norm": 0.29131025075912476,
"learning_rate": 0.00016203013546846966,
"loss": 1.5137,
"step": 771
},
{
"epoch": 0.12043916613038476,
"grad_norm": 0.3370944857597351,
"learning_rate": 0.00016192846897472297,
"loss": 1.5541,
"step": 772
},
{
"epoch": 0.12059517541293707,
"grad_norm": 0.2678642272949219,
"learning_rate": 0.0001618266985448463,
"loss": 1.2024,
"step": 773
},
{
"epoch": 0.12075118469548939,
"grad_norm": 0.27655884623527527,
"learning_rate": 0.00016172482434964353,
"loss": 1.1084,
"step": 774
},
{
"epoch": 0.1209071939780417,
"grad_norm": 0.23235641419887543,
"learning_rate": 0.00016162284656009274,
"loss": 0.8548,
"step": 775
},
{
"epoch": 0.121063203260594,
"grad_norm": 0.2860414683818817,
"learning_rate": 0.00016152076534734584,
"loss": 1.5026,
"step": 776
},
{
"epoch": 0.12121921254314631,
"grad_norm": 0.2980406582355499,
"learning_rate": 0.00016141858088272837,
"loss": 1.3692,
"step": 777
},
{
"epoch": 0.12137522182569863,
"grad_norm": 0.29564347863197327,
"learning_rate": 0.00016131629333773908,
"loss": 1.6193,
"step": 778
},
{
"epoch": 0.12153123110825094,
"grad_norm": 0.250028520822525,
"learning_rate": 0.0001612139028840498,
"loss": 1.3295,
"step": 779
},
{
"epoch": 0.12168724039080325,
"grad_norm": 0.25812971591949463,
"learning_rate": 0.00016111140969350503,
"loss": 1.1061,
"step": 780
},
{
"epoch": 0.12184324967335557,
"grad_norm": 0.2702666223049164,
"learning_rate": 0.0001610088139381217,
"loss": 1.2846,
"step": 781
},
{
"epoch": 0.12199925895590788,
"grad_norm": 0.24256417155265808,
"learning_rate": 0.00016090611579008888,
"loss": 1.081,
"step": 782
},
{
"epoch": 0.1221552682384602,
"grad_norm": 0.3177904784679413,
"learning_rate": 0.00016080331542176753,
"loss": 1.5862,
"step": 783
},
{
"epoch": 0.12231127752101251,
"grad_norm": 0.25483664870262146,
"learning_rate": 0.00016070041300569012,
"loss": 1.1939,
"step": 784
},
{
"epoch": 0.1224672868035648,
"grad_norm": 0.23578673601150513,
"learning_rate": 0.00016059740871456036,
"loss": 1.0371,
"step": 785
},
{
"epoch": 0.12262329608611712,
"grad_norm": 0.28674736618995667,
"learning_rate": 0.000160494302721253,
"loss": 1.4739,
"step": 786
},
{
"epoch": 0.12277930536866943,
"grad_norm": 0.29090616106987,
"learning_rate": 0.0001603910951988135,
"loss": 1.3862,
"step": 787
},
{
"epoch": 0.12293531465122175,
"grad_norm": 0.2792899012565613,
"learning_rate": 0.00016028778632045762,
"loss": 1.3731,
"step": 788
},
{
"epoch": 0.12309132393377406,
"grad_norm": 0.2683924436569214,
"learning_rate": 0.00016018437625957133,
"loss": 1.4514,
"step": 789
},
{
"epoch": 0.12324733321632637,
"grad_norm": 0.331752747297287,
"learning_rate": 0.00016008086518971037,
"loss": 1.0936,
"step": 790
},
{
"epoch": 0.12340334249887869,
"grad_norm": 0.32185712456703186,
"learning_rate": 0.0001599772532846,
"loss": 1.7093,
"step": 791
},
{
"epoch": 0.123559351781431,
"grad_norm": 0.28801560401916504,
"learning_rate": 0.0001598735407181347,
"loss": 1.2923,
"step": 792
},
{
"epoch": 0.1237153610639833,
"grad_norm": 0.2626672387123108,
"learning_rate": 0.00015976972766437795,
"loss": 1.196,
"step": 793
},
{
"epoch": 0.12387137034653561,
"grad_norm": 0.30561795830726624,
"learning_rate": 0.00015966581429756183,
"loss": 1.5151,
"step": 794
},
{
"epoch": 0.12402737962908793,
"grad_norm": 0.2764839828014374,
"learning_rate": 0.00015956180079208682,
"loss": 1.231,
"step": 795
},
{
"epoch": 0.12418338891164024,
"grad_norm": 0.2506803870201111,
"learning_rate": 0.00015945768732252144,
"loss": 1.0394,
"step": 796
},
{
"epoch": 0.12433939819419255,
"grad_norm": 0.28655874729156494,
"learning_rate": 0.00015935347406360192,
"loss": 1.4689,
"step": 797
},
{
"epoch": 0.12449540747674487,
"grad_norm": 0.26048576831817627,
"learning_rate": 0.00015924916119023212,
"loss": 1.218,
"step": 798
},
{
"epoch": 0.12465141675929718,
"grad_norm": 0.26712656021118164,
"learning_rate": 0.00015914474887748295,
"loss": 1.232,
"step": 799
},
{
"epoch": 0.1248074260418495,
"grad_norm": 0.2652023434638977,
"learning_rate": 0.00015904023730059228,
"loss": 1.0205,
"step": 800
},
{
"epoch": 0.12496343532440181,
"grad_norm": 0.3364275097846985,
"learning_rate": 0.0001589356266349645,
"loss": 1.4919,
"step": 801
},
{
"epoch": 0.12511944460695412,
"grad_norm": 0.218467116355896,
"learning_rate": 0.00015883091705617045,
"loss": 0.8939,
"step": 802
},
{
"epoch": 0.12527545388950642,
"grad_norm": 0.2554807960987091,
"learning_rate": 0.00015872610873994685,
"loss": 1.2568,
"step": 803
},
{
"epoch": 0.12543146317205875,
"grad_norm": 0.2742806673049927,
"learning_rate": 0.00015862120186219613,
"loss": 1.0565,
"step": 804
},
{
"epoch": 0.12558747245461105,
"grad_norm": 0.23994481563568115,
"learning_rate": 0.00015851619659898623,
"loss": 0.9631,
"step": 805
},
{
"epoch": 0.12574348173716335,
"grad_norm": 0.29549404978752136,
"learning_rate": 0.00015841109312655016,
"loss": 1.2073,
"step": 806
},
{
"epoch": 0.12589949101971568,
"grad_norm": 0.27470991015434265,
"learning_rate": 0.00015830589162128572,
"loss": 1.2345,
"step": 807
},
{
"epoch": 0.12605550030226798,
"grad_norm": 0.27652519941329956,
"learning_rate": 0.00015820059225975531,
"loss": 1.2456,
"step": 808
},
{
"epoch": 0.1262115095848203,
"grad_norm": 0.2571077346801758,
"learning_rate": 0.0001580951952186856,
"loss": 1.0009,
"step": 809
},
{
"epoch": 0.1263675188673726,
"grad_norm": 0.27721402049064636,
"learning_rate": 0.000157989700674967,
"loss": 1.2101,
"step": 810
},
{
"epoch": 0.12652352814992493,
"grad_norm": 0.29823631048202515,
"learning_rate": 0.00015788410880565379,
"loss": 1.3992,
"step": 811
},
{
"epoch": 0.12667953743247723,
"grad_norm": 0.28366366028785706,
"learning_rate": 0.00015777841978796347,
"loss": 1.005,
"step": 812
},
{
"epoch": 0.12683554671502956,
"grad_norm": 0.3597376048564911,
"learning_rate": 0.0001576726337992766,
"loss": 1.6046,
"step": 813
},
{
"epoch": 0.12699155599758186,
"grad_norm": 0.27407100796699524,
"learning_rate": 0.00015756675101713657,
"loss": 1.0167,
"step": 814
},
{
"epoch": 0.12714756528013416,
"grad_norm": 0.3212680220603943,
"learning_rate": 0.00015746077161924905,
"loss": 1.4425,
"step": 815
},
{
"epoch": 0.12730357456268648,
"grad_norm": 0.25150859355926514,
"learning_rate": 0.00015735469578348208,
"loss": 1.2482,
"step": 816
},
{
"epoch": 0.12745958384523878,
"grad_norm": 0.2753000855445862,
"learning_rate": 0.00015724852368786537,
"loss": 1.3006,
"step": 817
},
{
"epoch": 0.1276155931277911,
"grad_norm": 0.27500027418136597,
"learning_rate": 0.0001571422555105903,
"loss": 1.2095,
"step": 818
},
{
"epoch": 0.1277716024103434,
"grad_norm": 0.2696485221385956,
"learning_rate": 0.0001570358914300094,
"loss": 1.1708,
"step": 819
},
{
"epoch": 0.12792761169289574,
"grad_norm": 0.2486962080001831,
"learning_rate": 0.00015692943162463628,
"loss": 1.0531,
"step": 820
},
{
"epoch": 0.12808362097544804,
"grad_norm": 0.265824556350708,
"learning_rate": 0.00015682287627314515,
"loss": 1.0712,
"step": 821
},
{
"epoch": 0.12823963025800036,
"grad_norm": 0.2963060140609741,
"learning_rate": 0.00015671622555437053,
"loss": 1.3806,
"step": 822
},
{
"epoch": 0.12839563954055266,
"grad_norm": 0.2849713861942291,
"learning_rate": 0.00015660947964730708,
"loss": 1.2242,
"step": 823
},
{
"epoch": 0.12855164882310496,
"grad_norm": 0.25108298659324646,
"learning_rate": 0.0001565026387311092,
"loss": 1.1128,
"step": 824
},
{
"epoch": 0.1287076581056573,
"grad_norm": 0.27622735500335693,
"learning_rate": 0.00015639570298509064,
"loss": 1.3599,
"step": 825
},
{
"epoch": 0.1288636673882096,
"grad_norm": 0.29195183515548706,
"learning_rate": 0.0001562886725887245,
"loss": 1.2931,
"step": 826
},
{
"epoch": 0.12901967667076192,
"grad_norm": 0.2943118214607239,
"learning_rate": 0.00015618154772164256,
"loss": 1.5802,
"step": 827
},
{
"epoch": 0.12917568595331422,
"grad_norm": 0.26325714588165283,
"learning_rate": 0.00015607432856363525,
"loss": 1.2455,
"step": 828
},
{
"epoch": 0.12933169523586655,
"grad_norm": 0.286743700504303,
"learning_rate": 0.00015596701529465117,
"loss": 1.3008,
"step": 829
},
{
"epoch": 0.12948770451841884,
"grad_norm": 0.2844702899456024,
"learning_rate": 0.00015585960809479696,
"loss": 1.3737,
"step": 830
},
{
"epoch": 0.12964371380097114,
"grad_norm": 0.25531789660453796,
"learning_rate": 0.00015575210714433686,
"loss": 1.1425,
"step": 831
},
{
"epoch": 0.12979972308352347,
"grad_norm": 0.26921185851097107,
"learning_rate": 0.00015564451262369247,
"loss": 1.106,
"step": 832
},
{
"epoch": 0.12995573236607577,
"grad_norm": 0.28271836042404175,
"learning_rate": 0.00015553682471344238,
"loss": 1.3681,
"step": 833
},
{
"epoch": 0.1301117416486281,
"grad_norm": 0.26876282691955566,
"learning_rate": 0.00015542904359432198,
"loss": 1.112,
"step": 834
},
{
"epoch": 0.1302677509311804,
"grad_norm": 0.2895980179309845,
"learning_rate": 0.00015532116944722308,
"loss": 1.1285,
"step": 835
},
{
"epoch": 0.13042376021373273,
"grad_norm": 0.2612462639808655,
"learning_rate": 0.00015521320245319363,
"loss": 1.2669,
"step": 836
},
{
"epoch": 0.13057976949628503,
"grad_norm": 0.30689284205436707,
"learning_rate": 0.00015510514279343734,
"loss": 1.3512,
"step": 837
},
{
"epoch": 0.13073577877883735,
"grad_norm": 0.2981073558330536,
"learning_rate": 0.00015499699064931355,
"loss": 1.1284,
"step": 838
},
{
"epoch": 0.13089178806138965,
"grad_norm": 0.2637684643268585,
"learning_rate": 0.00015488874620233674,
"loss": 1.0698,
"step": 839
},
{
"epoch": 0.13104779734394195,
"grad_norm": 0.3048469126224518,
"learning_rate": 0.0001547804096341763,
"loss": 1.5209,
"step": 840
},
{
"epoch": 0.13120380662649428,
"grad_norm": 0.2396387904882431,
"learning_rate": 0.00015467198112665632,
"loss": 0.9584,
"step": 841
},
{
"epoch": 0.13135981590904658,
"grad_norm": 0.27103736996650696,
"learning_rate": 0.0001545634608617551,
"loss": 1.2846,
"step": 842
},
{
"epoch": 0.1315158251915989,
"grad_norm": 0.2971721589565277,
"learning_rate": 0.00015445484902160491,
"loss": 1.6074,
"step": 843
},
{
"epoch": 0.1316718344741512,
"grad_norm": 0.2440243512392044,
"learning_rate": 0.00015434614578849188,
"loss": 1.045,
"step": 844
},
{
"epoch": 0.13182784375670353,
"grad_norm": 0.30210787057876587,
"learning_rate": 0.00015423735134485536,
"loss": 1.2948,
"step": 845
},
{
"epoch": 0.13198385303925583,
"grad_norm": 0.25344711542129517,
"learning_rate": 0.00015412846587328782,
"loss": 1.2089,
"step": 846
},
{
"epoch": 0.13213986232180816,
"grad_norm": 0.2884974479675293,
"learning_rate": 0.0001540194895565346,
"loss": 1.1123,
"step": 847
},
{
"epoch": 0.13229587160436046,
"grad_norm": 0.28012582659721375,
"learning_rate": 0.00015391042257749336,
"loss": 1.2269,
"step": 848
},
{
"epoch": 0.13245188088691276,
"grad_norm": 0.26394879817962646,
"learning_rate": 0.00015380126511921403,
"loss": 1.4469,
"step": 849
},
{
"epoch": 0.1326078901694651,
"grad_norm": 0.2717582583427429,
"learning_rate": 0.0001536920173648984,
"loss": 1.1494,
"step": 850
},
{
"epoch": 0.1327638994520174,
"grad_norm": 0.2968549132347107,
"learning_rate": 0.00015358267949789966,
"loss": 1.1903,
"step": 851
},
{
"epoch": 0.13291990873456971,
"grad_norm": 0.2570381164550781,
"learning_rate": 0.00015347325170172245,
"loss": 1.1035,
"step": 852
},
{
"epoch": 0.133075918017122,
"grad_norm": 0.3070929944515228,
"learning_rate": 0.0001533637341600221,
"loss": 1.4062,
"step": 853
},
{
"epoch": 0.13323192729967434,
"grad_norm": 0.2886407971382141,
"learning_rate": 0.0001532541270566049,
"loss": 1.3491,
"step": 854
},
{
"epoch": 0.13338793658222664,
"grad_norm": 0.2572009861469269,
"learning_rate": 0.00015314443057542703,
"loss": 1.2643,
"step": 855
},
{
"epoch": 0.13354394586477897,
"grad_norm": 0.2768828272819519,
"learning_rate": 0.00015303464490059506,
"loss": 1.1444,
"step": 856
},
{
"epoch": 0.13369995514733127,
"grad_norm": 0.3006720542907715,
"learning_rate": 0.00015292477021636497,
"loss": 1.2172,
"step": 857
},
{
"epoch": 0.13385596442988357,
"grad_norm": 0.24407751858234406,
"learning_rate": 0.0001528148067071423,
"loss": 0.9457,
"step": 858
},
{
"epoch": 0.1340119737124359,
"grad_norm": 0.25638723373413086,
"learning_rate": 0.00015270475455748166,
"loss": 1.1478,
"step": 859
},
{
"epoch": 0.1341679829949882,
"grad_norm": 0.24834637343883514,
"learning_rate": 0.00015259461395208628,
"loss": 0.9835,
"step": 860
},
{
"epoch": 0.13432399227754052,
"grad_norm": 0.2611735463142395,
"learning_rate": 0.00015248438507580806,
"loss": 1.125,
"step": 861
},
{
"epoch": 0.13448000156009282,
"grad_norm": 0.3239066004753113,
"learning_rate": 0.00015237406811364682,
"loss": 1.1973,
"step": 862
},
{
"epoch": 0.13463601084264515,
"grad_norm": 0.2662723958492279,
"learning_rate": 0.0001522636632507504,
"loss": 1.1115,
"step": 863
},
{
"epoch": 0.13479202012519745,
"grad_norm": 0.26053330302238464,
"learning_rate": 0.00015215317067241414,
"loss": 1.0885,
"step": 864
},
{
"epoch": 0.13494802940774975,
"grad_norm": 0.337984561920166,
"learning_rate": 0.00015204259056408046,
"loss": 0.8782,
"step": 865
},
{
"epoch": 0.13510403869030208,
"grad_norm": 0.2965889871120453,
"learning_rate": 0.00015193192311133884,
"loss": 1.3198,
"step": 866
},
{
"epoch": 0.13526004797285437,
"grad_norm": 0.3056474030017853,
"learning_rate": 0.00015182116849992526,
"loss": 1.5133,
"step": 867
},
{
"epoch": 0.1354160572554067,
"grad_norm": 0.29193446040153503,
"learning_rate": 0.00015171032691572206,
"loss": 1.2365,
"step": 868
},
{
"epoch": 0.135572066537959,
"grad_norm": 0.28123265504837036,
"learning_rate": 0.00015159939854475743,
"loss": 1.1654,
"step": 869
},
{
"epoch": 0.13572807582051133,
"grad_norm": 0.3033466041088104,
"learning_rate": 0.00015148838357320537,
"loss": 1.5473,
"step": 870
},
{
"epoch": 0.13588408510306363,
"grad_norm": 0.26069045066833496,
"learning_rate": 0.00015137728218738502,
"loss": 1.2213,
"step": 871
},
{
"epoch": 0.13604009438561596,
"grad_norm": 0.3010377883911133,
"learning_rate": 0.0001512660945737608,
"loss": 1.1906,
"step": 872
},
{
"epoch": 0.13619610366816826,
"grad_norm": 0.2615121304988861,
"learning_rate": 0.00015115482091894165,
"loss": 1.0807,
"step": 873
},
{
"epoch": 0.13635211295072056,
"grad_norm": 0.27064162492752075,
"learning_rate": 0.00015104346140968095,
"loss": 1.3376,
"step": 874
},
{
"epoch": 0.13650812223327288,
"grad_norm": 0.26106327772140503,
"learning_rate": 0.00015093201623287631,
"loss": 1.2357,
"step": 875
},
{
"epoch": 0.13666413151582518,
"grad_norm": 0.26505109667778015,
"learning_rate": 0.00015082048557556893,
"loss": 1.4311,
"step": 876
},
{
"epoch": 0.1368201407983775,
"grad_norm": 0.2965877950191498,
"learning_rate": 0.00015070886962494358,
"loss": 1.3246,
"step": 877
},
{
"epoch": 0.1369761500809298,
"grad_norm": 0.3173799216747284,
"learning_rate": 0.0001505971685683282,
"loss": 1.4795,
"step": 878
},
{
"epoch": 0.13713215936348214,
"grad_norm": 0.2562354505062103,
"learning_rate": 0.00015048538259319346,
"loss": 1.0112,
"step": 879
},
{
"epoch": 0.13728816864603444,
"grad_norm": 0.2736887037754059,
"learning_rate": 0.00015037351188715265,
"loss": 1.3539,
"step": 880
},
{
"epoch": 0.13744417792858676,
"grad_norm": 0.30376073718070984,
"learning_rate": 0.00015026155663796123,
"loss": 1.2837,
"step": 881
},
{
"epoch": 0.13760018721113906,
"grad_norm": 0.3052879869937897,
"learning_rate": 0.00015014951703351653,
"loss": 1.3994,
"step": 882
},
{
"epoch": 0.13775619649369136,
"grad_norm": 0.25414812564849854,
"learning_rate": 0.00015003739326185751,
"loss": 0.9258,
"step": 883
},
{
"epoch": 0.1379122057762437,
"grad_norm": 0.33165043592453003,
"learning_rate": 0.00014992518551116434,
"loss": 1.4427,
"step": 884
},
{
"epoch": 0.138068215058796,
"grad_norm": 0.2764113247394562,
"learning_rate": 0.00014981289396975817,
"loss": 1.3084,
"step": 885
},
{
"epoch": 0.13822422434134832,
"grad_norm": 0.3221314251422882,
"learning_rate": 0.0001497005188261007,
"loss": 1.0262,
"step": 886
},
{
"epoch": 0.13838023362390062,
"grad_norm": 0.24285611510276794,
"learning_rate": 0.0001495880602687941,
"loss": 1.1275,
"step": 887
},
{
"epoch": 0.13853624290645294,
"grad_norm": 0.27305787801742554,
"learning_rate": 0.00014947551848658034,
"loss": 1.3409,
"step": 888
},
{
"epoch": 0.13869225218900524,
"grad_norm": 0.29822468757629395,
"learning_rate": 0.00014936289366834123,
"loss": 1.3696,
"step": 889
},
{
"epoch": 0.13884826147155757,
"grad_norm": 0.259112685918808,
"learning_rate": 0.00014925018600309785,
"loss": 1.2456,
"step": 890
},
{
"epoch": 0.13900427075410987,
"grad_norm": 0.28749990463256836,
"learning_rate": 0.00014913739568001033,
"loss": 1.2809,
"step": 891
},
{
"epoch": 0.13916028003666217,
"grad_norm": 0.24120725691318512,
"learning_rate": 0.0001490245228883776,
"loss": 1.1092,
"step": 892
},
{
"epoch": 0.1393162893192145,
"grad_norm": 0.2791595160961151,
"learning_rate": 0.0001489115678176369,
"loss": 1.024,
"step": 893
},
{
"epoch": 0.1394722986017668,
"grad_norm": 0.260062038898468,
"learning_rate": 0.00014879853065736365,
"loss": 1.1766,
"step": 894
},
{
"epoch": 0.13962830788431912,
"grad_norm": 0.2642684280872345,
"learning_rate": 0.00014868541159727096,
"loss": 1.3869,
"step": 895
},
{
"epoch": 0.13978431716687142,
"grad_norm": 0.2463667243719101,
"learning_rate": 0.00014857221082720948,
"loss": 1.0662,
"step": 896
},
{
"epoch": 0.13994032644942375,
"grad_norm": 0.2916738986968994,
"learning_rate": 0.0001484589285371669,
"loss": 1.3209,
"step": 897
},
{
"epoch": 0.14009633573197605,
"grad_norm": 0.27236512303352356,
"learning_rate": 0.0001483455649172678,
"loss": 1.1833,
"step": 898
},
{
"epoch": 0.14025234501452835,
"grad_norm": 0.2619946002960205,
"learning_rate": 0.0001482321201577733,
"loss": 1.3137,
"step": 899
},
{
"epoch": 0.14040835429708068,
"grad_norm": 0.31396883726119995,
"learning_rate": 0.00014811859444908052,
"loss": 1.3727,
"step": 900
},
{
"epoch": 0.14056436357963298,
"grad_norm": 0.25572189688682556,
"learning_rate": 0.0001480049879817226,
"loss": 1.1046,
"step": 901
},
{
"epoch": 0.1407203728621853,
"grad_norm": 0.2937905490398407,
"learning_rate": 0.0001478913009463682,
"loss": 1.3542,
"step": 902
},
{
"epoch": 0.1408763821447376,
"grad_norm": 0.253520131111145,
"learning_rate": 0.00014777753353382119,
"loss": 1.2329,
"step": 903
},
{
"epoch": 0.14103239142728993,
"grad_norm": 0.32491999864578247,
"learning_rate": 0.00014766368593502026,
"loss": 1.3285,
"step": 904
},
{
"epoch": 0.14118840070984223,
"grad_norm": 0.2527139484882355,
"learning_rate": 0.00014754975834103877,
"loss": 1.1277,
"step": 905
},
{
"epoch": 0.14134440999239456,
"grad_norm": 0.275272399187088,
"learning_rate": 0.00014743575094308431,
"loss": 1.4177,
"step": 906
},
{
"epoch": 0.14150041927494686,
"grad_norm": 0.26013612747192383,
"learning_rate": 0.0001473216639324984,
"loss": 1.2476,
"step": 907
},
{
"epoch": 0.14165642855749916,
"grad_norm": 0.28431418538093567,
"learning_rate": 0.0001472074975007562,
"loss": 1.3947,
"step": 908
},
{
"epoch": 0.1418124378400515,
"grad_norm": 0.2629927396774292,
"learning_rate": 0.0001470932518394661,
"loss": 1.1587,
"step": 909
},
{
"epoch": 0.14196844712260379,
"grad_norm": 0.2944284975528717,
"learning_rate": 0.00014697892714036958,
"loss": 1.342,
"step": 910
},
{
"epoch": 0.1421244564051561,
"grad_norm": 0.31365662813186646,
"learning_rate": 0.00014686452359534066,
"loss": 1.4326,
"step": 911
},
{
"epoch": 0.1422804656877084,
"grad_norm": 0.255875825881958,
"learning_rate": 0.0001467500413963857,
"loss": 1.2305,
"step": 912
},
{
"epoch": 0.14243647497026074,
"grad_norm": 0.2717350423336029,
"learning_rate": 0.00014663548073564316,
"loss": 1.1965,
"step": 913
},
{
"epoch": 0.14259248425281304,
"grad_norm": 0.28059136867523193,
"learning_rate": 0.00014652084180538302,
"loss": 1.3361,
"step": 914
},
{
"epoch": 0.14274849353536537,
"grad_norm": 0.2790951430797577,
"learning_rate": 0.00014640612479800686,
"loss": 1.2785,
"step": 915
},
{
"epoch": 0.14290450281791767,
"grad_norm": 0.24599488079547882,
"learning_rate": 0.00014629132990604706,
"loss": 1.2433,
"step": 916
},
{
"epoch": 0.14306051210046997,
"grad_norm": 0.288792222738266,
"learning_rate": 0.00014617645732216685,
"loss": 1.1779,
"step": 917
},
{
"epoch": 0.1432165213830223,
"grad_norm": 0.3035881221294403,
"learning_rate": 0.00014606150723915984,
"loss": 1.3885,
"step": 918
},
{
"epoch": 0.1433725306655746,
"grad_norm": 0.28884077072143555,
"learning_rate": 0.00014594647984994964,
"loss": 1.3079,
"step": 919
},
{
"epoch": 0.14352853994812692,
"grad_norm": 0.26054033637046814,
"learning_rate": 0.00014583137534758967,
"loss": 1.1897,
"step": 920
},
{
"epoch": 0.14368454923067922,
"grad_norm": 0.31249237060546875,
"learning_rate": 0.00014571619392526278,
"loss": 1.4518,
"step": 921
},
{
"epoch": 0.14384055851323155,
"grad_norm": 0.27947118878364563,
"learning_rate": 0.0001456009357762809,
"loss": 1.2305,
"step": 922
},
{
"epoch": 0.14399656779578385,
"grad_norm": 0.2928619980812073,
"learning_rate": 0.00014548560109408466,
"loss": 1.3645,
"step": 923
},
{
"epoch": 0.14415257707833617,
"grad_norm": 0.2735868990421295,
"learning_rate": 0.00014537019007224324,
"loss": 1.4351,
"step": 924
},
{
"epoch": 0.14430858636088847,
"grad_norm": 0.30757883191108704,
"learning_rate": 0.00014525470290445392,
"loss": 1.4073,
"step": 925
},
{
"epoch": 0.14446459564344077,
"grad_norm": 0.28719013929367065,
"learning_rate": 0.00014513913978454168,
"loss": 1.2918,
"step": 926
},
{
"epoch": 0.1446206049259931,
"grad_norm": 0.2720332145690918,
"learning_rate": 0.00014502350090645917,
"loss": 1.2763,
"step": 927
},
{
"epoch": 0.1447766142085454,
"grad_norm": 0.24720966815948486,
"learning_rate": 0.000144907786464286,
"loss": 1.0549,
"step": 928
},
{
"epoch": 0.14493262349109773,
"grad_norm": 0.3164946138858795,
"learning_rate": 0.0001447919966522287,
"loss": 1.1007,
"step": 929
},
{
"epoch": 0.14508863277365003,
"grad_norm": 0.2940044105052948,
"learning_rate": 0.00014467613166462023,
"loss": 1.2818,
"step": 930
},
{
"epoch": 0.14524464205620236,
"grad_norm": 0.34050655364990234,
"learning_rate": 0.00014456019169591978,
"loss": 1.2618,
"step": 931
},
{
"epoch": 0.14540065133875466,
"grad_norm": 0.24612417817115784,
"learning_rate": 0.0001444441769407124,
"loss": 0.991,
"step": 932
},
{
"epoch": 0.14555666062130695,
"grad_norm": 0.2636529505252838,
"learning_rate": 0.00014432808759370854,
"loss": 1.4259,
"step": 933
},
{
"epoch": 0.14571266990385928,
"grad_norm": 0.2628234624862671,
"learning_rate": 0.00014421192384974396,
"loss": 1.2545,
"step": 934
},
{
"epoch": 0.14586867918641158,
"grad_norm": 0.2733708918094635,
"learning_rate": 0.00014409568590377918,
"loss": 1.1442,
"step": 935
},
{
"epoch": 0.1460246884689639,
"grad_norm": 0.24912774562835693,
"learning_rate": 0.0001439793739508994,
"loss": 1.039,
"step": 936
},
{
"epoch": 0.1461806977515162,
"grad_norm": 0.2927952706813812,
"learning_rate": 0.00014386298818631386,
"loss": 1.179,
"step": 937
},
{
"epoch": 0.14633670703406854,
"grad_norm": 0.29066377878189087,
"learning_rate": 0.0001437465288053558,
"loss": 1.2024,
"step": 938
},
{
"epoch": 0.14649271631662084,
"grad_norm": 0.2862846553325653,
"learning_rate": 0.00014362999600348196,
"loss": 1.1401,
"step": 939
},
{
"epoch": 0.14664872559917316,
"grad_norm": 0.3009769022464752,
"learning_rate": 0.00014351338997627234,
"loss": 1.3966,
"step": 940
},
{
"epoch": 0.14680473488172546,
"grad_norm": 0.31753668189048767,
"learning_rate": 0.00014339671091942978,
"loss": 1.4626,
"step": 941
},
{
"epoch": 0.14696074416427776,
"grad_norm": 0.28623080253601074,
"learning_rate": 0.0001432799590287797,
"loss": 1.2841,
"step": 942
},
{
"epoch": 0.1471167534468301,
"grad_norm": 0.3344881534576416,
"learning_rate": 0.00014316313450026986,
"loss": 1.5589,
"step": 943
},
{
"epoch": 0.1472727627293824,
"grad_norm": 0.3132301867008209,
"learning_rate": 0.00014304623752996973,
"loss": 1.4286,
"step": 944
},
{
"epoch": 0.14742877201193472,
"grad_norm": 0.299078106880188,
"learning_rate": 0.00014292926831407061,
"loss": 1.2099,
"step": 945
},
{
"epoch": 0.14758478129448702,
"grad_norm": 0.27058905363082886,
"learning_rate": 0.0001428122270488848,
"loss": 1.2331,
"step": 946
},
{
"epoch": 0.14774079057703934,
"grad_norm": 0.3202461004257202,
"learning_rate": 0.00014269511393084572,
"loss": 1.0677,
"step": 947
},
{
"epoch": 0.14789679985959164,
"grad_norm": 0.3005964756011963,
"learning_rate": 0.00014257792915650728,
"loss": 1.3382,
"step": 948
},
{
"epoch": 0.14805280914214397,
"grad_norm": 0.28587067127227783,
"learning_rate": 0.00014246067292254366,
"loss": 1.2216,
"step": 949
},
{
"epoch": 0.14820881842469627,
"grad_norm": 0.27515730261802673,
"learning_rate": 0.00014234334542574906,
"loss": 1.1608,
"step": 950
},
{
"epoch": 0.14836482770724857,
"grad_norm": 0.26588740944862366,
"learning_rate": 0.00014222594686303706,
"loss": 1.1547,
"step": 951
},
{
"epoch": 0.1485208369898009,
"grad_norm": 0.3122014105319977,
"learning_rate": 0.00014210847743144087,
"loss": 1.3642,
"step": 952
},
{
"epoch": 0.1486768462723532,
"grad_norm": 0.34852224588394165,
"learning_rate": 0.00014199093732811225,
"loss": 1.4751,
"step": 953
},
{
"epoch": 0.14883285555490552,
"grad_norm": 0.2674144208431244,
"learning_rate": 0.00014187332675032188,
"loss": 1.2941,
"step": 954
},
{
"epoch": 0.14898886483745782,
"grad_norm": 0.30863744020462036,
"learning_rate": 0.00014175564589545854,
"loss": 1.298,
"step": 955
},
{
"epoch": 0.14914487412001015,
"grad_norm": 0.26412221789360046,
"learning_rate": 0.00014163789496102902,
"loss": 1.218,
"step": 956
},
{
"epoch": 0.14930088340256245,
"grad_norm": 0.2920873761177063,
"learning_rate": 0.0001415200741446577,
"loss": 1.5198,
"step": 957
},
{
"epoch": 0.14945689268511475,
"grad_norm": 0.29869547486305237,
"learning_rate": 0.00014140218364408632,
"loss": 1.3896,
"step": 958
},
{
"epoch": 0.14961290196766708,
"grad_norm": 0.2696417570114136,
"learning_rate": 0.00014128422365717347,
"loss": 1.2046,
"step": 959
},
{
"epoch": 0.14976891125021938,
"grad_norm": 0.27298402786254883,
"learning_rate": 0.0001411661943818944,
"loss": 1.3599,
"step": 960
},
{
"epoch": 0.1499249205327717,
"grad_norm": 0.27962544560432434,
"learning_rate": 0.0001410480960163407,
"loss": 1.25,
"step": 961
},
{
"epoch": 0.150080929815324,
"grad_norm": 0.2612510323524475,
"learning_rate": 0.00014092992875871979,
"loss": 1.1053,
"step": 962
},
{
"epoch": 0.15023693909787633,
"grad_norm": 0.27618667483329773,
"learning_rate": 0.00014081169280735488,
"loss": 1.3871,
"step": 963
},
{
"epoch": 0.15039294838042863,
"grad_norm": 0.24976608157157898,
"learning_rate": 0.00014069338836068433,
"loss": 1.2613,
"step": 964
},
{
"epoch": 0.15054895766298096,
"grad_norm": 0.267610102891922,
"learning_rate": 0.00014057501561726157,
"loss": 1.0631,
"step": 965
},
{
"epoch": 0.15070496694553326,
"grad_norm": 0.29677531123161316,
"learning_rate": 0.00014045657477575448,
"loss": 1.3567,
"step": 966
},
{
"epoch": 0.15086097622808556,
"grad_norm": 0.29539185762405396,
"learning_rate": 0.0001403380660349455,
"loss": 1.1386,
"step": 967
},
{
"epoch": 0.15101698551063789,
"grad_norm": 0.2691122889518738,
"learning_rate": 0.00014021948959373076,
"loss": 1.1089,
"step": 968
},
{
"epoch": 0.15117299479319019,
"grad_norm": 0.24394790828227997,
"learning_rate": 0.0001401008456511202,
"loss": 1.1893,
"step": 969
},
{
"epoch": 0.1513290040757425,
"grad_norm": 0.2849481403827667,
"learning_rate": 0.0001399821344062369,
"loss": 1.4775,
"step": 970
},
{
"epoch": 0.1514850133582948,
"grad_norm": 0.2634568512439728,
"learning_rate": 0.00013986335605831705,
"loss": 1.1655,
"step": 971
},
{
"epoch": 0.15164102264084714,
"grad_norm": 0.269879013299942,
"learning_rate": 0.00013974451080670934,
"loss": 1.2047,
"step": 972
},
{
"epoch": 0.15179703192339944,
"grad_norm": 0.27636033296585083,
"learning_rate": 0.0001396255988508748,
"loss": 1.2987,
"step": 973
},
{
"epoch": 0.15195304120595177,
"grad_norm": 0.2572225332260132,
"learning_rate": 0.00013950662039038643,
"loss": 1.3322,
"step": 974
},
{
"epoch": 0.15210905048850407,
"grad_norm": 0.2573801279067993,
"learning_rate": 0.00013938757562492873,
"loss": 1.2547,
"step": 975
},
{
"epoch": 0.15226505977105637,
"grad_norm": 0.3160158395767212,
"learning_rate": 0.00013926846475429766,
"loss": 1.5537,
"step": 976
},
{
"epoch": 0.1524210690536087,
"grad_norm": 0.30125337839126587,
"learning_rate": 0.00013914928797839995,
"loss": 1.0853,
"step": 977
},
{
"epoch": 0.152577078336161,
"grad_norm": 0.25772640109062195,
"learning_rate": 0.0001390300454972531,
"loss": 1.198,
"step": 978
},
{
"epoch": 0.15273308761871332,
"grad_norm": 0.257586270570755,
"learning_rate": 0.0001389107375109848,
"loss": 1.086,
"step": 979
},
{
"epoch": 0.15288909690126562,
"grad_norm": 0.2763863205909729,
"learning_rate": 0.00013879136421983266,
"loss": 1.2639,
"step": 980
},
{
"epoch": 0.15304510618381795,
"grad_norm": 0.2751125991344452,
"learning_rate": 0.00013867192582414393,
"loss": 1.2473,
"step": 981
},
{
"epoch": 0.15320111546637025,
"grad_norm": 0.3138543367385864,
"learning_rate": 0.0001385524225243751,
"loss": 1.3107,
"step": 982
},
{
"epoch": 0.15335712474892257,
"grad_norm": 0.27820733189582825,
"learning_rate": 0.00013843285452109166,
"loss": 1.048,
"step": 983
},
{
"epoch": 0.15351313403147487,
"grad_norm": 0.25756746530532837,
"learning_rate": 0.00013831322201496757,
"loss": 1.0374,
"step": 984
},
{
"epoch": 0.15366914331402717,
"grad_norm": 0.332603394985199,
"learning_rate": 0.0001381935252067852,
"loss": 1.3359,
"step": 985
},
{
"epoch": 0.1538251525965795,
"grad_norm": 0.33936744928359985,
"learning_rate": 0.00013807376429743467,
"loss": 1.5814,
"step": 986
},
{
"epoch": 0.1539811618791318,
"grad_norm": 0.2748062014579773,
"learning_rate": 0.00013795393948791383,
"loss": 1.201,
"step": 987
},
{
"epoch": 0.15413717116168413,
"grad_norm": 0.26038771867752075,
"learning_rate": 0.0001378340509793277,
"loss": 1.2087,
"step": 988
},
{
"epoch": 0.15429318044423643,
"grad_norm": 0.24746748805046082,
"learning_rate": 0.00013771409897288822,
"loss": 1.0487,
"step": 989
},
{
"epoch": 0.15444918972678875,
"grad_norm": 0.270280122756958,
"learning_rate": 0.0001375940836699139,
"loss": 1.1529,
"step": 990
},
{
"epoch": 0.15460519900934105,
"grad_norm": 0.28278234601020813,
"learning_rate": 0.00013747400527182953,
"loss": 1.4292,
"step": 991
},
{
"epoch": 0.15476120829189335,
"grad_norm": 0.3091171681880951,
"learning_rate": 0.0001373538639801657,
"loss": 1.2118,
"step": 992
},
{
"epoch": 0.15491721757444568,
"grad_norm": 0.264275461435318,
"learning_rate": 0.0001372336599965586,
"loss": 1.2727,
"step": 993
},
{
"epoch": 0.15507322685699798,
"grad_norm": 0.3125738799571991,
"learning_rate": 0.00013711339352274966,
"loss": 1.3389,
"step": 994
},
{
"epoch": 0.1552292361395503,
"grad_norm": 0.2750801146030426,
"learning_rate": 0.0001369930647605852,
"loss": 1.1031,
"step": 995
},
{
"epoch": 0.1553852454221026,
"grad_norm": 0.274777889251709,
"learning_rate": 0.00013687267391201605,
"loss": 1.4329,
"step": 996
},
{
"epoch": 0.15554125470465494,
"grad_norm": 0.28475117683410645,
"learning_rate": 0.00013675222117909717,
"loss": 1.1914,
"step": 997
},
{
"epoch": 0.15569726398720724,
"grad_norm": 0.27364879846572876,
"learning_rate": 0.00013663170676398752,
"loss": 1.1511,
"step": 998
},
{
"epoch": 0.15585327326975956,
"grad_norm": 0.310995489358902,
"learning_rate": 0.00013651113086894952,
"loss": 1.0349,
"step": 999
},
{
"epoch": 0.15600928255231186,
"grad_norm": 0.2910314202308655,
"learning_rate": 0.00013639049369634876,
"loss": 1.3302,
"step": 1000
},
{
"epoch": 0.15600928255231186,
"eval_loss": 1.2771576642990112,
"eval_runtime": 110.8263,
"eval_samples_per_second": 38.556,
"eval_steps_per_second": 4.827,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 2500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.144559113202074e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}