{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.99000999000999,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01998001998001998,
"grad_norm": 1.9170171022415161,
"learning_rate": 0.0001999980260856137,
"loss": 1.4846,
"step": 10
},
{
"epoch": 0.03996003996003996,
"grad_norm": 0.5531741976737976,
"learning_rate": 0.00019999210442038162,
"loss": 1.0709,
"step": 20
},
{
"epoch": 0.059940059940059943,
"grad_norm": 0.39242061972618103,
"learning_rate": 0.0001999822352380809,
"loss": 0.9892,
"step": 30
},
{
"epoch": 0.07992007992007992,
"grad_norm": 0.46874135732650757,
"learning_rate": 0.00019996841892833,
"loss": 0.9705,
"step": 40
},
{
"epoch": 0.0999000999000999,
"grad_norm": 0.4039924740791321,
"learning_rate": 0.00019995065603657316,
"loss": 0.953,
"step": 50
},
{
"epoch": 0.11988011988011989,
"grad_norm": 0.37654027342796326,
"learning_rate": 0.00019992894726405893,
"loss": 0.9138,
"step": 60
},
{
"epoch": 0.13986013986013987,
"grad_norm": 0.41351592540740967,
"learning_rate": 0.0001999032934678125,
"loss": 0.9121,
"step": 70
},
{
"epoch": 0.15984015984015984,
"grad_norm": 0.42355260252952576,
"learning_rate": 0.00019987369566060176,
"loss": 0.8971,
"step": 80
},
{
"epoch": 0.1798201798201798,
"grad_norm": 0.40265560150146484,
"learning_rate": 0.00019984015501089752,
"loss": 0.892,
"step": 90
},
{
"epoch": 0.1998001998001998,
"grad_norm": 0.36668843030929565,
"learning_rate": 0.00019980267284282717,
"loss": 0.8907,
"step": 100
},
{
"epoch": 0.21978021978021978,
"grad_norm": 0.3516446352005005,
"learning_rate": 0.00019976125063612252,
"loss": 0.888,
"step": 110
},
{
"epoch": 0.23976023976023977,
"grad_norm": 0.3761754631996155,
"learning_rate": 0.0001997158900260614,
"loss": 0.8883,
"step": 120
},
{
"epoch": 0.2597402597402597,
"grad_norm": 0.3486793041229248,
"learning_rate": 0.00019966659280340297,
"loss": 0.8709,
"step": 130
},
{
"epoch": 0.27972027972027974,
"grad_norm": 0.39413630962371826,
"learning_rate": 0.00019961336091431727,
"loss": 0.8544,
"step": 140
},
{
"epoch": 0.2997002997002997,
"grad_norm": 0.3653990924358368,
"learning_rate": 0.00019955619646030802,
"loss": 0.8647,
"step": 150
},
{
"epoch": 0.3196803196803197,
"grad_norm": 0.4523209035396576,
"learning_rate": 0.00019949510169813003,
"loss": 0.8698,
"step": 160
},
{
"epoch": 0.33966033966033965,
"grad_norm": 0.3841874897480011,
"learning_rate": 0.0001994300790396999,
"loss": 0.8513,
"step": 170
},
{
"epoch": 0.3596403596403596,
"grad_norm": 0.3849908709526062,
"learning_rate": 0.00019936113105200085,
"loss": 0.8553,
"step": 180
},
{
"epoch": 0.37962037962037964,
"grad_norm": 0.3563358783721924,
"learning_rate": 0.00019928826045698136,
"loss": 0.8615,
"step": 190
},
{
"epoch": 0.3996003996003996,
"grad_norm": 0.3968392610549927,
"learning_rate": 0.0001992114701314478,
"loss": 0.8502,
"step": 200
},
{
"epoch": 0.4195804195804196,
"grad_norm": 0.366230845451355,
"learning_rate": 0.00019913076310695068,
"loss": 0.8368,
"step": 210
},
{
"epoch": 0.43956043956043955,
"grad_norm": 0.38813525438308716,
"learning_rate": 0.00019904614256966512,
"loss": 0.862,
"step": 220
},
{
"epoch": 0.4595404595404595,
"grad_norm": 0.35268592834472656,
"learning_rate": 0.0001989576118602651,
"loss": 0.8468,
"step": 230
},
{
"epoch": 0.47952047952047955,
"grad_norm": 0.342580109834671,
"learning_rate": 0.0001988651744737914,
"loss": 0.8575,
"step": 240
},
{
"epoch": 0.4995004995004995,
"grad_norm": 0.37153083086013794,
"learning_rate": 0.00019876883405951377,
"loss": 0.8374,
"step": 250
},
{
"epoch": 0.5194805194805194,
"grad_norm": 0.3486216366291046,
"learning_rate": 0.0001986685944207868,
"loss": 0.8333,
"step": 260
},
{
"epoch": 0.5394605394605395,
"grad_norm": 0.3562557101249695,
"learning_rate": 0.00019856445951489982,
"loss": 0.8238,
"step": 270
},
{
"epoch": 0.5594405594405595,
"grad_norm": 0.3600502610206604,
"learning_rate": 0.00019845643345292054,
"loss": 0.8331,
"step": 280
},
{
"epoch": 0.5794205794205795,
"grad_norm": 0.3475654423236847,
"learning_rate": 0.00019834452049953297,
"loss": 0.8093,
"step": 290
},
{
"epoch": 0.5994005994005994,
"grad_norm": 0.358980655670166,
"learning_rate": 0.0001982287250728689,
"loss": 0.8302,
"step": 300
},
{
"epoch": 0.6193806193806194,
"grad_norm": 0.3721815347671509,
"learning_rate": 0.0001981090517443334,
"loss": 0.8175,
"step": 310
},
{
"epoch": 0.6393606393606394,
"grad_norm": 0.35128098726272583,
"learning_rate": 0.0001979855052384247,
"loss": 0.8193,
"step": 320
},
{
"epoch": 0.6593406593406593,
"grad_norm": 0.3471618592739105,
"learning_rate": 0.00019785809043254722,
"loss": 0.8232,
"step": 330
},
{
"epoch": 0.6793206793206793,
"grad_norm": 0.35060420632362366,
"learning_rate": 0.00019772681235681936,
"loss": 0.8194,
"step": 340
},
{
"epoch": 0.6993006993006993,
"grad_norm": 0.3695327341556549,
"learning_rate": 0.00019759167619387476,
"loss": 0.806,
"step": 350
},
{
"epoch": 0.7192807192807192,
"grad_norm": 0.35857513546943665,
"learning_rate": 0.00019745268727865774,
"loss": 0.8019,
"step": 360
},
{
"epoch": 0.7392607392607392,
"grad_norm": 0.3612421154975891,
"learning_rate": 0.00019730985109821266,
"loss": 0.8061,
"step": 370
},
{
"epoch": 0.7592407592407593,
"grad_norm": 0.34007078409194946,
"learning_rate": 0.0001971631732914674,
"loss": 0.7919,
"step": 380
},
{
"epoch": 0.7792207792207793,
"grad_norm": 0.3594492971897125,
"learning_rate": 0.0001970126596490106,
"loss": 0.7821,
"step": 390
},
{
"epoch": 0.7992007992007992,
"grad_norm": 0.37426885962486267,
"learning_rate": 0.0001968583161128631,
"loss": 0.8054,
"step": 400
},
{
"epoch": 0.8191808191808192,
"grad_norm": 0.3551250398159027,
"learning_rate": 0.00019670014877624353,
"loss": 0.7954,
"step": 410
},
{
"epoch": 0.8391608391608392,
"grad_norm": 0.35951119661331177,
"learning_rate": 0.0001965381638833274,
"loss": 0.7966,
"step": 420
},
{
"epoch": 0.8591408591408591,
"grad_norm": 0.36964887380599976,
"learning_rate": 0.000196372367829001,
"loss": 0.7888,
"step": 430
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.36829873919487,
"learning_rate": 0.0001962027671586086,
"loss": 0.7902,
"step": 440
},
{
"epoch": 0.8991008991008991,
"grad_norm": 0.34358304738998413,
"learning_rate": 0.0001960293685676943,
"loss": 0.7733,
"step": 450
},
{
"epoch": 0.919080919080919,
"grad_norm": 0.37369629740715027,
"learning_rate": 0.0001958521789017376,
"loss": 0.796,
"step": 460
},
{
"epoch": 0.939060939060939,
"grad_norm": 0.40985429286956787,
"learning_rate": 0.00019567120515588308,
"loss": 0.7931,
"step": 470
},
{
"epoch": 0.9590409590409591,
"grad_norm": 0.34838569164276123,
"learning_rate": 0.00019548645447466431,
"loss": 0.7682,
"step": 480
},
{
"epoch": 0.9790209790209791,
"grad_norm": 0.36467525362968445,
"learning_rate": 0.00019529793415172192,
"loss": 0.7781,
"step": 490
},
{
"epoch": 0.999000999000999,
"grad_norm": 0.37112316489219666,
"learning_rate": 0.00019510565162951537,
"loss": 0.7773,
"step": 500
},
{
"epoch": 1.018981018981019,
"grad_norm": 0.3998737931251526,
"learning_rate": 0.00019490961449902946,
"loss": 0.7324,
"step": 510
},
{
"epoch": 1.0389610389610389,
"grad_norm": 0.3966336250305176,
"learning_rate": 0.00019470983049947444,
"loss": 0.7395,
"step": 520
},
{
"epoch": 1.058941058941059,
"grad_norm": 0.39721325039863586,
"learning_rate": 0.00019450630751798048,
"loss": 0.7302,
"step": 530
},
{
"epoch": 1.078921078921079,
"grad_norm": 0.38532692193984985,
"learning_rate": 0.00019429905358928646,
"loss": 0.7177,
"step": 540
},
{
"epoch": 1.098901098901099,
"grad_norm": 0.3948540985584259,
"learning_rate": 0.00019408807689542257,
"loss": 0.7382,
"step": 550
},
{
"epoch": 1.118881118881119,
"grad_norm": 0.399676650762558,
"learning_rate": 0.00019387338576538744,
"loss": 0.7286,
"step": 560
},
{
"epoch": 1.138861138861139,
"grad_norm": 0.4208274781703949,
"learning_rate": 0.00019365498867481923,
"loss": 0.7251,
"step": 570
},
{
"epoch": 1.158841158841159,
"grad_norm": 0.4160782992839813,
"learning_rate": 0.00019343289424566122,
"loss": 0.7138,
"step": 580
},
{
"epoch": 1.1788211788211789,
"grad_norm": 0.4297160804271698,
"learning_rate": 0.0001932071112458211,
"loss": 0.7296,
"step": 590
},
{
"epoch": 1.1988011988011988,
"grad_norm": 0.4196039140224457,
"learning_rate": 0.00019297764858882514,
"loss": 0.7091,
"step": 600
},
{
"epoch": 1.2187812187812188,
"grad_norm": 0.406012624502182,
"learning_rate": 0.00019274451533346615,
"loss": 0.7021,
"step": 610
},
{
"epoch": 1.2387612387612388,
"grad_norm": 0.41200658679008484,
"learning_rate": 0.0001925077206834458,
"loss": 0.7238,
"step": 620
},
{
"epoch": 1.2587412587412588,
"grad_norm": 0.4819345772266388,
"learning_rate": 0.0001922672739870115,
"loss": 0.7275,
"step": 630
},
{
"epoch": 1.2787212787212787,
"grad_norm": 0.40825748443603516,
"learning_rate": 0.00019202318473658705,
"loss": 0.7183,
"step": 640
},
{
"epoch": 1.2987012987012987,
"grad_norm": 0.41940203309059143,
"learning_rate": 0.00019177546256839812,
"loss": 0.7149,
"step": 650
},
{
"epoch": 1.3186813186813187,
"grad_norm": 0.40075168013572693,
"learning_rate": 0.00019152411726209176,
"loss": 0.722,
"step": 660
},
{
"epoch": 1.3386613386613386,
"grad_norm": 0.4254063665866852,
"learning_rate": 0.0001912691587403503,
"loss": 0.7254,
"step": 670
},
{
"epoch": 1.3586413586413586,
"grad_norm": 0.39732539653778076,
"learning_rate": 0.00019101059706849957,
"loss": 0.7115,
"step": 680
},
{
"epoch": 1.3786213786213786,
"grad_norm": 0.3889389932155609,
"learning_rate": 0.0001907484424541117,
"loss": 0.7031,
"step": 690
},
{
"epoch": 1.3986013986013985,
"grad_norm": 0.3994196355342865,
"learning_rate": 0.00019048270524660196,
"loss": 0.7095,
"step": 700
},
{
"epoch": 1.4185814185814185,
"grad_norm": 0.4238826036453247,
"learning_rate": 0.00019021339593682028,
"loss": 0.7156,
"step": 710
},
{
"epoch": 1.4385614385614387,
"grad_norm": 0.4787987172603607,
"learning_rate": 0.0001899405251566371,
"loss": 0.7142,
"step": 720
},
{
"epoch": 1.4585414585414584,
"grad_norm": 0.4219954013824463,
"learning_rate": 0.00018966410367852362,
"loss": 0.7267,
"step": 730
},
{
"epoch": 1.4785214785214786,
"grad_norm": 0.4154765009880066,
"learning_rate": 0.0001893841424151264,
"loss": 0.721,
"step": 740
},
{
"epoch": 1.4985014985014984,
"grad_norm": 0.44605547189712524,
"learning_rate": 0.0001891006524188368,
"loss": 0.7266,
"step": 750
},
{
"epoch": 1.5184815184815186,
"grad_norm": 0.4613310992717743,
"learning_rate": 0.00018881364488135448,
"loss": 0.7253,
"step": 760
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.41615426540374756,
"learning_rate": 0.00018852313113324552,
"loss": 0.69,
"step": 770
},
{
"epoch": 1.5584415584415585,
"grad_norm": 0.4512516260147095,
"learning_rate": 0.00018822912264349534,
"loss": 0.7124,
"step": 780
},
{
"epoch": 1.5784215784215783,
"grad_norm": 0.464336633682251,
"learning_rate": 0.00018793163101905563,
"loss": 0.7067,
"step": 790
},
{
"epoch": 1.5984015984015985,
"grad_norm": 0.4427087604999542,
"learning_rate": 0.00018763066800438636,
"loss": 0.7097,
"step": 800
},
{
"epoch": 1.6183816183816184,
"grad_norm": 0.43341028690338135,
"learning_rate": 0.00018732624548099204,
"loss": 0.7068,
"step": 810
},
{
"epoch": 1.6383616383616384,
"grad_norm": 0.4100460112094879,
"learning_rate": 0.0001870183754669526,
"loss": 0.705,
"step": 820
},
{
"epoch": 1.6583416583416584,
"grad_norm": 0.43942147493362427,
"learning_rate": 0.000186707070116449,
"loss": 0.7043,
"step": 830
},
{
"epoch": 1.6783216783216783,
"grad_norm": 0.430095911026001,
"learning_rate": 0.00018639234171928353,
"loss": 0.6989,
"step": 840
},
{
"epoch": 1.6983016983016983,
"grad_norm": 0.40418198704719543,
"learning_rate": 0.0001860742027003944,
"loss": 0.6933,
"step": 850
},
{
"epoch": 1.7182817182817183,
"grad_norm": 0.40910184383392334,
"learning_rate": 0.00018575266561936523,
"loss": 0.6848,
"step": 860
},
{
"epoch": 1.7382617382617382,
"grad_norm": 0.4620640277862549,
"learning_rate": 0.0001854277431699295,
"loss": 0.6943,
"step": 870
},
{
"epoch": 1.7582417582417582,
"grad_norm": 0.4648028314113617,
"learning_rate": 0.00018509944817946922,
"loss": 0.6993,
"step": 880
},
{
"epoch": 1.7782217782217782,
"grad_norm": 0.43752139806747437,
"learning_rate": 0.00018476779360850832,
"loss": 0.6827,
"step": 890
},
{
"epoch": 1.7982017982017982,
"grad_norm": 0.4481639862060547,
"learning_rate": 0.00018443279255020152,
"loss": 0.6978,
"step": 900
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.4678110182285309,
"learning_rate": 0.00018409445822981693,
"loss": 0.6848,
"step": 910
},
{
"epoch": 1.838161838161838,
"grad_norm": 0.433933824300766,
"learning_rate": 0.0001837528040042142,
"loss": 0.658,
"step": 920
},
{
"epoch": 1.8581418581418583,
"grad_norm": 0.4601323902606964,
"learning_rate": 0.00018340784336131713,
"loss": 0.6912,
"step": 930
},
{
"epoch": 1.878121878121878,
"grad_norm": 0.4591493308544159,
"learning_rate": 0.00018305958991958127,
"loss": 0.697,
"step": 940
},
{
"epoch": 1.8981018981018982,
"grad_norm": 0.445711225271225,
"learning_rate": 0.00018270805742745617,
"loss": 0.6922,
"step": 950
},
{
"epoch": 1.918081918081918,
"grad_norm": 0.43125954270362854,
"learning_rate": 0.00018235325976284275,
"loss": 0.6742,
"step": 960
},
{
"epoch": 1.9380619380619382,
"grad_norm": 0.4716484248638153,
"learning_rate": 0.00018199521093254523,
"loss": 0.6796,
"step": 970
},
{
"epoch": 1.958041958041958,
"grad_norm": 0.4613405764102936,
"learning_rate": 0.00018163392507171842,
"loss": 0.6832,
"step": 980
},
{
"epoch": 1.978021978021978,
"grad_norm": 0.48080363869667053,
"learning_rate": 0.0001812694164433094,
"loss": 0.6807,
"step": 990
},
{
"epoch": 1.9980019980019978,
"grad_norm": 0.47017648816108704,
"learning_rate": 0.00018090169943749476,
"loss": 0.6785,
"step": 1000
},
{
"epoch": 2.017982017982018,
"grad_norm": 0.516197919845581,
"learning_rate": 0.0001805307885711122,
"loss": 0.6019,
"step": 1010
},
{
"epoch": 2.037962037962038,
"grad_norm": 0.5556052923202515,
"learning_rate": 0.00018015669848708767,
"loss": 0.5906,
"step": 1020
},
{
"epoch": 2.057942057942058,
"grad_norm": 0.5169907808303833,
"learning_rate": 0.0001797794439538571,
"loss": 0.6076,
"step": 1030
},
{
"epoch": 2.0779220779220777,
"grad_norm": 0.5560281276702881,
"learning_rate": 0.00017939903986478355,
"loss": 0.582,
"step": 1040
},
{
"epoch": 2.097902097902098,
"grad_norm": 0.521091878414154,
"learning_rate": 0.00017901550123756906,
"loss": 0.5929,
"step": 1050
},
{
"epoch": 2.117882117882118,
"grad_norm": 0.5990195870399475,
"learning_rate": 0.00017862884321366188,
"loss": 0.5863,
"step": 1060
},
{
"epoch": 2.137862137862138,
"grad_norm": 0.5285313725471497,
"learning_rate": 0.0001782390810576588,
"loss": 0.5845,
"step": 1070
},
{
"epoch": 2.157842157842158,
"grad_norm": 0.5402159690856934,
"learning_rate": 0.00017784623015670238,
"loss": 0.5926,
"step": 1080
},
{
"epoch": 2.177822177822178,
"grad_norm": 0.5576025247573853,
"learning_rate": 0.00017745030601987337,
"loss": 0.5964,
"step": 1090
},
{
"epoch": 2.197802197802198,
"grad_norm": 0.5605506896972656,
"learning_rate": 0.00017705132427757895,
"loss": 0.5877,
"step": 1100
},
{
"epoch": 2.2177822177822177,
"grad_norm": 0.5754747986793518,
"learning_rate": 0.00017664930068093498,
"loss": 0.6002,
"step": 1110
},
{
"epoch": 2.237762237762238,
"grad_norm": 0.5654470324516296,
"learning_rate": 0.0001762442511011448,
"loss": 0.5922,
"step": 1120
},
{
"epoch": 2.2577422577422577,
"grad_norm": 0.5414491891860962,
"learning_rate": 0.0001758361915288722,
"loss": 0.5917,
"step": 1130
},
{
"epoch": 2.277722277722278,
"grad_norm": 0.5563125014305115,
"learning_rate": 0.00017542513807361037,
"loss": 0.5867,
"step": 1140
},
{
"epoch": 2.2977022977022976,
"grad_norm": 0.5236257314682007,
"learning_rate": 0.00017501110696304596,
"loss": 0.5888,
"step": 1150
},
{
"epoch": 2.317682317682318,
"grad_norm": 0.614734411239624,
"learning_rate": 0.00017459411454241822,
"loss": 0.6001,
"step": 1160
},
{
"epoch": 2.3376623376623376,
"grad_norm": 0.605421781539917,
"learning_rate": 0.00017417417727387394,
"loss": 0.5968,
"step": 1170
},
{
"epoch": 2.3576423576423577,
"grad_norm": 0.5595569014549255,
"learning_rate": 0.0001737513117358174,
"loss": 0.5924,
"step": 1180
},
{
"epoch": 2.3776223776223775,
"grad_norm": 0.5283003449440002,
"learning_rate": 0.00017332553462225602,
"loss": 0.5952,
"step": 1190
},
{
"epoch": 2.3976023976023977,
"grad_norm": 0.5287072658538818,
"learning_rate": 0.00017289686274214118,
"loss": 0.5763,
"step": 1200
},
{
"epoch": 2.4175824175824174,
"grad_norm": 0.5907203555107117,
"learning_rate": 0.0001724653130187047,
"loss": 0.5993,
"step": 1210
},
{
"epoch": 2.4375624375624376,
"grad_norm": 0.5622738003730774,
"learning_rate": 0.0001720309024887907,
"loss": 0.6001,
"step": 1220
},
{
"epoch": 2.4575424575424574,
"grad_norm": 0.5795326232910156,
"learning_rate": 0.00017159364830218312,
"loss": 0.5857,
"step": 1230
},
{
"epoch": 2.4775224775224776,
"grad_norm": 0.5654671788215637,
"learning_rate": 0.00017115356772092857,
"loss": 0.5809,
"step": 1240
},
{
"epoch": 2.4975024975024973,
"grad_norm": 0.5641043186187744,
"learning_rate": 0.00017071067811865476,
"loss": 0.5824,
"step": 1250
},
{
"epoch": 2.5174825174825175,
"grad_norm": 0.5851653218269348,
"learning_rate": 0.00017026499697988493,
"loss": 0.59,
"step": 1260
},
{
"epoch": 2.5374625374625372,
"grad_norm": 0.570210337638855,
"learning_rate": 0.00016981654189934727,
"loss": 0.5761,
"step": 1270
},
{
"epoch": 2.5574425574425574,
"grad_norm": 0.5725647807121277,
"learning_rate": 0.0001693653305812805,
"loss": 0.589,
"step": 1280
},
{
"epoch": 2.5774225774225776,
"grad_norm": 0.5896579623222351,
"learning_rate": 0.00016891138083873487,
"loss": 0.5852,
"step": 1290
},
{
"epoch": 2.5974025974025974,
"grad_norm": 0.5988901853561401,
"learning_rate": 0.00016845471059286887,
"loss": 0.5723,
"step": 1300
},
{
"epoch": 2.617382617382617,
"grad_norm": 0.5854650735855103,
"learning_rate": 0.00016799533787224192,
"loss": 0.5845,
"step": 1310
},
{
"epoch": 2.6373626373626373,
"grad_norm": 0.5547802448272705,
"learning_rate": 0.00016753328081210245,
"loss": 0.5909,
"step": 1320
},
{
"epoch": 2.6573426573426575,
"grad_norm": 0.5562127232551575,
"learning_rate": 0.000167068557653672,
"loss": 0.5799,
"step": 1330
},
{
"epoch": 2.6773226773226773,
"grad_norm": 0.5999246835708618,
"learning_rate": 0.00016660118674342517,
"loss": 0.5757,
"step": 1340
},
{
"epoch": 2.6973026973026974,
"grad_norm": 0.5909945368766785,
"learning_rate": 0.00016613118653236518,
"loss": 0.5674,
"step": 1350
},
{
"epoch": 2.717282717282717,
"grad_norm": 0.6357455849647522,
"learning_rate": 0.00016565857557529566,
"loss": 0.5821,
"step": 1360
},
{
"epoch": 2.7372627372627374,
"grad_norm": 0.6019343733787537,
"learning_rate": 0.0001651833725300879,
"loss": 0.5783,
"step": 1370
},
{
"epoch": 2.757242757242757,
"grad_norm": 0.6180288791656494,
"learning_rate": 0.00016470559615694446,
"loss": 0.6056,
"step": 1380
},
{
"epoch": 2.7772227772227773,
"grad_norm": 0.6171667575836182,
"learning_rate": 0.00016422526531765846,
"loss": 0.5799,
"step": 1390
},
{
"epoch": 2.797202797202797,
"grad_norm": 0.5991246700286865,
"learning_rate": 0.000163742398974869,
"loss": 0.5668,
"step": 1400
},
{
"epoch": 2.8171828171828173,
"grad_norm": 0.6568031907081604,
"learning_rate": 0.00016325701619131246,
"loss": 0.5662,
"step": 1410
},
{
"epoch": 2.837162837162837,
"grad_norm": 0.6639891266822815,
"learning_rate": 0.00016276913612907007,
"loss": 0.5797,
"step": 1420
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.5978193879127502,
"learning_rate": 0.00016227877804881127,
"loss": 0.5613,
"step": 1430
},
{
"epoch": 2.8771228771228774,
"grad_norm": 0.576871395111084,
"learning_rate": 0.00016178596130903344,
"loss": 0.5796,
"step": 1440
},
{
"epoch": 2.897102897102897,
"grad_norm": 0.5936170220375061,
"learning_rate": 0.00016129070536529766,
"loss": 0.5791,
"step": 1450
},
{
"epoch": 2.917082917082917,
"grad_norm": 0.6093722581863403,
"learning_rate": 0.00016079302976946055,
"loss": 0.5836,
"step": 1460
},
{
"epoch": 2.937062937062937,
"grad_norm": 0.5815151929855347,
"learning_rate": 0.00016029295416890248,
"loss": 0.5644,
"step": 1470
},
{
"epoch": 2.9570429570429573,
"grad_norm": 0.621591329574585,
"learning_rate": 0.0001597904983057519,
"loss": 0.5779,
"step": 1480
},
{
"epoch": 2.977022977022977,
"grad_norm": 0.5824622511863708,
"learning_rate": 0.00015928568201610595,
"loss": 0.5659,
"step": 1490
},
{
"epoch": 2.9970029970029968,
"grad_norm": 0.6264435052871704,
"learning_rate": 0.00015877852522924732,
"loss": 0.5823,
"step": 1500
},
{
"epoch": 3.016983016983017,
"grad_norm": 0.7021110653877258,
"learning_rate": 0.00015826904796685762,
"loss": 0.4732,
"step": 1510
},
{
"epoch": 3.036963036963037,
"grad_norm": 0.7195537686347961,
"learning_rate": 0.00015775727034222675,
"loss": 0.4484,
"step": 1520
},
{
"epoch": 3.056943056943057,
"grad_norm": 0.7159614562988281,
"learning_rate": 0.0001572432125594591,
"loss": 0.4533,
"step": 1530
},
{
"epoch": 3.076923076923077,
"grad_norm": 0.686655580997467,
"learning_rate": 0.00015672689491267567,
"loss": 0.4588,
"step": 1540
},
{
"epoch": 3.096903096903097,
"grad_norm": 0.6840978264808655,
"learning_rate": 0.00015620833778521307,
"loss": 0.4632,
"step": 1550
},
{
"epoch": 3.116883116883117,
"grad_norm": 0.6888960003852844,
"learning_rate": 0.00015568756164881882,
"loss": 0.463,
"step": 1560
},
{
"epoch": 3.136863136863137,
"grad_norm": 0.6887105107307434,
"learning_rate": 0.00015516458706284303,
"loss": 0.4683,
"step": 1570
},
{
"epoch": 3.156843156843157,
"grad_norm": 0.6880657076835632,
"learning_rate": 0.00015463943467342693,
"loss": 0.4703,
"step": 1580
},
{
"epoch": 3.1768231768231767,
"grad_norm": 0.667488157749176,
"learning_rate": 0.00015411212521268758,
"loss": 0.4681,
"step": 1590
},
{
"epoch": 3.196803196803197,
"grad_norm": 0.7201547026634216,
"learning_rate": 0.00015358267949789966,
"loss": 0.4708,
"step": 1600
},
{
"epoch": 3.2167832167832167,
"grad_norm": 0.7887006998062134,
"learning_rate": 0.0001530511184306734,
"loss": 0.4692,
"step": 1610
},
{
"epoch": 3.236763236763237,
"grad_norm": 0.6850538849830627,
"learning_rate": 0.0001525174629961296,
"loss": 0.4652,
"step": 1620
},
{
"epoch": 3.2567432567432566,
"grad_norm": 0.7573882937431335,
"learning_rate": 0.00015198173426207094,
"loss": 0.4618,
"step": 1630
},
{
"epoch": 3.276723276723277,
"grad_norm": 0.7027117609977722,
"learning_rate": 0.00015144395337815064,
"loss": 0.4665,
"step": 1640
},
{
"epoch": 3.2967032967032965,
"grad_norm": 0.6847530007362366,
"learning_rate": 0.00015090414157503714,
"loss": 0.4669,
"step": 1650
},
{
"epoch": 3.3166833166833167,
"grad_norm": 0.7099263072013855,
"learning_rate": 0.0001503623201635761,
"loss": 0.4666,
"step": 1660
},
{
"epoch": 3.3366633366633365,
"grad_norm": 0.6803727149963379,
"learning_rate": 0.0001498185105339491,
"loss": 0.4674,
"step": 1670
},
{
"epoch": 3.3566433566433567,
"grad_norm": 0.7080752849578857,
"learning_rate": 0.00014927273415482915,
"loss": 0.4694,
"step": 1680
},
{
"epoch": 3.3766233766233764,
"grad_norm": 0.7016042470932007,
"learning_rate": 0.00014872501257253323,
"loss": 0.4716,
"step": 1690
},
{
"epoch": 3.3966033966033966,
"grad_norm": 0.6896219849586487,
"learning_rate": 0.00014817536741017152,
"loss": 0.4706,
"step": 1700
},
{
"epoch": 3.416583416583417,
"grad_norm": 0.7319151163101196,
"learning_rate": 0.0001476238203667939,
"loss": 0.4657,
"step": 1710
},
{
"epoch": 3.4365634365634365,
"grad_norm": 0.7796220779418945,
"learning_rate": 0.0001470703932165333,
"loss": 0.4762,
"step": 1720
},
{
"epoch": 3.4565434565434563,
"grad_norm": 0.6749796271324158,
"learning_rate": 0.00014651510780774583,
"loss": 0.4602,
"step": 1730
},
{
"epoch": 3.4765234765234765,
"grad_norm": 0.6736605167388916,
"learning_rate": 0.00014595798606214882,
"loss": 0.4751,
"step": 1740
},
{
"epoch": 3.4965034965034967,
"grad_norm": 0.7386316657066345,
"learning_rate": 0.00014539904997395468,
"loss": 0.4658,
"step": 1750
},
{
"epoch": 3.5164835164835164,
"grad_norm": 0.7023107409477234,
"learning_rate": 0.00014483832160900326,
"loss": 0.4678,
"step": 1760
},
{
"epoch": 3.5364635364635366,
"grad_norm": 0.6938359141349792,
"learning_rate": 0.0001442758231038902,
"loss": 0.4619,
"step": 1770
},
{
"epoch": 3.5564435564435564,
"grad_norm": 0.7815272212028503,
"learning_rate": 0.0001437115766650933,
"loss": 0.4744,
"step": 1780
},
{
"epoch": 3.5764235764235766,
"grad_norm": 0.7307267189025879,
"learning_rate": 0.0001431456045680959,
"loss": 0.4767,
"step": 1790
},
{
"epoch": 3.5964035964035963,
"grad_norm": 0.6948580741882324,
"learning_rate": 0.00014257792915650728,
"loss": 0.4644,
"step": 1800
},
{
"epoch": 3.6163836163836165,
"grad_norm": 0.691348671913147,
"learning_rate": 0.00014200857284118066,
"loss": 0.4609,
"step": 1810
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.7828198671340942,
"learning_rate": 0.00014143755809932845,
"loss": 0.4506,
"step": 1820
},
{
"epoch": 3.6563436563436564,
"grad_norm": 0.73238205909729,
"learning_rate": 0.00014086490747363493,
"loss": 0.4599,
"step": 1830
},
{
"epoch": 3.676323676323676,
"grad_norm": 0.7216520309448242,
"learning_rate": 0.00014029064357136628,
"loss": 0.4582,
"step": 1840
},
{
"epoch": 3.6963036963036964,
"grad_norm": 0.7676394581794739,
"learning_rate": 0.00013971478906347806,
"loss": 0.4494,
"step": 1850
},
{
"epoch": 3.716283716283716,
"grad_norm": 0.7596750259399414,
"learning_rate": 0.00013913736668372026,
"loss": 0.4704,
"step": 1860
},
{
"epoch": 3.7362637362637363,
"grad_norm": 0.7686085104942322,
"learning_rate": 0.00013855839922773968,
"loss": 0.4603,
"step": 1870
},
{
"epoch": 3.756243756243756,
"grad_norm": 0.6850613951683044,
"learning_rate": 0.00013797790955218014,
"loss": 0.4503,
"step": 1880
},
{
"epoch": 3.7762237762237763,
"grad_norm": 0.721778392791748,
"learning_rate": 0.00013739592057378003,
"loss": 0.4713,
"step": 1890
},
{
"epoch": 3.7962037962037964,
"grad_norm": 0.7122541069984436,
"learning_rate": 0.00013681245526846783,
"loss": 0.4664,
"step": 1900
},
{
"epoch": 3.816183816183816,
"grad_norm": 0.7361748218536377,
"learning_rate": 0.00013622753667045457,
"loss": 0.4571,
"step": 1910
},
{
"epoch": 3.836163836163836,
"grad_norm": 0.8220844864845276,
"learning_rate": 0.00013564118787132506,
"loss": 0.4521,
"step": 1920
},
{
"epoch": 3.856143856143856,
"grad_norm": 0.7139246463775635,
"learning_rate": 0.0001350534320191259,
"loss": 0.4491,
"step": 1930
},
{
"epoch": 3.8761238761238763,
"grad_norm": 0.7244653701782227,
"learning_rate": 0.0001344642923174517,
"loss": 0.4552,
"step": 1940
},
{
"epoch": 3.896103896103896,
"grad_norm": 0.7056713700294495,
"learning_rate": 0.00013387379202452917,
"loss": 0.4548,
"step": 1950
},
{
"epoch": 3.916083916083916,
"grad_norm": 0.7653645277023315,
"learning_rate": 0.00013328195445229868,
"loss": 0.4492,
"step": 1960
},
{
"epoch": 3.936063936063936,
"grad_norm": 0.6818165183067322,
"learning_rate": 0.00013268880296549425,
"loss": 0.4463,
"step": 1970
},
{
"epoch": 3.956043956043956,
"grad_norm": 0.687439501285553,
"learning_rate": 0.00013209436098072095,
"loss": 0.457,
"step": 1980
},
{
"epoch": 3.976023976023976,
"grad_norm": 0.7704656720161438,
"learning_rate": 0.0001314986519655305,
"loss": 0.4522,
"step": 1990
},
{
"epoch": 3.996003996003996,
"grad_norm": 0.7227702736854553,
"learning_rate": 0.00013090169943749476,
"loss": 0.4454,
"step": 2000
},
{
"epoch": 4.015984015984016,
"grad_norm": 0.8689281344413757,
"learning_rate": 0.00013030352696327742,
"loss": 0.3645,
"step": 2010
},
{
"epoch": 4.035964035964036,
"grad_norm": 0.7620906829833984,
"learning_rate": 0.0001297041581577035,
"loss": 0.3478,
"step": 2020
},
{
"epoch": 4.055944055944056,
"grad_norm": 0.768671989440918,
"learning_rate": 0.00012910361668282719,
"loss": 0.3595,
"step": 2030
},
{
"epoch": 4.075924075924076,
"grad_norm": 0.7327402234077454,
"learning_rate": 0.0001285019262469976,
"loss": 0.3471,
"step": 2040
},
{
"epoch": 4.095904095904096,
"grad_norm": 0.6913720965385437,
"learning_rate": 0.00012789911060392294,
"loss": 0.3501,
"step": 2050
},
{
"epoch": 4.115884115884116,
"grad_norm": 0.7310584783554077,
"learning_rate": 0.00012729519355173254,
"loss": 0.3509,
"step": 2060
},
{
"epoch": 4.135864135864136,
"grad_norm": 0.7578213214874268,
"learning_rate": 0.00012669019893203759,
"loss": 0.3506,
"step": 2070
},
{
"epoch": 4.1558441558441555,
"grad_norm": 0.7301665544509888,
"learning_rate": 0.00012608415062898972,
"loss": 0.3536,
"step": 2080
},
{
"epoch": 4.175824175824176,
"grad_norm": 0.8198577165603638,
"learning_rate": 0.00012547707256833823,
"loss": 0.3578,
"step": 2090
},
{
"epoch": 4.195804195804196,
"grad_norm": 0.7331268787384033,
"learning_rate": 0.0001248689887164855,
"loss": 0.3508,
"step": 2100
},
{
"epoch": 4.215784215784216,
"grad_norm": 0.7666186094284058,
"learning_rate": 0.00012425992307954075,
"loss": 0.3468,
"step": 2110
},
{
"epoch": 4.235764235764236,
"grad_norm": 0.7020666599273682,
"learning_rate": 0.00012364989970237248,
"loss": 0.3586,
"step": 2120
},
{
"epoch": 4.255744255744256,
"grad_norm": 0.7276338338851929,
"learning_rate": 0.00012303894266765908,
"loss": 0.3672,
"step": 2130
},
{
"epoch": 4.275724275724276,
"grad_norm": 0.6978778839111328,
"learning_rate": 0.00012242707609493814,
"loss": 0.3576,
"step": 2140
},
{
"epoch": 4.2957042957042955,
"grad_norm": 0.822030246257782,
"learning_rate": 0.00012181432413965428,
"loss": 0.3618,
"step": 2150
},
{
"epoch": 4.315684315684316,
"grad_norm": 0.744611918926239,
"learning_rate": 0.00012120071099220549,
"loss": 0.3578,
"step": 2160
},
{
"epoch": 4.335664335664336,
"grad_norm": 0.7712835669517517,
"learning_rate": 0.00012058626087698814,
"loss": 0.3632,
"step": 2170
},
{
"epoch": 4.355644355644356,
"grad_norm": 0.7824398279190063,
"learning_rate": 0.00011997099805144069,
"loss": 0.36,
"step": 2180
},
{
"epoch": 4.375624375624375,
"grad_norm": 0.8473492860794067,
"learning_rate": 0.00011935494680508606,
"loss": 0.3645,
"step": 2190
},
{
"epoch": 4.395604395604396,
"grad_norm": 0.7394326329231262,
"learning_rate": 0.00011873813145857249,
"loss": 0.3604,
"step": 2200
},
{
"epoch": 4.415584415584416,
"grad_norm": 0.763633131980896,
"learning_rate": 0.00011812057636271374,
"loss": 0.3634,
"step": 2210
},
{
"epoch": 4.4355644355644355,
"grad_norm": 0.7612594962120056,
"learning_rate": 0.00011750230589752762,
"loss": 0.355,
"step": 2220
},
{
"epoch": 4.455544455544455,
"grad_norm": 0.7789061665534973,
"learning_rate": 0.00011688334447127338,
"loss": 0.3629,
"step": 2230
},
{
"epoch": 4.475524475524476,
"grad_norm": 0.7422770261764526,
"learning_rate": 0.00011626371651948838,
"loss": 0.361,
"step": 2240
},
{
"epoch": 4.495504495504496,
"grad_norm": 0.7636354565620422,
"learning_rate": 0.0001156434465040231,
"loss": 0.3593,
"step": 2250
},
{
"epoch": 4.515484515484515,
"grad_norm": 0.7884863615036011,
"learning_rate": 0.00011502255891207572,
"loss": 0.3587,
"step": 2260
},
{
"epoch": 4.535464535464535,
"grad_norm": 0.7233232855796814,
"learning_rate": 0.00011440107825522521,
"loss": 0.3577,
"step": 2270
},
{
"epoch": 4.555444555444556,
"grad_norm": 0.8420186638832092,
"learning_rate": 0.0001137790290684638,
"loss": 0.3686,
"step": 2280
},
{
"epoch": 4.5754245754245755,
"grad_norm": 0.7679941654205322,
"learning_rate": 0.00011315643590922827,
"loss": 0.3539,
"step": 2290
},
{
"epoch": 4.595404595404595,
"grad_norm": 0.826885461807251,
"learning_rate": 0.00011253332335643043,
"loss": 0.3627,
"step": 2300
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.7590234875679016,
"learning_rate": 0.00011190971600948699,
"loss": 0.3613,
"step": 2310
},
{
"epoch": 4.635364635364636,
"grad_norm": 0.7376580238342285,
"learning_rate": 0.00011128563848734816,
"loss": 0.3694,
"step": 2320
},
{
"epoch": 4.655344655344655,
"grad_norm": 0.7795658111572266,
"learning_rate": 0.000110661115427526,
"loss": 0.3598,
"step": 2330
},
{
"epoch": 4.675324675324675,
"grad_norm": 0.7736489176750183,
"learning_rate": 0.00011003617148512149,
"loss": 0.3598,
"step": 2340
},
{
"epoch": 4.695304695304696,
"grad_norm": 0.757072925567627,
"learning_rate": 0.00010941083133185146,
"loss": 0.366,
"step": 2350
},
{
"epoch": 4.7152847152847155,
"grad_norm": 0.8167831301689148,
"learning_rate": 0.00010878511965507434,
"loss": 0.3633,
"step": 2360
},
{
"epoch": 4.735264735264735,
"grad_norm": 0.8083499670028687,
"learning_rate": 0.00010815906115681578,
"loss": 0.3562,
"step": 2370
},
{
"epoch": 4.755244755244755,
"grad_norm": 0.7758758068084717,
"learning_rate": 0.00010753268055279329,
"loss": 0.3614,
"step": 2380
},
{
"epoch": 4.775224775224775,
"grad_norm": 0.8572462797164917,
"learning_rate": 0.00010690600257144061,
"loss": 0.3652,
"step": 2390
},
{
"epoch": 4.795204795204795,
"grad_norm": 0.8319938778877258,
"learning_rate": 0.00010627905195293135,
"loss": 0.3622,
"step": 2400
},
{
"epoch": 4.815184815184815,
"grad_norm": 0.8004459142684937,
"learning_rate": 0.00010565185344820247,
"loss": 0.3604,
"step": 2410
},
{
"epoch": 4.835164835164835,
"grad_norm": 0.790908694267273,
"learning_rate": 0.00010502443181797697,
"loss": 0.3587,
"step": 2420
},
{
"epoch": 4.8551448551448555,
"grad_norm": 0.7726609110832214,
"learning_rate": 0.0001043968118317865,
"loss": 0.364,
"step": 2430
},
{
"epoch": 4.875124875124875,
"grad_norm": 0.7808167338371277,
"learning_rate": 0.00010376901826699348,
"loss": 0.3637,
"step": 2440
},
{
"epoch": 4.895104895104895,
"grad_norm": 0.8596636652946472,
"learning_rate": 0.00010314107590781284,
"loss": 0.3536,
"step": 2450
},
{
"epoch": 4.915084915084915,
"grad_norm": 0.8091081380844116,
"learning_rate": 0.00010251300954433376,
"loss": 0.3522,
"step": 2460
},
{
"epoch": 4.935064935064935,
"grad_norm": 0.8672420978546143,
"learning_rate": 0.00010188484397154084,
"loss": 0.3643,
"step": 2470
},
{
"epoch": 4.955044955044955,
"grad_norm": 0.7860444188117981,
"learning_rate": 0.00010125660398833528,
"loss": 0.3493,
"step": 2480
},
{
"epoch": 4.975024975024975,
"grad_norm": 0.7510725259780884,
"learning_rate": 0.00010062831439655591,
"loss": 0.3497,
"step": 2490
},
{
"epoch": 4.995004995004995,
"grad_norm": 0.7850112915039062,
"learning_rate": 0.0001,
"loss": 0.361,
"step": 2500
},
{
"epoch": 5.014985014985015,
"grad_norm": 0.9001740217208862,
"learning_rate": 9.937168560344412e-05,
"loss": 0.2983,
"step": 2510
},
{
"epoch": 5.034965034965035,
"grad_norm": 0.683803141117096,
"learning_rate": 9.874339601166473e-05,
"loss": 0.2805,
"step": 2520
},
{
"epoch": 5.054945054945055,
"grad_norm": 0.7267177700996399,
"learning_rate": 9.81151560284592e-05,
"loss": 0.2751,
"step": 2530
},
{
"epoch": 5.0749250749250745,
"grad_norm": 0.7268999814987183,
"learning_rate": 9.748699045566626e-05,
"loss": 0.2805,
"step": 2540
},
{
"epoch": 5.094905094905095,
"grad_norm": 0.6958262324333191,
"learning_rate": 9.685892409218717e-05,
"loss": 0.2808,
"step": 2550
},
{
"epoch": 5.114885114885115,
"grad_norm": 0.7481863498687744,
"learning_rate": 9.623098173300654e-05,
"loss": 0.2808,
"step": 2560
},
{
"epoch": 5.134865134865135,
"grad_norm": 0.6923096179962158,
"learning_rate": 9.560318816821353e-05,
"loss": 0.2802,
"step": 2570
},
{
"epoch": 5.154845154845155,
"grad_norm": 0.8236074447631836,
"learning_rate": 9.497556818202306e-05,
"loss": 0.2845,
"step": 2580
},
{
"epoch": 5.174825174825175,
"grad_norm": 0.7225534915924072,
"learning_rate": 9.434814655179755e-05,
"loss": 0.2802,
"step": 2590
},
{
"epoch": 5.194805194805195,
"grad_norm": 0.7639855146408081,
"learning_rate": 9.372094804706867e-05,
"loss": 0.2846,
"step": 2600
},
{
"epoch": 5.2147852147852145,
"grad_norm": 0.7572929859161377,
"learning_rate": 9.309399742855942e-05,
"loss": 0.2826,
"step": 2610
},
{
"epoch": 5.234765234765235,
"grad_norm": 0.8045923709869385,
"learning_rate": 9.246731944720675e-05,
"loss": 0.2862,
"step": 2620
},
{
"epoch": 5.254745254745255,
"grad_norm": 0.7385067939758301,
"learning_rate": 9.184093884318425e-05,
"loss": 0.2886,
"step": 2630
},
{
"epoch": 5.274725274725275,
"grad_norm": 0.7742624282836914,
"learning_rate": 9.121488034492569e-05,
"loss": 0.2857,
"step": 2640
},
{
"epoch": 5.294705294705294,
"grad_norm": 0.73873370885849,
"learning_rate": 9.058916866814858e-05,
"loss": 0.2874,
"step": 2650
},
{
"epoch": 5.314685314685315,
"grad_norm": 0.8087053298950195,
"learning_rate": 8.99638285148785e-05,
"loss": 0.2814,
"step": 2660
},
{
"epoch": 5.334665334665335,
"grad_norm": 0.7873129844665527,
"learning_rate": 8.933888457247402e-05,
"loss": 0.2827,
"step": 2670
},
{
"epoch": 5.3546453546453545,
"grad_norm": 0.776678204536438,
"learning_rate": 8.871436151265184e-05,
"loss": 0.2861,
"step": 2680
},
{
"epoch": 5.374625374625374,
"grad_norm": 0.7478957772254944,
"learning_rate": 8.809028399051302e-05,
"loss": 0.2841,
"step": 2690
},
{
"epoch": 5.394605394605395,
"grad_norm": 0.7491159439086914,
"learning_rate": 8.746667664356956e-05,
"loss": 0.2781,
"step": 2700
},
{
"epoch": 5.414585414585415,
"grad_norm": 0.7022270560264587,
"learning_rate": 8.684356409077176e-05,
"loss": 0.2831,
"step": 2710
},
{
"epoch": 5.434565434565434,
"grad_norm": 0.714643120765686,
"learning_rate": 8.62209709315362e-05,
"loss": 0.2816,
"step": 2720
},
{
"epoch": 5.454545454545454,
"grad_norm": 0.7695267796516418,
"learning_rate": 8.559892174477479e-05,
"loss": 0.2845,
"step": 2730
},
{
"epoch": 5.474525474525475,
"grad_norm": 0.7670512795448303,
"learning_rate": 8.497744108792429e-05,
"loss": 0.284,
"step": 2740
},
{
"epoch": 5.4945054945054945,
"grad_norm": 0.7777095437049866,
"learning_rate": 8.435655349597689e-05,
"loss": 0.2849,
"step": 2750
},
{
"epoch": 5.514485514485514,
"grad_norm": 0.7117462158203125,
"learning_rate": 8.373628348051165e-05,
"loss": 0.2892,
"step": 2760
},
{
"epoch": 5.534465534465534,
"grad_norm": 0.7786485552787781,
"learning_rate": 8.311665552872662e-05,
"loss": 0.2867,
"step": 2770
},
{
"epoch": 5.554445554445555,
"grad_norm": 0.7926625609397888,
"learning_rate": 8.249769410247239e-05,
"loss": 0.2862,
"step": 2780
},
{
"epoch": 5.574425574425574,
"grad_norm": 0.7426894307136536,
"learning_rate": 8.187942363728625e-05,
"loss": 0.288,
"step": 2790
},
{
"epoch": 5.594405594405594,
"grad_norm": 0.7075335383415222,
"learning_rate": 8.126186854142752e-05,
"loss": 0.2847,
"step": 2800
},
{
"epoch": 5.614385614385615,
"grad_norm": 0.7743814587593079,
"learning_rate": 8.064505319491398e-05,
"loss": 0.2912,
"step": 2810
},
{
"epoch": 5.6343656343656345,
"grad_norm": 0.7679479122161865,
"learning_rate": 8.002900194855932e-05,
"loss": 0.2944,
"step": 2820
},
{
"epoch": 5.654345654345654,
"grad_norm": 0.8007961511611938,
"learning_rate": 7.941373912301189e-05,
"loss": 0.2934,
"step": 2830
},
{
"epoch": 5.674325674325674,
"grad_norm": 0.8405194878578186,
"learning_rate": 7.879928900779456e-05,
"loss": 0.2848,
"step": 2840
},
{
"epoch": 5.694305694305695,
"grad_norm": 0.7828160524368286,
"learning_rate": 7.818567586034577e-05,
"loss": 0.2932,
"step": 2850
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.7869848608970642,
"learning_rate": 7.75729239050619e-05,
"loss": 0.2851,
"step": 2860
},
{
"epoch": 5.734265734265734,
"grad_norm": 0.7781445980072021,
"learning_rate": 7.696105733234098e-05,
"loss": 0.2849,
"step": 2870
},
{
"epoch": 5.754245754245754,
"grad_norm": 0.8406656980514526,
"learning_rate": 7.635010029762756e-05,
"loss": 0.2854,
"step": 2880
},
{
"epoch": 5.7742257742257745,
"grad_norm": 0.7491788864135742,
"learning_rate": 7.574007692045928e-05,
"loss": 0.288,
"step": 2890
},
{
"epoch": 5.794205794205794,
"grad_norm": 0.7962749004364014,
"learning_rate": 7.513101128351454e-05,
"loss": 0.2888,
"step": 2900
},
{
"epoch": 5.814185814185814,
"grad_norm": 0.7898345589637756,
"learning_rate": 7.45229274316618e-05,
"loss": 0.2875,
"step": 2910
},
{
"epoch": 5.834165834165834,
"grad_norm": 0.7886426448822021,
"learning_rate": 7.391584937101033e-05,
"loss": 0.2947,
"step": 2920
},
{
"epoch": 5.854145854145854,
"grad_norm": 0.7488512396812439,
"learning_rate": 7.330980106796246e-05,
"loss": 0.2846,
"step": 2930
},
{
"epoch": 5.874125874125874,
"grad_norm": 0.7348522543907166,
"learning_rate": 7.270480644826749e-05,
"loss": 0.2883,
"step": 2940
},
{
"epoch": 5.894105894105894,
"grad_norm": 0.7618998885154724,
"learning_rate": 7.210088939607708e-05,
"loss": 0.2899,
"step": 2950
},
{
"epoch": 5.9140859140859146,
"grad_norm": 0.78291255235672,
"learning_rate": 7.149807375300239e-05,
"loss": 0.2865,
"step": 2960
},
{
"epoch": 5.934065934065934,
"grad_norm": 0.7446394562721252,
"learning_rate": 7.089638331717284e-05,
"loss": 0.2846,
"step": 2970
},
{
"epoch": 5.954045954045954,
"grad_norm": 0.767301619052887,
"learning_rate": 7.029584184229653e-05,
"loss": 0.2887,
"step": 2980
},
{
"epoch": 5.974025974025974,
"grad_norm": 0.7523135542869568,
"learning_rate": 6.969647303672262e-05,
"loss": 0.2873,
"step": 2990
},
{
"epoch": 5.9940059940059935,
"grad_norm": 0.7532919049263,
"learning_rate": 6.909830056250527e-05,
"loss": 0.2882,
"step": 3000
},
{
"epoch": 6.013986013986014,
"grad_norm": 0.6552711129188538,
"learning_rate": 6.850134803446954e-05,
"loss": 0.2488,
"step": 3010
},
{
"epoch": 6.033966033966034,
"grad_norm": 0.6565443873405457,
"learning_rate": 6.790563901927907e-05,
"loss": 0.2345,
"step": 3020
},
{
"epoch": 6.053946053946054,
"grad_norm": 0.6884881854057312,
"learning_rate": 6.731119703450577e-05,
"loss": 0.233,
"step": 3030
},
{
"epoch": 6.073926073926074,
"grad_norm": 0.6287186741828918,
"learning_rate": 6.671804554770135e-05,
"loss": 0.2356,
"step": 3040
},
{
"epoch": 6.093906093906094,
"grad_norm": 0.754036545753479,
"learning_rate": 6.612620797547087e-05,
"loss": 0.2352,
"step": 3050
},
{
"epoch": 6.113886113886114,
"grad_norm": 0.6492979526519775,
"learning_rate": 6.55357076825483e-05,
"loss": 0.2329,
"step": 3060
},
{
"epoch": 6.1338661338661336,
"grad_norm": 0.6303039789199829,
"learning_rate": 6.494656798087412e-05,
"loss": 0.2339,
"step": 3070
},
{
"epoch": 6.153846153846154,
"grad_norm": 0.6423007845878601,
"learning_rate": 6.435881212867493e-05,
"loss": 0.2377,
"step": 3080
},
{
"epoch": 6.173826173826174,
"grad_norm": 0.6716975569725037,
"learning_rate": 6.377246332954544e-05,
"loss": 0.2365,
"step": 3090
},
{
"epoch": 6.193806193806194,
"grad_norm": 0.6927747130393982,
"learning_rate": 6.318754473153221e-05,
"loss": 0.2346,
"step": 3100
},
{
"epoch": 6.213786213786213,
"grad_norm": 0.6551555395126343,
"learning_rate": 6.260407942621998e-05,
"loss": 0.235,
"step": 3110
},
{
"epoch": 6.233766233766234,
"grad_norm": 0.7131916284561157,
"learning_rate": 6.20220904478199e-05,
"loss": 0.2401,
"step": 3120
},
{
"epoch": 6.253746253746254,
"grad_norm": 0.7002174258232117,
"learning_rate": 6.144160077226036e-05,
"loss": 0.2398,
"step": 3130
},
{
"epoch": 6.273726273726274,
"grad_norm": 0.7129354476928711,
"learning_rate": 6.086263331627976e-05,
"loss": 0.2401,
"step": 3140
},
{
"epoch": 6.293706293706293,
"grad_norm": 0.6942778825759888,
"learning_rate": 6.0285210936521955e-05,
"loss": 0.2391,
"step": 3150
},
{
"epoch": 6.313686313686314,
"grad_norm": 0.7181575298309326,
"learning_rate": 5.9709356428633746e-05,
"loss": 0.2434,
"step": 3160
},
{
"epoch": 6.333666333666334,
"grad_norm": 0.720330536365509,
"learning_rate": 5.913509252636511e-05,
"loss": 0.2352,
"step": 3170
},
{
"epoch": 6.353646353646353,
"grad_norm": 0.6518005728721619,
"learning_rate": 5.856244190067159e-05,
"loss": 0.2377,
"step": 3180
},
{
"epoch": 6.373626373626374,
"grad_norm": 0.6705808639526367,
"learning_rate": 5.799142715881938e-05,
"loss": 0.2416,
"step": 3190
},
{
"epoch": 6.393606393606394,
"grad_norm": 0.7210578322410583,
"learning_rate": 5.7422070843492734e-05,
"loss": 0.2406,
"step": 3200
},
{
"epoch": 6.413586413586414,
"grad_norm": 0.6428204774856567,
"learning_rate": 5.6854395431904094e-05,
"loss": 0.2397,
"step": 3210
},
{
"epoch": 6.433566433566433,
"grad_norm": 0.697733461856842,
"learning_rate": 5.6288423334906735e-05,
"loss": 0.2425,
"step": 3220
},
{
"epoch": 6.453546453546454,
"grad_norm": 0.7867773175239563,
"learning_rate": 5.572417689610987e-05,
"loss": 0.2401,
"step": 3230
},
{
"epoch": 6.473526473526474,
"grad_norm": 0.6750375032424927,
"learning_rate": 5.5161678390996796e-05,
"loss": 0.2396,
"step": 3240
},
{
"epoch": 6.4935064935064934,
"grad_norm": 0.677237868309021,
"learning_rate": 5.4600950026045326e-05,
"loss": 0.2434,
"step": 3250
},
{
"epoch": 6.513486513486513,
"grad_norm": 0.6781632304191589,
"learning_rate": 5.404201393785122e-05,
"loss": 0.2454,
"step": 3260
},
{
"epoch": 6.533466533466534,
"grad_norm": 0.7506418824195862,
"learning_rate": 5.348489219225416e-05,
"loss": 0.2397,
"step": 3270
},
{
"epoch": 6.553446553446554,
"grad_norm": 0.7256707549095154,
"learning_rate": 5.292960678346675e-05,
"loss": 0.2403,
"step": 3280
},
{
"epoch": 6.573426573426573,
"grad_norm": 0.664169430732727,
"learning_rate": 5.237617963320608e-05,
"loss": 0.2392,
"step": 3290
},
{
"epoch": 6.593406593406593,
"grad_norm": 0.7900999188423157,
"learning_rate": 5.182463258982846e-05,
"loss": 0.2426,
"step": 3300
},
{
"epoch": 6.613386613386614,
"grad_norm": 0.7012047171592712,
"learning_rate": 5.127498742746675e-05,
"loss": 0.2429,
"step": 3310
},
{
"epoch": 6.6333666333666335,
"grad_norm": 0.752498984336853,
"learning_rate": 5.072726584517086e-05,
"loss": 0.2425,
"step": 3320
},
{
"epoch": 6.653346653346653,
"grad_norm": 0.7256404161453247,
"learning_rate": 5.018148946605092e-05,
"loss": 0.2381,
"step": 3330
},
{
"epoch": 6.673326673326673,
"grad_norm": 0.6938993334770203,
"learning_rate": 4.9637679836423924e-05,
"loss": 0.2428,
"step": 3340
},
{
"epoch": 6.693306693306694,
"grad_norm": 0.7288166284561157,
"learning_rate": 4.909585842496287e-05,
"loss": 0.2409,
"step": 3350
},
{
"epoch": 6.713286713286713,
"grad_norm": 0.7148503661155701,
"learning_rate": 4.8556046621849346e-05,
"loss": 0.2402,
"step": 3360
},
{
"epoch": 6.733266733266733,
"grad_norm": 0.7477458715438843,
"learning_rate": 4.8018265737929044e-05,
"loss": 0.2394,
"step": 3370
},
{
"epoch": 6.753246753246753,
"grad_norm": 0.7404049634933472,
"learning_rate": 4.748253700387042e-05,
"loss": 0.2422,
"step": 3380
},
{
"epoch": 6.7732267732267735,
"grad_norm": 0.6715726852416992,
"learning_rate": 4.694888156932658e-05,
"loss": 0.2405,
"step": 3390
},
{
"epoch": 6.793206793206793,
"grad_norm": 0.6998412609100342,
"learning_rate": 4.6417320502100316e-05,
"loss": 0.2405,
"step": 3400
},
{
"epoch": 6.813186813186813,
"grad_norm": 0.7061425447463989,
"learning_rate": 4.588787478731242e-05,
"loss": 0.2368,
"step": 3410
},
{
"epoch": 6.833166833166834,
"grad_norm": 0.7432896494865417,
"learning_rate": 4.5360565326573104e-05,
"loss": 0.2399,
"step": 3420
},
{
"epoch": 6.853146853146853,
"grad_norm": 0.7876798510551453,
"learning_rate": 4.483541293715698e-05,
"loss": 0.2395,
"step": 3430
},
{
"epoch": 6.873126873126873,
"grad_norm": 0.7446125149726868,
"learning_rate": 4.431243835118124e-05,
"loss": 0.241,
"step": 3440
},
{
"epoch": 6.893106893106893,
"grad_norm": 0.6832261085510254,
"learning_rate": 4.379166221478697e-05,
"loss": 0.2396,
"step": 3450
},
{
"epoch": 6.913086913086913,
"grad_norm": 0.7039461135864258,
"learning_rate": 4.327310508732437e-05,
"loss": 0.2408,
"step": 3460
},
{
"epoch": 6.933066933066933,
"grad_norm": 0.7428474426269531,
"learning_rate": 4.2756787440540936e-05,
"loss": 0.2407,
"step": 3470
},
{
"epoch": 6.953046953046953,
"grad_norm": 0.7313565015792847,
"learning_rate": 4.224272965777326e-05,
"loss": 0.2406,
"step": 3480
},
{
"epoch": 6.973026973026973,
"grad_norm": 0.7175894975662231,
"learning_rate": 4.173095203314241e-05,
"loss": 0.2409,
"step": 3490
},
{
"epoch": 6.993006993006993,
"grad_norm": 0.6897133588790894,
"learning_rate": 4.12214747707527e-05,
"loss": 0.2389,
"step": 3500
},
{
"epoch": 7.012987012987013,
"grad_norm": 0.5959777235984802,
"learning_rate": 4.071431798389408e-05,
"loss": 0.2184,
"step": 3510
},
{
"epoch": 7.032967032967033,
"grad_norm": 0.7147582173347473,
"learning_rate": 4.020950169424815e-05,
"loss": 0.2087,
"step": 3520
},
{
"epoch": 7.052947052947053,
"grad_norm": 0.6122413873672485,
"learning_rate": 3.9707045831097555e-05,
"loss": 0.2106,
"step": 3530
},
{
"epoch": 7.072927072927073,
"grad_norm": 0.633969783782959,
"learning_rate": 3.920697023053949e-05,
"loss": 0.2099,
"step": 3540
},
{
"epoch": 7.092907092907093,
"grad_norm": 0.6842843890190125,
"learning_rate": 3.8709294634702376e-05,
"loss": 0.2104,
"step": 3550
},
{
"epoch": 7.112887112887113,
"grad_norm": 0.5708280205726624,
"learning_rate": 3.821403869096658e-05,
"loss": 0.2125,
"step": 3560
},
{
"epoch": 7.1328671328671325,
"grad_norm": 0.6579930782318115,
"learning_rate": 3.7721221951188765e-05,
"loss": 0.2107,
"step": 3570
},
{
"epoch": 7.152847152847153,
"grad_norm": 0.5980693101882935,
"learning_rate": 3.7230863870929964e-05,
"loss": 0.2085,
"step": 3580
},
{
"epoch": 7.172827172827173,
"grad_norm": 0.5968551635742188,
"learning_rate": 3.674298380868756e-05,
"loss": 0.209,
"step": 3590
},
{
"epoch": 7.192807192807193,
"grad_norm": 0.6218951940536499,
"learning_rate": 3.6257601025131026e-05,
"loss": 0.2095,
"step": 3600
},
{
"epoch": 7.212787212787212,
"grad_norm": 0.6248393058776855,
"learning_rate": 3.577473468234156e-05,
"loss": 0.2155,
"step": 3610
},
{
"epoch": 7.232767232767233,
"grad_norm": 0.6496105194091797,
"learning_rate": 3.52944038430556e-05,
"loss": 0.2139,
"step": 3620
},
{
"epoch": 7.252747252747253,
"grad_norm": 0.6064103841781616,
"learning_rate": 3.481662746991214e-05,
"loss": 0.2081,
"step": 3630
},
{
"epoch": 7.2727272727272725,
"grad_norm": 0.6504641771316528,
"learning_rate": 3.4341424424704375e-05,
"loss": 0.2111,
"step": 3640
},
{
"epoch": 7.292707292707293,
"grad_norm": 0.6580168604850769,
"learning_rate": 3.386881346763483e-05,
"loss": 0.2123,
"step": 3650
},
{
"epoch": 7.312687312687313,
"grad_norm": 0.5861549973487854,
"learning_rate": 3.339881325657484e-05,
"loss": 0.2084,
"step": 3660
},
{
"epoch": 7.332667332667333,
"grad_norm": 0.6313382387161255,
"learning_rate": 3.2931442346328004e-05,
"loss": 0.2078,
"step": 3670
},
{
"epoch": 7.352647352647352,
"grad_norm": 0.646842896938324,
"learning_rate": 3.246671918789755e-05,
"loss": 0.2135,
"step": 3680
},
{
"epoch": 7.372627372627373,
"grad_norm": 0.6964268088340759,
"learning_rate": 3.200466212775808e-05,
"loss": 0.2126,
"step": 3690
},
{
"epoch": 7.392607392607393,
"grad_norm": 0.6139673590660095,
"learning_rate": 3.154528940713113e-05,
"loss": 0.215,
"step": 3700
},
{
"epoch": 7.4125874125874125,
"grad_norm": 0.6455628871917725,
"learning_rate": 3.108861916126518e-05,
"loss": 0.2114,
"step": 3710
},
{
"epoch": 7.432567432567432,
"grad_norm": 0.6227108240127563,
"learning_rate": 3.063466941871952e-05,
"loss": 0.2114,
"step": 3720
},
{
"epoch": 7.452547452547453,
"grad_norm": 0.5858675837516785,
"learning_rate": 3.018345810065275e-05,
"loss": 0.2107,
"step": 3730
},
{
"epoch": 7.472527472527473,
"grad_norm": 0.6218124628067017,
"learning_rate": 2.9735003020115092e-05,
"loss": 0.2115,
"step": 3740
},
{
"epoch": 7.492507492507492,
"grad_norm": 0.6510396003723145,
"learning_rate": 2.9289321881345254e-05,
"loss": 0.2124,
"step": 3750
},
{
"epoch": 7.512487512487512,
"grad_norm": 0.6465820074081421,
"learning_rate": 2.8846432279071467e-05,
"loss": 0.2132,
"step": 3760
},
{
"epoch": 7.532467532467533,
"grad_norm": 0.7002317905426025,
"learning_rate": 2.840635169781688e-05,
"loss": 0.2129,
"step": 3770
},
{
"epoch": 7.5524475524475525,
"grad_norm": 0.647723913192749,
"learning_rate": 2.7969097511209308e-05,
"loss": 0.2136,
"step": 3780
},
{
"epoch": 7.572427572427572,
"grad_norm": 0.5907153487205505,
"learning_rate": 2.753468698129533e-05,
"loss": 0.2115,
"step": 3790
},
{
"epoch": 7.592407592407593,
"grad_norm": 0.6074231863021851,
"learning_rate": 2.7103137257858868e-05,
"loss": 0.2128,
"step": 3800
},
{
"epoch": 7.612387612387613,
"grad_norm": 0.6356890797615051,
"learning_rate": 2.6674465377744017e-05,
"loss": 0.2108,
"step": 3810
},
{
"epoch": 7.632367632367632,
"grad_norm": 0.6739248633384705,
"learning_rate": 2.624868826418262e-05,
"loss": 0.2129,
"step": 3820
},
{
"epoch": 7.652347652347652,
"grad_norm": 0.6241906881332397,
"learning_rate": 2.582582272612609e-05,
"loss": 0.211,
"step": 3830
},
{
"epoch": 7.672327672327672,
"grad_norm": 0.6532058715820312,
"learning_rate": 2.540588545758179e-05,
"loss": 0.2137,
"step": 3840
},
{
"epoch": 7.6923076923076925,
"grad_norm": 0.7098828554153442,
"learning_rate": 2.4988893036954043e-05,
"loss": 0.2105,
"step": 3850
},
{
"epoch": 7.712287712287712,
"grad_norm": 0.6868453025817871,
"learning_rate": 2.4574861926389615e-05,
"loss": 0.214,
"step": 3860
},
{
"epoch": 7.732267732267732,
"grad_norm": 0.6777834296226501,
"learning_rate": 2.4163808471127812e-05,
"loss": 0.2125,
"step": 3870
},
{
"epoch": 7.752247752247753,
"grad_norm": 0.6967138648033142,
"learning_rate": 2.37557488988552e-05,
"loss": 0.2118,
"step": 3880
},
{
"epoch": 7.772227772227772,
"grad_norm": 0.6641217470169067,
"learning_rate": 2.3350699319065026e-05,
"loss": 0.2134,
"step": 3890
},
{
"epoch": 7.792207792207792,
"grad_norm": 0.6727011799812317,
"learning_rate": 2.2948675722421086e-05,
"loss": 0.217,
"step": 3900
},
{
"epoch": 7.812187812187812,
"grad_norm": 0.6331846117973328,
"learning_rate": 2.254969398012663e-05,
"loss": 0.2127,
"step": 3910
},
{
"epoch": 7.8321678321678325,
"grad_norm": 0.6486308574676514,
"learning_rate": 2.2153769843297667e-05,
"loss": 0.2096,
"step": 3920
},
{
"epoch": 7.852147852147852,
"grad_norm": 0.6658995151519775,
"learning_rate": 2.1760918942341192e-05,
"loss": 0.211,
"step": 3930
},
{
"epoch": 7.872127872127872,
"grad_norm": 0.687493085861206,
"learning_rate": 2.137115678633811e-05,
"loss": 0.2163,
"step": 3940
},
{
"epoch": 7.892107892107892,
"grad_norm": 0.6267641186714172,
"learning_rate": 2.098449876243096e-05,
"loss": 0.2142,
"step": 3950
},
{
"epoch": 7.912087912087912,
"grad_norm": 0.6141098141670227,
"learning_rate": 2.0600960135216462e-05,
"loss": 0.2134,
"step": 3960
},
{
"epoch": 7.932067932067932,
"grad_norm": 0.6436827182769775,
"learning_rate": 2.0220556046142893e-05,
"loss": 0.214,
"step": 3970
},
{
"epoch": 7.952047952047952,
"grad_norm": 0.6543010473251343,
"learning_rate": 1.9843301512912327e-05,
"loss": 0.2126,
"step": 3980
},
{
"epoch": 7.972027972027972,
"grad_norm": 0.6083731651306152,
"learning_rate": 1.946921142888781e-05,
"loss": 0.2135,
"step": 3990
},
{
"epoch": 7.992007992007992,
"grad_norm": 0.6408571600914001,
"learning_rate": 1.9098300562505266e-05,
"loss": 0.2123,
"step": 4000
},
{
"epoch": 8.011988011988011,
"grad_norm": 0.556982159614563,
"learning_rate": 1.8730583556690605e-05,
"loss": 0.2042,
"step": 4010
},
{
"epoch": 8.031968031968033,
"grad_norm": 0.5726343393325806,
"learning_rate": 1.8366074928281607e-05,
"loss": 0.1941,
"step": 4020
},
{
"epoch": 8.051948051948052,
"grad_norm": 0.5825814604759216,
"learning_rate": 1.8004789067454764e-05,
"loss": 0.1976,
"step": 4030
},
{
"epoch": 8.071928071928072,
"grad_norm": 0.569325864315033,
"learning_rate": 1.7646740237157256e-05,
"loss": 0.196,
"step": 4040
},
{
"epoch": 8.091908091908092,
"grad_norm": 0.5917354226112366,
"learning_rate": 1.7291942572543807e-05,
"loss": 0.195,
"step": 4050
},
{
"epoch": 8.111888111888112,
"grad_norm": 0.5817933678627014,
"learning_rate": 1.6940410080418723e-05,
"loss": 0.1971,
"step": 4060
},
{
"epoch": 8.131868131868131,
"grad_norm": 0.6475218534469604,
"learning_rate": 1.6592156638682886e-05,
"loss": 0.197,
"step": 4070
},
{
"epoch": 8.151848151848151,
"grad_norm": 0.6248770356178284,
"learning_rate": 1.6247195995785837e-05,
"loss": 0.1971,
"step": 4080
},
{
"epoch": 8.171828171828173,
"grad_norm": 0.5749895572662354,
"learning_rate": 1.5905541770183096e-05,
"loss": 0.1964,
"step": 4090
},
{
"epoch": 8.191808191808192,
"grad_norm": 0.6148300766944885,
"learning_rate": 1.5567207449798515e-05,
"loss": 0.1966,
"step": 4100
},
{
"epoch": 8.211788211788212,
"grad_norm": 0.6778724789619446,
"learning_rate": 1.5232206391491699e-05,
"loss": 0.1955,
"step": 4110
},
{
"epoch": 8.231768231768232,
"grad_norm": 0.5883269906044006,
"learning_rate": 1.4900551820530828e-05,
"loss": 0.1919,
"step": 4120
},
{
"epoch": 8.251748251748252,
"grad_norm": 0.567950963973999,
"learning_rate": 1.4572256830070497e-05,
"loss": 0.1966,
"step": 4130
},
{
"epoch": 8.271728271728271,
"grad_norm": 0.5733300447463989,
"learning_rate": 1.4247334380634792e-05,
"loss": 0.1964,
"step": 4140
},
{
"epoch": 8.291708291708291,
"grad_norm": 0.638990044593811,
"learning_rate": 1.3925797299605647e-05,
"loss": 0.1944,
"step": 4150
},
{
"epoch": 8.311688311688311,
"grad_norm": 0.6272343397140503,
"learning_rate": 1.3607658280716473e-05,
"loss": 0.1951,
"step": 4160
},
{
"epoch": 8.331668331668332,
"grad_norm": 0.5631300210952759,
"learning_rate": 1.3292929883550998e-05,
"loss": 0.1983,
"step": 4170
},
{
"epoch": 8.351648351648352,
"grad_norm": 0.6056917309761047,
"learning_rate": 1.2981624533047432e-05,
"loss": 0.1976,
"step": 4180
},
{
"epoch": 8.371628371628372,
"grad_norm": 0.6021771430969238,
"learning_rate": 1.2673754519008008e-05,
"loss": 0.1968,
"step": 4190
},
{
"epoch": 8.391608391608392,
"grad_norm": 0.5835386514663696,
"learning_rate": 1.2369331995613665e-05,
"loss": 0.1977,
"step": 4200
},
{
"epoch": 8.411588411588411,
"grad_norm": 0.5700567960739136,
"learning_rate": 1.206836898094439e-05,
"loss": 0.1992,
"step": 4210
},
{
"epoch": 8.431568431568431,
"grad_norm": 0.6391722559928894,
"learning_rate": 1.1770877356504683e-05,
"loss": 0.1977,
"step": 4220
},
{
"epoch": 8.451548451548451,
"grad_norm": 0.5633198022842407,
"learning_rate": 1.1476868866754486e-05,
"loss": 0.1975,
"step": 4230
},
{
"epoch": 8.471528471528472,
"grad_norm": 0.6308007836341858,
"learning_rate": 1.1186355118645554e-05,
"loss": 0.2002,
"step": 4240
},
{
"epoch": 8.491508491508492,
"grad_norm": 0.6147842407226562,
"learning_rate": 1.0899347581163221e-05,
"loss": 0.199,
"step": 4250
},
{
"epoch": 8.511488511488512,
"grad_norm": 0.6099655628204346,
"learning_rate": 1.0615857584873623e-05,
"loss": 0.1971,
"step": 4260
},
{
"epoch": 8.531468531468532,
"grad_norm": 0.6306450366973877,
"learning_rate": 1.0335896321476413e-05,
"loss": 0.1971,
"step": 4270
},
{
"epoch": 8.551448551448551,
"grad_norm": 0.5740554928779602,
"learning_rate": 1.0059474843362892e-05,
"loss": 0.1964,
"step": 4280
},
{
"epoch": 8.571428571428571,
"grad_norm": 0.566005289554596,
"learning_rate": 9.786604063179728e-06,
"loss": 0.197,
"step": 4290
},
{
"epoch": 8.591408591408591,
"grad_norm": 0.6008467674255371,
"learning_rate": 9.517294753398064e-06,
"loss": 0.1969,
"step": 4300
},
{
"epoch": 8.61138861138861,
"grad_norm": 0.5880402326583862,
"learning_rate": 9.251557545888312e-06,
"loss": 0.1944,
"step": 4310
},
{
"epoch": 8.631368631368632,
"grad_norm": 0.6250616908073425,
"learning_rate": 8.989402931500434e-06,
"loss": 0.1978,
"step": 4320
},
{
"epoch": 8.651348651348652,
"grad_norm": 0.554460883140564,
"learning_rate": 8.730841259649725e-06,
"loss": 0.1998,
"step": 4330
},
{
"epoch": 8.671328671328672,
"grad_norm": 0.5680242776870728,
"learning_rate": 8.475882737908248e-06,
"loss": 0.2,
"step": 4340
},
{
"epoch": 8.691308691308691,
"grad_norm": 0.5889159440994263,
"learning_rate": 8.224537431601886e-06,
"loss": 0.1985,
"step": 4350
},
{
"epoch": 8.711288711288711,
"grad_norm": 0.6051207780838013,
"learning_rate": 7.976815263412963e-06,
"loss": 0.1944,
"step": 4360
},
{
"epoch": 8.731268731268731,
"grad_norm": 0.6148102283477783,
"learning_rate": 7.73272601298851e-06,
"loss": 0.1952,
"step": 4370
},
{
"epoch": 8.75124875124875,
"grad_norm": 0.6123753786087036,
"learning_rate": 7.492279316554207e-06,
"loss": 0.1955,
"step": 4380
},
{
"epoch": 8.77122877122877,
"grad_norm": 0.5911871790885925,
"learning_rate": 7.255484666533874e-06,
"loss": 0.1987,
"step": 4390
},
{
"epoch": 8.791208791208792,
"grad_norm": 0.5861064195632935,
"learning_rate": 7.022351411174866e-06,
"loss": 0.1972,
"step": 4400
},
{
"epoch": 8.811188811188812,
"grad_norm": 0.6565813422203064,
"learning_rate": 6.7928887541789055e-06,
"loss": 0.1966,
"step": 4410
},
{
"epoch": 8.831168831168831,
"grad_norm": 0.6338573694229126,
"learning_rate": 6.5671057543387985e-06,
"loss": 0.1987,
"step": 4420
},
{
"epoch": 8.851148851148851,
"grad_norm": 0.5672295093536377,
"learning_rate": 6.345011325180772e-06,
"loss": 0.198,
"step": 4430
},
{
"epoch": 8.871128871128871,
"grad_norm": 0.6036155223846436,
"learning_rate": 6.126614234612593e-06,
"loss": 0.199,
"step": 4440
},
{
"epoch": 8.89110889110889,
"grad_norm": 0.5816395878791809,
"learning_rate": 5.911923104577455e-06,
"loss": 0.1985,
"step": 4450
},
{
"epoch": 8.91108891108891,
"grad_norm": 0.5562584400177002,
"learning_rate": 5.700946410713548e-06,
"loss": 0.1964,
"step": 4460
},
{
"epoch": 8.931068931068932,
"grad_norm": 0.6179762482643127,
"learning_rate": 5.49369248201953e-06,
"loss": 0.1948,
"step": 4470
},
{
"epoch": 8.951048951048952,
"grad_norm": 0.5566456317901611,
"learning_rate": 5.290169500525577e-06,
"loss": 0.1958,
"step": 4480
},
{
"epoch": 8.971028971028971,
"grad_norm": 0.6196462512016296,
"learning_rate": 5.0903855009705514e-06,
"loss": 0.1978,
"step": 4490
},
{
"epoch": 8.991008991008991,
"grad_norm": 0.5933112502098083,
"learning_rate": 4.8943483704846475e-06,
"loss": 0.1962,
"step": 4500
},
{
"epoch": 9.010989010989011,
"grad_norm": 0.5680419206619263,
"learning_rate": 4.702065848278126e-06,
"loss": 0.1948,
"step": 4510
},
{
"epoch": 9.03096903096903,
"grad_norm": 0.5447672605514526,
"learning_rate": 4.513545525335705e-06,
"loss": 0.1894,
"step": 4520
},
{
"epoch": 9.05094905094905,
"grad_norm": 0.5605758428573608,
"learning_rate": 4.328794844116946e-06,
"loss": 0.1903,
"step": 4530
},
{
"epoch": 9.07092907092907,
"grad_norm": 0.5727641582489014,
"learning_rate": 4.147821098262405e-06,
"loss": 0.1899,
"step": 4540
},
{
"epoch": 9.090909090909092,
"grad_norm": 0.5076532363891602,
"learning_rate": 3.970631432305694e-06,
"loss": 0.1872,
"step": 4550
},
{
"epoch": 9.110889110889111,
"grad_norm": 0.5827686190605164,
"learning_rate": 3.797232841391407e-06,
"loss": 0.1871,
"step": 4560
},
{
"epoch": 9.130869130869131,
"grad_norm": 0.5457426905632019,
"learning_rate": 3.627632170999029e-06,
"loss": 0.1903,
"step": 4570
},
{
"epoch": 9.150849150849151,
"grad_norm": 0.5931391716003418,
"learning_rate": 3.461836116672612e-06,
"loss": 0.1935,
"step": 4580
},
{
"epoch": 9.17082917082917,
"grad_norm": 0.5335982441902161,
"learning_rate": 3.2998512237565005e-06,
"loss": 0.188,
"step": 4590
},
{
"epoch": 9.19080919080919,
"grad_norm": 0.5809586048126221,
"learning_rate": 3.1416838871368924e-06,
"loss": 0.1882,
"step": 4600
},
{
"epoch": 9.21078921078921,
"grad_norm": 0.5997488498687744,
"learning_rate": 2.9873403509894203e-06,
"loss": 0.189,
"step": 4610
},
{
"epoch": 9.23076923076923,
"grad_norm": 0.5423487424850464,
"learning_rate": 2.836826708532603e-06,
"loss": 0.1916,
"step": 4620
},
{
"epoch": 9.250749250749251,
"grad_norm": 0.5920736193656921,
"learning_rate": 2.690148901787337e-06,
"loss": 0.1914,
"step": 4630
},
{
"epoch": 9.270729270729271,
"grad_norm": 0.5774621367454529,
"learning_rate": 2.5473127213422763e-06,
"loss": 0.1901,
"step": 4640
},
{
"epoch": 9.290709290709291,
"grad_norm": 0.6183256506919861,
"learning_rate": 2.4083238061252567e-06,
"loss": 0.1918,
"step": 4650
},
{
"epoch": 9.31068931068931,
"grad_norm": 0.5502414107322693,
"learning_rate": 2.273187643180652e-06,
"loss": 0.1888,
"step": 4660
},
{
"epoch": 9.33066933066933,
"grad_norm": 0.5888564586639404,
"learning_rate": 2.141909567452793e-06,
"loss": 0.189,
"step": 4670
},
{
"epoch": 9.35064935064935,
"grad_norm": 0.582281231880188,
"learning_rate": 2.014494761575314e-06,
"loss": 0.188,
"step": 4680
},
{
"epoch": 9.37062937062937,
"grad_norm": 0.549766480922699,
"learning_rate": 1.8909482556666024e-06,
"loss": 0.1911,
"step": 4690
},
{
"epoch": 9.390609390609391,
"grad_norm": 0.6442523002624512,
"learning_rate": 1.771274927131139e-06,
"loss": 0.1913,
"step": 4700
},
{
"epoch": 9.410589410589411,
"grad_norm": 0.5612021684646606,
"learning_rate": 1.6554795004670388e-06,
"loss": 0.1926,
"step": 4710
},
{
"epoch": 9.430569430569431,
"grad_norm": 0.6060473918914795,
"learning_rate": 1.543566547079467e-06,
"loss": 0.19,
"step": 4720
},
{
"epoch": 9.45054945054945,
"grad_norm": 0.5958064794540405,
"learning_rate": 1.4355404851001952e-06,
"loss": 0.1885,
"step": 4730
},
{
"epoch": 9.47052947052947,
"grad_norm": 0.536431610584259,
"learning_rate": 1.3314055792131964e-06,
"loss": 0.1891,
"step": 4740
},
{
"epoch": 9.49050949050949,
"grad_norm": 0.5971366763114929,
"learning_rate": 1.231165940486234e-06,
"loss": 0.1889,
"step": 4750
},
{
"epoch": 9.51048951048951,
"grad_norm": 0.5461220145225525,
"learning_rate": 1.134825526208605e-06,
"loss": 0.1874,
"step": 4760
},
{
"epoch": 9.53046953046953,
"grad_norm": 0.570928156375885,
"learning_rate": 1.0423881397349068e-06,
"loss": 0.1884,
"step": 4770
},
{
"epoch": 9.550449550449551,
"grad_norm": 0.5855159759521484,
"learning_rate": 9.538574303348813e-07,
"loss": 0.1895,
"step": 4780
},
{
"epoch": 9.570429570429571,
"grad_norm": 0.5505802631378174,
"learning_rate": 8.692368930493521e-07,
"loss": 0.1904,
"step": 4790
},
{
"epoch": 9.59040959040959,
"grad_norm": 0.5663396716117859,
"learning_rate": 7.885298685522235e-07,
"loss": 0.1909,
"step": 4800
},
{
"epoch": 9.61038961038961,
"grad_norm": 0.6069871783256531,
"learning_rate": 7.117395430186414e-07,
"loss": 0.1895,
"step": 4810
},
{
"epoch": 9.63036963036963,
"grad_norm": 0.5576395988464355,
"learning_rate": 6.388689479991605e-07,
"loss": 0.1906,
"step": 4820
},
{
"epoch": 9.65034965034965,
"grad_norm": 0.5069971084594727,
"learning_rate": 5.699209603001076e-07,
"loss": 0.1889,
"step": 4830
},
{
"epoch": 9.67032967032967,
"grad_norm": 0.5770872235298157,
"learning_rate": 5.048983018699827e-07,
"loss": 0.1907,
"step": 4840
},
{
"epoch": 9.69030969030969,
"grad_norm": 0.6914857029914856,
"learning_rate": 4.438035396920004e-07,
"loss": 0.1939,
"step": 4850
},
{
"epoch": 9.710289710289711,
"grad_norm": 0.5999007821083069,
"learning_rate": 3.866390856827495e-07,
"loss": 0.1924,
"step": 4860
},
{
"epoch": 9.73026973026973,
"grad_norm": 0.569180965423584,
"learning_rate": 3.3340719659701313e-07,
"loss": 0.1887,
"step": 4870
},
{
"epoch": 9.75024975024975,
"grad_norm": 0.5442143082618713,
"learning_rate": 2.841099739386066e-07,
"loss": 0.1897,
"step": 4880
},
{
"epoch": 9.77022977022977,
"grad_norm": 0.5622804164886475,
"learning_rate": 2.387493638774774e-07,
"loss": 0.1898,
"step": 4890
},
{
"epoch": 9.79020979020979,
"grad_norm": 0.6558981537818909,
"learning_rate": 1.973271571728441e-07,
"loss": 0.1917,
"step": 4900
},
{
"epoch": 9.81018981018981,
"grad_norm": 0.5756235122680664,
"learning_rate": 1.598449891024978e-07,
"loss": 0.192,
"step": 4910
},
{
"epoch": 9.83016983016983,
"grad_norm": 0.5818027257919312,
"learning_rate": 1.2630433939825327e-07,
"loss": 0.1899,
"step": 4920
},
{
"epoch": 9.850149850149851,
"grad_norm": 0.5986452698707581,
"learning_rate": 9.670653218752934e-08,
"loss": 0.1918,
"step": 4930
},
{
"epoch": 9.87012987012987,
"grad_norm": 0.5438185334205627,
"learning_rate": 7.105273594107953e-08,
"loss": 0.1905,
"step": 4940
},
{
"epoch": 9.89010989010989,
"grad_norm": 0.5430960059165955,
"learning_rate": 4.934396342684e-08,
"loss": 0.1913,
"step": 4950
},
{
"epoch": 9.91008991008991,
"grad_norm": 0.5492510199546814,
"learning_rate": 3.1581071670006015e-08,
"loss": 0.1904,
"step": 4960
},
{
"epoch": 9.93006993006993,
"grad_norm": 0.5370259881019592,
"learning_rate": 1.7764761919103477e-08,
"loss": 0.1901,
"step": 4970
},
{
"epoch": 9.95004995004995,
"grad_norm": 0.5463282465934753,
"learning_rate": 7.895579618388827e-09,
"loss": 0.191,
"step": 4980
},
{
"epoch": 9.97002997002997,
"grad_norm": 0.5733128190040588,
"learning_rate": 1.973914386288467e-09,
"loss": 0.1885,
"step": 4990
},
{
"epoch": 9.99000999000999,
"grad_norm": 0.5241893529891968,
"learning_rate": 0.0,
"loss": 0.1916,
"step": 5000
},
{
"epoch": 9.99000999000999,
"step": 5000,
"total_flos": 7.37720834306605e+17,
"train_loss": 0.4096551623106003,
"train_runtime": 80947.5605,
"train_samples_per_second": 0.742,
"train_steps_per_second": 0.062
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 7.37720834306605e+17,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}