{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1856,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005387931034482759,
"grad_norm": 24.06527582915772,
"learning_rate": 5.376344086021506e-08,
"loss": 1.3568,
"step": 1
},
{
"epoch": 0.0026939655172413795,
"grad_norm": 23.2847675267083,
"learning_rate": 2.688172043010753e-07,
"loss": 1.3668,
"step": 5
},
{
"epoch": 0.005387931034482759,
"grad_norm": 16.195930738756566,
"learning_rate": 5.376344086021506e-07,
"loss": 1.3204,
"step": 10
},
{
"epoch": 0.008081896551724138,
"grad_norm": 12.068298869370592,
"learning_rate": 8.064516129032258e-07,
"loss": 1.153,
"step": 15
},
{
"epoch": 0.010775862068965518,
"grad_norm": 8.564123494535863,
"learning_rate": 1.0752688172043011e-06,
"loss": 1.0452,
"step": 20
},
{
"epoch": 0.013469827586206896,
"grad_norm": 3.533789309391932,
"learning_rate": 1.3440860215053765e-06,
"loss": 0.9515,
"step": 25
},
{
"epoch": 0.016163793103448277,
"grad_norm": 3.24461197562523,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.9001,
"step": 30
},
{
"epoch": 0.018857758620689655,
"grad_norm": 2.990611660406535,
"learning_rate": 1.881720430107527e-06,
"loss": 0.8773,
"step": 35
},
{
"epoch": 0.021551724137931036,
"grad_norm": 3.0063853939062346,
"learning_rate": 2.1505376344086023e-06,
"loss": 0.851,
"step": 40
},
{
"epoch": 0.024245689655172414,
"grad_norm": 2.956366561006899,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.8574,
"step": 45
},
{
"epoch": 0.02693965517241379,
"grad_norm": 2.983398789032246,
"learning_rate": 2.688172043010753e-06,
"loss": 0.84,
"step": 50
},
{
"epoch": 0.029633620689655173,
"grad_norm": 2.964731632227324,
"learning_rate": 2.9569892473118283e-06,
"loss": 0.824,
"step": 55
},
{
"epoch": 0.032327586206896554,
"grad_norm": 2.9208803498660623,
"learning_rate": 3.225806451612903e-06,
"loss": 0.8138,
"step": 60
},
{
"epoch": 0.03502155172413793,
"grad_norm": 3.2063303145455366,
"learning_rate": 3.494623655913979e-06,
"loss": 0.8009,
"step": 65
},
{
"epoch": 0.03771551724137931,
"grad_norm": 3.242653708652505,
"learning_rate": 3.763440860215054e-06,
"loss": 0.792,
"step": 70
},
{
"epoch": 0.04040948275862069,
"grad_norm": 3.1462448663803846,
"learning_rate": 4.032258064516129e-06,
"loss": 0.7902,
"step": 75
},
{
"epoch": 0.04310344827586207,
"grad_norm": 3.0229975986392716,
"learning_rate": 4.3010752688172045e-06,
"loss": 0.7699,
"step": 80
},
{
"epoch": 0.045797413793103446,
"grad_norm": 3.12423094671722,
"learning_rate": 4.56989247311828e-06,
"loss": 0.7644,
"step": 85
},
{
"epoch": 0.04849137931034483,
"grad_norm": 3.2796596768473902,
"learning_rate": 4.838709677419355e-06,
"loss": 0.7712,
"step": 90
},
{
"epoch": 0.05118534482758621,
"grad_norm": 3.0184242042359943,
"learning_rate": 5.1075268817204305e-06,
"loss": 0.7546,
"step": 95
},
{
"epoch": 0.05387931034482758,
"grad_norm": 3.0881392753326447,
"learning_rate": 5.376344086021506e-06,
"loss": 0.7487,
"step": 100
},
{
"epoch": 0.056573275862068964,
"grad_norm": 3.4110841994799657,
"learning_rate": 5.645161290322582e-06,
"loss": 0.7496,
"step": 105
},
{
"epoch": 0.059267241379310345,
"grad_norm": 2.92733810047956,
"learning_rate": 5.9139784946236566e-06,
"loss": 0.7368,
"step": 110
},
{
"epoch": 0.06196120689655173,
"grad_norm": 3.3139008810992046,
"learning_rate": 6.182795698924732e-06,
"loss": 0.7277,
"step": 115
},
{
"epoch": 0.06465517241379311,
"grad_norm": 3.1747479144288455,
"learning_rate": 6.451612903225806e-06,
"loss": 0.7283,
"step": 120
},
{
"epoch": 0.06734913793103449,
"grad_norm": 2.894519107469561,
"learning_rate": 6.720430107526882e-06,
"loss": 0.7282,
"step": 125
},
{
"epoch": 0.07004310344827586,
"grad_norm": 2.8405180587913987,
"learning_rate": 6.989247311827958e-06,
"loss": 0.7123,
"step": 130
},
{
"epoch": 0.07273706896551724,
"grad_norm": 2.7948188759602717,
"learning_rate": 7.258064516129033e-06,
"loss": 0.7193,
"step": 135
},
{
"epoch": 0.07543103448275862,
"grad_norm": 3.154756842274138,
"learning_rate": 7.526881720430108e-06,
"loss": 0.7207,
"step": 140
},
{
"epoch": 0.078125,
"grad_norm": 2.9457108929499207,
"learning_rate": 7.795698924731183e-06,
"loss": 0.7212,
"step": 145
},
{
"epoch": 0.08081896551724138,
"grad_norm": 2.8503644648477517,
"learning_rate": 8.064516129032258e-06,
"loss": 0.72,
"step": 150
},
{
"epoch": 0.08351293103448276,
"grad_norm": 2.949964251276019,
"learning_rate": 8.333333333333334e-06,
"loss": 0.723,
"step": 155
},
{
"epoch": 0.08620689655172414,
"grad_norm": 2.959116036250926,
"learning_rate": 8.602150537634409e-06,
"loss": 0.7158,
"step": 160
},
{
"epoch": 0.08890086206896551,
"grad_norm": 2.7803395603035517,
"learning_rate": 8.870967741935484e-06,
"loss": 0.7067,
"step": 165
},
{
"epoch": 0.09159482758620689,
"grad_norm": 2.8799202670097115,
"learning_rate": 9.13978494623656e-06,
"loss": 0.71,
"step": 170
},
{
"epoch": 0.09428879310344827,
"grad_norm": 2.9537594310040687,
"learning_rate": 9.408602150537635e-06,
"loss": 0.7152,
"step": 175
},
{
"epoch": 0.09698275862068965,
"grad_norm": 2.8628517050727873,
"learning_rate": 9.67741935483871e-06,
"loss": 0.7054,
"step": 180
},
{
"epoch": 0.09967672413793104,
"grad_norm": 2.8896943288351586,
"learning_rate": 9.946236559139786e-06,
"loss": 0.7235,
"step": 185
},
{
"epoch": 0.10237068965517242,
"grad_norm": 2.938518709851193,
"learning_rate": 9.999858445152838e-06,
"loss": 0.7122,
"step": 190
},
{
"epoch": 0.1050646551724138,
"grad_norm": 2.58690085015114,
"learning_rate": 9.999283392323047e-06,
"loss": 0.7061,
"step": 195
},
{
"epoch": 0.10775862068965517,
"grad_norm": 2.763129396160507,
"learning_rate": 9.998266045169356e-06,
"loss": 0.7063,
"step": 200
},
{
"epoch": 0.11045258620689655,
"grad_norm": 2.816275952414151,
"learning_rate": 9.996806493698038e-06,
"loss": 0.7087,
"step": 205
},
{
"epoch": 0.11314655172413793,
"grad_norm": 2.73738463168911,
"learning_rate": 9.994904867037867e-06,
"loss": 0.6986,
"step": 210
},
{
"epoch": 0.11584051724137931,
"grad_norm": 2.810575578616004,
"learning_rate": 9.99256133342869e-06,
"loss": 0.6929,
"step": 215
},
{
"epoch": 0.11853448275862069,
"grad_norm": 2.6652685941669265,
"learning_rate": 9.989776100206547e-06,
"loss": 0.6898,
"step": 220
},
{
"epoch": 0.12122844827586207,
"grad_norm": 2.7660230194471107,
"learning_rate": 9.986549413785323e-06,
"loss": 0.695,
"step": 225
},
{
"epoch": 0.12392241379310345,
"grad_norm": 2.5553942202252466,
"learning_rate": 9.982881559634946e-06,
"loss": 0.7017,
"step": 230
},
{
"epoch": 0.12661637931034483,
"grad_norm": 2.5245345530966192,
"learning_rate": 9.978772862256145e-06,
"loss": 0.6916,
"step": 235
},
{
"epoch": 0.12931034482758622,
"grad_norm": 2.520167957976126,
"learning_rate": 9.97422368515172e-06,
"loss": 0.694,
"step": 240
},
{
"epoch": 0.1320043103448276,
"grad_norm": 2.7125840301494706,
"learning_rate": 9.969234430794395e-06,
"loss": 0.6887,
"step": 245
},
{
"epoch": 0.13469827586206898,
"grad_norm": 2.631424447595556,
"learning_rate": 9.96380554059121e-06,
"loss": 0.685,
"step": 250
},
{
"epoch": 0.13739224137931033,
"grad_norm": 2.555021040773695,
"learning_rate": 9.957937494844472e-06,
"loss": 0.7004,
"step": 255
},
{
"epoch": 0.1400862068965517,
"grad_norm": 2.539978410855113,
"learning_rate": 9.951630812709245e-06,
"loss": 0.6897,
"step": 260
},
{
"epoch": 0.1427801724137931,
"grad_norm": 2.7494174109330842,
"learning_rate": 9.944886052147445e-06,
"loss": 0.6928,
"step": 265
},
{
"epoch": 0.14547413793103448,
"grad_norm": 2.559956756758314,
"learning_rate": 9.937703809878455e-06,
"loss": 0.6813,
"step": 270
},
{
"epoch": 0.14816810344827586,
"grad_norm": 2.525562445581053,
"learning_rate": 9.930084721326342e-06,
"loss": 0.6944,
"step": 275
},
{
"epoch": 0.15086206896551724,
"grad_norm": 2.777619881263396,
"learning_rate": 9.92202946056364e-06,
"loss": 0.6745,
"step": 280
},
{
"epoch": 0.15355603448275862,
"grad_norm": 2.4859789362282076,
"learning_rate": 9.913538740251711e-06,
"loss": 0.6527,
"step": 285
},
{
"epoch": 0.15625,
"grad_norm": 2.4614571056065624,
"learning_rate": 9.904613311577696e-06,
"loss": 0.6673,
"step": 290
},
{
"epoch": 0.15894396551724138,
"grad_norm": 2.503690727361147,
"learning_rate": 9.895253964188056e-06,
"loss": 0.6601,
"step": 295
},
{
"epoch": 0.16163793103448276,
"grad_norm": 2.61491684131174,
"learning_rate": 9.885461526118713e-06,
"loss": 0.6629,
"step": 300
},
{
"epoch": 0.16433189655172414,
"grad_norm": 2.563289578189323,
"learning_rate": 9.875236863721788e-06,
"loss": 0.6834,
"step": 305
},
{
"epoch": 0.16702586206896552,
"grad_norm": 2.542961491155676,
"learning_rate": 9.864580881588958e-06,
"loss": 0.6634,
"step": 310
},
{
"epoch": 0.1697198275862069,
"grad_norm": 2.5998608415854774,
"learning_rate": 9.853494522471423e-06,
"loss": 0.6564,
"step": 315
},
{
"epoch": 0.1724137931034483,
"grad_norm": 2.580998138867243,
"learning_rate": 9.841978767196495e-06,
"loss": 0.6522,
"step": 320
},
{
"epoch": 0.17510775862068967,
"grad_norm": 2.462022076166109,
"learning_rate": 9.830034634580833e-06,
"loss": 0.6575,
"step": 325
},
{
"epoch": 0.17780172413793102,
"grad_norm": 2.641866987114795,
"learning_rate": 9.8176631813403e-06,
"loss": 0.6654,
"step": 330
},
{
"epoch": 0.1804956896551724,
"grad_norm": 2.483224928563204,
"learning_rate": 9.804865501996472e-06,
"loss": 0.6687,
"step": 335
},
{
"epoch": 0.18318965517241378,
"grad_norm": 2.6158710388060755,
"learning_rate": 9.79164272877981e-06,
"loss": 0.6606,
"step": 340
},
{
"epoch": 0.18588362068965517,
"grad_norm": 2.6690109052148396,
"learning_rate": 9.777996031529486e-06,
"loss": 0.6587,
"step": 345
},
{
"epoch": 0.18857758620689655,
"grad_norm": 2.5145797557443403,
"learning_rate": 9.763926617589883e-06,
"loss": 0.6455,
"step": 350
},
{
"epoch": 0.19127155172413793,
"grad_norm": 2.34228188842774,
"learning_rate": 9.749435731703786e-06,
"loss": 0.6467,
"step": 355
},
{
"epoch": 0.1939655172413793,
"grad_norm": 2.518236951767628,
"learning_rate": 9.734524655902253e-06,
"loss": 0.6651,
"step": 360
},
{
"epoch": 0.1966594827586207,
"grad_norm": 2.3327366524820423,
"learning_rate": 9.719194709391191e-06,
"loss": 0.6527,
"step": 365
},
{
"epoch": 0.19935344827586207,
"grad_norm": 2.6721928236725425,
"learning_rate": 9.70344724843465e-06,
"loss": 0.6471,
"step": 370
},
{
"epoch": 0.20204741379310345,
"grad_norm": 2.512497207087126,
"learning_rate": 9.687283666234823e-06,
"loss": 0.6345,
"step": 375
},
{
"epoch": 0.20474137931034483,
"grad_norm": 2.5381248307269106,
"learning_rate": 9.670705392808796e-06,
"loss": 0.6549,
"step": 380
},
{
"epoch": 0.20743534482758622,
"grad_norm": 2.489609282435604,
"learning_rate": 9.653713894862024e-06,
"loss": 0.6287,
"step": 385
},
{
"epoch": 0.2101293103448276,
"grad_norm": 2.4187969624820767,
"learning_rate": 9.63631067565858e-06,
"loss": 0.6372,
"step": 390
},
{
"epoch": 0.21282327586206898,
"grad_norm": 2.378128543534024,
"learning_rate": 9.618497274888147e-06,
"loss": 0.6344,
"step": 395
},
{
"epoch": 0.21551724137931033,
"grad_norm": 2.3554799136699383,
"learning_rate": 9.600275268529809e-06,
"loss": 0.632,
"step": 400
},
{
"epoch": 0.2182112068965517,
"grad_norm": 2.9669359679831437,
"learning_rate": 9.58164626871261e-06,
"loss": 0.6409,
"step": 405
},
{
"epoch": 0.2209051724137931,
"grad_norm": 2.510424077340063,
"learning_rate": 9.562611923572944e-06,
"loss": 0.6316,
"step": 410
},
{
"epoch": 0.22359913793103448,
"grad_norm": 2.5067266793187843,
"learning_rate": 9.543173917108725e-06,
"loss": 0.6337,
"step": 415
},
{
"epoch": 0.22629310344827586,
"grad_norm": 2.4014165442615627,
"learning_rate": 9.523333969030413e-06,
"loss": 0.6285,
"step": 420
},
{
"epoch": 0.22898706896551724,
"grad_norm": 2.5503305669006266,
"learning_rate": 9.503093834608856e-06,
"loss": 0.6297,
"step": 425
},
{
"epoch": 0.23168103448275862,
"grad_norm": 2.683370610867663,
"learning_rate": 9.482455304520013e-06,
"loss": 0.6222,
"step": 430
},
{
"epoch": 0.234375,
"grad_norm": 2.3415254501274156,
"learning_rate": 9.46142020468652e-06,
"loss": 0.6181,
"step": 435
},
{
"epoch": 0.23706896551724138,
"grad_norm": 2.4296203317167513,
"learning_rate": 9.439990396116149e-06,
"loss": 0.6191,
"step": 440
},
{
"epoch": 0.23976293103448276,
"grad_norm": 2.4277540188724833,
"learning_rate": 9.418167774737173e-06,
"loss": 0.6218,
"step": 445
},
{
"epoch": 0.24245689655172414,
"grad_norm": 2.594904022170311,
"learning_rate": 9.395954271230606e-06,
"loss": 0.622,
"step": 450
},
{
"epoch": 0.24515086206896552,
"grad_norm": 2.347098862192039,
"learning_rate": 9.373351850859417e-06,
"loss": 0.6136,
"step": 455
},
{
"epoch": 0.2478448275862069,
"grad_norm": 2.3928008650888204,
"learning_rate": 9.350362513294652e-06,
"loss": 0.6272,
"step": 460
},
{
"epoch": 0.2505387931034483,
"grad_norm": 2.335542398750826,
"learning_rate": 9.326988292438514e-06,
"loss": 0.6245,
"step": 465
},
{
"epoch": 0.25323275862068967,
"grad_norm": 2.3458410101982174,
"learning_rate": 9.30323125624443e-06,
"loss": 0.6176,
"step": 470
},
{
"epoch": 0.25592672413793105,
"grad_norm": 2.5491037378725188,
"learning_rate": 9.279093506534085e-06,
"loss": 0.6039,
"step": 475
},
{
"epoch": 0.25862068965517243,
"grad_norm": 2.35768113596503,
"learning_rate": 9.254577178811482e-06,
"loss": 0.6062,
"step": 480
},
{
"epoch": 0.2613146551724138,
"grad_norm": 2.4427975704018072,
"learning_rate": 9.229684442074005e-06,
"loss": 0.6038,
"step": 485
},
{
"epoch": 0.2640086206896552,
"grad_norm": 2.3518303928123183,
"learning_rate": 9.204417498620521e-06,
"loss": 0.6071,
"step": 490
},
{
"epoch": 0.2667025862068966,
"grad_norm": 2.3978894249163285,
"learning_rate": 9.178778583856552e-06,
"loss": 0.6024,
"step": 495
},
{
"epoch": 0.26939655172413796,
"grad_norm": 2.530047013657598,
"learning_rate": 9.152769966096483e-06,
"loss": 0.6028,
"step": 500
},
{
"epoch": 0.27209051724137934,
"grad_norm": 2.4123317555719708,
"learning_rate": 9.126393946362906e-06,
"loss": 0.6083,
"step": 505
},
{
"epoch": 0.27478448275862066,
"grad_norm": 2.4793056830777753,
"learning_rate": 9.099652858183027e-06,
"loss": 0.6051,
"step": 510
},
{
"epoch": 0.27747844827586204,
"grad_norm": 2.372688897527012,
"learning_rate": 9.072549067382225e-06,
"loss": 0.6157,
"step": 515
},
{
"epoch": 0.2801724137931034,
"grad_norm": 2.380240348074666,
"learning_rate": 9.045084971874738e-06,
"loss": 0.6073,
"step": 520
},
{
"epoch": 0.2828663793103448,
"grad_norm": 2.545807161286919,
"learning_rate": 9.017263001451518e-06,
"loss": 0.5884,
"step": 525
},
{
"epoch": 0.2855603448275862,
"grad_norm": 2.5935659051260824,
"learning_rate": 8.989085617565261e-06,
"loss": 0.5983,
"step": 530
},
{
"epoch": 0.28825431034482757,
"grad_norm": 2.2548884783469836,
"learning_rate": 8.960555313112646e-06,
"loss": 0.5895,
"step": 535
},
{
"epoch": 0.29094827586206895,
"grad_norm": 2.3534621434136533,
"learning_rate": 8.93167461221378e-06,
"loss": 0.5914,
"step": 540
},
{
"epoch": 0.29364224137931033,
"grad_norm": 2.5336260688373495,
"learning_rate": 8.902446069988878e-06,
"loss": 0.5939,
"step": 545
},
{
"epoch": 0.2963362068965517,
"grad_norm": 2.624683890197873,
"learning_rate": 8.87287227233222e-06,
"loss": 0.5836,
"step": 550
},
{
"epoch": 0.2990301724137931,
"grad_norm": 2.3588318708883604,
"learning_rate": 8.842955835683368e-06,
"loss": 0.5786,
"step": 555
},
{
"epoch": 0.3017241379310345,
"grad_norm": 2.501675897313923,
"learning_rate": 8.812699406795683e-06,
"loss": 0.5799,
"step": 560
},
{
"epoch": 0.30441810344827586,
"grad_norm": 2.6078839400922424,
"learning_rate": 8.78210566250216e-06,
"loss": 0.5801,
"step": 565
},
{
"epoch": 0.30711206896551724,
"grad_norm": 2.3496389383543135,
"learning_rate": 8.751177309478618e-06,
"loss": 0.5756,
"step": 570
},
{
"epoch": 0.3098060344827586,
"grad_norm": 2.3002443057548727,
"learning_rate": 8.71991708400422e-06,
"loss": 0.5823,
"step": 575
},
{
"epoch": 0.3125,
"grad_norm": 2.368311996486066,
"learning_rate": 8.688327751719403e-06,
"loss": 0.57,
"step": 580
},
{
"epoch": 0.3151939655172414,
"grad_norm": 2.316476591326147,
"learning_rate": 8.656412107381187e-06,
"loss": 0.572,
"step": 585
},
{
"epoch": 0.31788793103448276,
"grad_norm": 2.648056237571166,
"learning_rate": 8.624172974615926e-06,
"loss": 0.5759,
"step": 590
},
{
"epoch": 0.32058189655172414,
"grad_norm": 2.5273275022283035,
"learning_rate": 8.591613205669494e-06,
"loss": 0.5751,
"step": 595
},
{
"epoch": 0.3232758620689655,
"grad_norm": 2.3674743965920433,
"learning_rate": 8.558735681154944e-06,
"loss": 0.5525,
"step": 600
},
{
"epoch": 0.3259698275862069,
"grad_norm": 2.334754085556647,
"learning_rate": 8.525543309797653e-06,
"loss": 0.5501,
"step": 605
},
{
"epoch": 0.3286637931034483,
"grad_norm": 2.511690588702945,
"learning_rate": 8.492039028177985e-06,
"loss": 0.5703,
"step": 610
},
{
"epoch": 0.33135775862068967,
"grad_norm": 2.41344799771138,
"learning_rate": 8.458225800471492e-06,
"loss": 0.5674,
"step": 615
},
{
"epoch": 0.33405172413793105,
"grad_norm": 2.274991518802859,
"learning_rate": 8.424106618186653e-06,
"loss": 0.568,
"step": 620
},
{
"epoch": 0.33674568965517243,
"grad_norm": 2.2914893865907375,
"learning_rate": 8.389684499900231e-06,
"loss": 0.5578,
"step": 625
},
{
"epoch": 0.3394396551724138,
"grad_norm": 2.2271331744770175,
"learning_rate": 8.354962490990202e-06,
"loss": 0.554,
"step": 630
},
{
"epoch": 0.3421336206896552,
"grad_norm": 2.346436964348071,
"learning_rate": 8.319943663366325e-06,
"loss": 0.5623,
"step": 635
},
{
"epoch": 0.3448275862068966,
"grad_norm": 2.2365182629879707,
"learning_rate": 8.284631115198371e-06,
"loss": 0.5534,
"step": 640
},
{
"epoch": 0.34752155172413796,
"grad_norm": 2.461241222937466,
"learning_rate": 8.24902797064203e-06,
"loss": 0.5564,
"step": 645
},
{
"epoch": 0.35021551724137934,
"grad_norm": 2.442140982131872,
"learning_rate": 8.213137379562486e-06,
"loss": 0.5506,
"step": 650
},
{
"epoch": 0.35290948275862066,
"grad_norm": 2.388325267487531,
"learning_rate": 8.176962517255776e-06,
"loss": 0.5531,
"step": 655
},
{
"epoch": 0.35560344827586204,
"grad_norm": 2.398524248781268,
"learning_rate": 8.140506584167845e-06,
"loss": 0.5415,
"step": 660
},
{
"epoch": 0.3582974137931034,
"grad_norm": 2.566763693618945,
"learning_rate": 8.103772805611403e-06,
"loss": 0.5616,
"step": 665
},
{
"epoch": 0.3609913793103448,
"grad_norm": 2.3106768834034805,
"learning_rate": 8.066764431480584e-06,
"loss": 0.5328,
"step": 670
},
{
"epoch": 0.3636853448275862,
"grad_norm": 2.2940366514378425,
"learning_rate": 8.029484735963409e-06,
"loss": 0.5452,
"step": 675
},
{
"epoch": 0.36637931034482757,
"grad_norm": 2.4096028111246652,
"learning_rate": 7.991937017252127e-06,
"loss": 0.5448,
"step": 680
},
{
"epoch": 0.36907327586206895,
"grad_norm": 2.450510234216877,
"learning_rate": 7.95412459725141e-06,
"loss": 0.5407,
"step": 685
},
{
"epoch": 0.37176724137931033,
"grad_norm": 2.498635611862816,
"learning_rate": 7.916050821284462e-06,
"loss": 0.536,
"step": 690
},
{
"epoch": 0.3744612068965517,
"grad_norm": 2.3384557737181306,
"learning_rate": 7.877719057797055e-06,
"loss": 0.5404,
"step": 695
},
{
"epoch": 0.3771551724137931,
"grad_norm": 2.395634299723523,
"learning_rate": 7.839132698059515e-06,
"loss": 0.5469,
"step": 700
},
{
"epoch": 0.3798491379310345,
"grad_norm": 2.528299315994187,
"learning_rate": 7.800295155866688e-06,
"loss": 0.5272,
"step": 705
},
{
"epoch": 0.38254310344827586,
"grad_norm": 2.383516192036904,
"learning_rate": 7.761209867235924e-06,
"loss": 0.5495,
"step": 710
},
{
"epoch": 0.38523706896551724,
"grad_norm": 2.3221638101603954,
"learning_rate": 7.721880290103082e-06,
"loss": 0.5517,
"step": 715
},
{
"epoch": 0.3879310344827586,
"grad_norm": 2.451275702370551,
"learning_rate": 7.6823099040166e-06,
"loss": 0.5195,
"step": 720
},
{
"epoch": 0.390625,
"grad_norm": 2.469988525493039,
"learning_rate": 7.64250220982966e-06,
"loss": 0.5151,
"step": 725
},
{
"epoch": 0.3933189655172414,
"grad_norm": 2.4698654498618016,
"learning_rate": 7.602460729390455e-06,
"loss": 0.5296,
"step": 730
},
{
"epoch": 0.39601293103448276,
"grad_norm": 2.433689149450146,
"learning_rate": 7.562189005230609e-06,
"loss": 0.5122,
"step": 735
},
{
"epoch": 0.39870689655172414,
"grad_norm": 2.317764828643439,
"learning_rate": 7.521690600251765e-06,
"loss": 0.5389,
"step": 740
},
{
"epoch": 0.4014008620689655,
"grad_norm": 2.3785211168925997,
"learning_rate": 7.480969097410369e-06,
"loss": 0.5342,
"step": 745
},
{
"epoch": 0.4040948275862069,
"grad_norm": 2.352268614869421,
"learning_rate": 7.4400280994006765e-06,
"loss": 0.5222,
"step": 750
},
{
"epoch": 0.4067887931034483,
"grad_norm": 2.3334817294609844,
"learning_rate": 7.398871228336022e-06,
"loss": 0.5148,
"step": 755
},
{
"epoch": 0.40948275862068967,
"grad_norm": 2.2180745679186513,
"learning_rate": 7.357502125428359e-06,
"loss": 0.5269,
"step": 760
},
{
"epoch": 0.41217672413793105,
"grad_norm": 2.4024098190438448,
"learning_rate": 7.315924450666129e-06,
"loss": 0.5252,
"step": 765
},
{
"epoch": 0.41487068965517243,
"grad_norm": 2.4847050155908326,
"learning_rate": 7.274141882490435e-06,
"loss": 0.5215,
"step": 770
},
{
"epoch": 0.4175646551724138,
"grad_norm": 2.3489603723016423,
"learning_rate": 7.23215811746963e-06,
"loss": 0.5331,
"step": 775
},
{
"epoch": 0.4202586206896552,
"grad_norm": 2.3846378852084276,
"learning_rate": 7.189976869972249e-06,
"loss": 0.526,
"step": 780
},
{
"epoch": 0.4229525862068966,
"grad_norm": 2.2721960920466087,
"learning_rate": 7.147601871838419e-06,
"loss": 0.5111,
"step": 785
},
{
"epoch": 0.42564655172413796,
"grad_norm": 2.242972711736404,
"learning_rate": 7.105036872049676e-06,
"loss": 0.5079,
"step": 790
},
{
"epoch": 0.42834051724137934,
"grad_norm": 2.5168627834860944,
"learning_rate": 7.0622856363973e-06,
"loss": 0.5037,
"step": 795
},
{
"epoch": 0.43103448275862066,
"grad_norm": 2.3034024680284797,
"learning_rate": 7.019351947149149e-06,
"loss": 0.5037,
"step": 800
},
{
"epoch": 0.43372844827586204,
"grad_norm": 2.3169182311354204,
"learning_rate": 6.976239602715025e-06,
"loss": 0.5244,
"step": 805
},
{
"epoch": 0.4364224137931034,
"grad_norm": 2.342523099764779,
"learning_rate": 6.932952417310634e-06,
"loss": 0.4955,
"step": 810
},
{
"epoch": 0.4391163793103448,
"grad_norm": 2.4079674615936213,
"learning_rate": 6.889494220620135e-06,
"loss": 0.5039,
"step": 815
},
{
"epoch": 0.4418103448275862,
"grad_norm": 2.2705187143965704,
"learning_rate": 6.8458688574573164e-06,
"loss": 0.4921,
"step": 820
},
{
"epoch": 0.44450431034482757,
"grad_norm": 2.3040634798061053,
"learning_rate": 6.8020801874254425e-06,
"loss": 0.4952,
"step": 825
},
{
"epoch": 0.44719827586206895,
"grad_norm": 2.283780585980132,
"learning_rate": 6.758132084575791e-06,
"loss": 0.5204,
"step": 830
},
{
"epoch": 0.44989224137931033,
"grad_norm": 2.2311658006536175,
"learning_rate": 6.7140284370649015e-06,
"loss": 0.5062,
"step": 835
},
{
"epoch": 0.4525862068965517,
"grad_norm": 2.381000659447914,
"learning_rate": 6.6697731468105985e-06,
"loss": 0.5054,
"step": 840
},
{
"epoch": 0.4552801724137931,
"grad_norm": 2.5645822620698295,
"learning_rate": 6.625370129146771e-06,
"loss": 0.4967,
"step": 845
},
{
"epoch": 0.4579741379310345,
"grad_norm": 2.518018472550615,
"learning_rate": 6.580823312476976e-06,
"loss": 0.5057,
"step": 850
},
{
"epoch": 0.46066810344827586,
"grad_norm": 2.3310109009449937,
"learning_rate": 6.536136637926898e-06,
"loss": 0.4923,
"step": 855
},
{
"epoch": 0.46336206896551724,
"grad_norm": 2.4572949530360235,
"learning_rate": 6.491314058995653e-06,
"loss": 0.4923,
"step": 860
},
{
"epoch": 0.4660560344827586,
"grad_norm": 2.333469399501826,
"learning_rate": 6.446359541206042e-06,
"loss": 0.4984,
"step": 865
},
{
"epoch": 0.46875,
"grad_norm": 2.3170414009513287,
"learning_rate": 6.401277061753689e-06,
"loss": 0.4805,
"step": 870
},
{
"epoch": 0.4714439655172414,
"grad_norm": 2.3105233267502068,
"learning_rate": 6.356070609155188e-06,
"loss": 0.4857,
"step": 875
},
{
"epoch": 0.47413793103448276,
"grad_norm": 2.406900488225167,
"learning_rate": 6.310744182895231e-06,
"loss": 0.474,
"step": 880
},
{
"epoch": 0.47683189655172414,
"grad_norm": 2.3233269304186246,
"learning_rate": 6.265301793072762e-06,
"loss": 0.4947,
"step": 885
},
{
"epoch": 0.4795258620689655,
"grad_norm": 2.336797328678939,
"learning_rate": 6.219747460046203e-06,
"loss": 0.4771,
"step": 890
},
{
"epoch": 0.4822198275862069,
"grad_norm": 2.3058756900360566,
"learning_rate": 6.17408521407776e-06,
"loss": 0.4791,
"step": 895
},
{
"epoch": 0.4849137931034483,
"grad_norm": 2.467884893673803,
"learning_rate": 6.128319094976869e-06,
"loss": 0.492,
"step": 900
},
{
"epoch": 0.48760775862068967,
"grad_norm": 2.3280199883273047,
"learning_rate": 6.0824531517427765e-06,
"loss": 0.4816,
"step": 905
},
{
"epoch": 0.49030172413793105,
"grad_norm": 2.2642826853033053,
"learning_rate": 6.03649144220633e-06,
"loss": 0.4805,
"step": 910
},
{
"epoch": 0.49299568965517243,
"grad_norm": 2.2845546468033007,
"learning_rate": 5.990438032670968e-06,
"loss": 0.4804,
"step": 915
},
{
"epoch": 0.4956896551724138,
"grad_norm": 2.320099011292584,
"learning_rate": 5.944296997552968e-06,
"loss": 0.4807,
"step": 920
},
{
"epoch": 0.4983836206896552,
"grad_norm": 2.4032671750639607,
"learning_rate": 5.898072419020978e-06,
"loss": 0.479,
"step": 925
},
{
"epoch": 0.5010775862068966,
"grad_norm": 2.3454490179654948,
"learning_rate": 5.851768386634863e-06,
"loss": 0.4657,
"step": 930
},
{
"epoch": 0.5037715517241379,
"grad_norm": 2.2272370976346707,
"learning_rate": 5.805388996983891e-06,
"loss": 0.4778,
"step": 935
},
{
"epoch": 0.5064655172413793,
"grad_norm": 2.399429478516486,
"learning_rate": 5.758938353324308e-06,
"loss": 0.4766,
"step": 940
},
{
"epoch": 0.5091594827586207,
"grad_norm": 2.2479225788941726,
"learning_rate": 5.712420565216305e-06,
"loss": 0.4689,
"step": 945
},
{
"epoch": 0.5118534482758621,
"grad_norm": 2.333910684063406,
"learning_rate": 5.66583974816045e-06,
"loss": 0.4689,
"step": 950
},
{
"epoch": 0.5145474137931034,
"grad_norm": 2.494414220923278,
"learning_rate": 5.619200023233582e-06,
"loss": 0.4654,
"step": 955
},
{
"epoch": 0.5172413793103449,
"grad_norm": 2.4303474928270314,
"learning_rate": 5.572505516724207e-06,
"loss": 0.4841,
"step": 960
},
{
"epoch": 0.5199353448275862,
"grad_norm": 2.3290300558522605,
"learning_rate": 5.52576035976744e-06,
"loss": 0.4631,
"step": 965
},
{
"epoch": 0.5226293103448276,
"grad_norm": 2.303763077645539,
"learning_rate": 5.478968687979527e-06,
"loss": 0.4535,
"step": 970
},
{
"epoch": 0.525323275862069,
"grad_norm": 2.3158015015015367,
"learning_rate": 5.432134641091945e-06,
"loss": 0.4653,
"step": 975
},
{
"epoch": 0.5280172413793104,
"grad_norm": 2.412268625727716,
"learning_rate": 5.3852623625851655e-06,
"loss": 0.4553,
"step": 980
},
{
"epoch": 0.5307112068965517,
"grad_norm": 2.4152646593142477,
"learning_rate": 5.338355999322069e-06,
"loss": 0.459,
"step": 985
},
{
"epoch": 0.5334051724137931,
"grad_norm": 2.3009383932051186,
"learning_rate": 5.291419701181069e-06,
"loss": 0.4574,
"step": 990
},
{
"epoch": 0.5360991379310345,
"grad_norm": 2.3404820672273683,
"learning_rate": 5.244457620688962e-06,
"loss": 0.4457,
"step": 995
},
{
"epoch": 0.5387931034482759,
"grad_norm": 2.2918401803413277,
"learning_rate": 5.197473912653549e-06,
"loss": 0.4625,
"step": 1000
},
{
"epoch": 0.5414870689655172,
"grad_norm": 2.330307145203118,
"learning_rate": 5.150472733796053e-06,
"loss": 0.4614,
"step": 1005
},
{
"epoch": 0.5441810344827587,
"grad_norm": 2.317228108453964,
"learning_rate": 5.103458242383371e-06,
"loss": 0.4346,
"step": 1010
},
{
"epoch": 0.546875,
"grad_norm": 2.246449210384358,
"learning_rate": 5.056434597860176e-06,
"loss": 0.4332,
"step": 1015
},
{
"epoch": 0.5495689655172413,
"grad_norm": 2.2315633880832917,
"learning_rate": 5.009405960480937e-06,
"loss": 0.4374,
"step": 1020
},
{
"epoch": 0.5522629310344828,
"grad_norm": 2.236917389881302,
"learning_rate": 4.962376490941846e-06,
"loss": 0.4443,
"step": 1025
},
{
"epoch": 0.5549568965517241,
"grad_norm": 2.2257101057521953,
"learning_rate": 4.915350350012714e-06,
"loss": 0.4485,
"step": 1030
},
{
"epoch": 0.5576508620689655,
"grad_norm": 2.2768475081245696,
"learning_rate": 4.868331698168875e-06,
"loss": 0.456,
"step": 1035
},
{
"epoch": 0.5603448275862069,
"grad_norm": 2.2588873812858243,
"learning_rate": 4.82132469522308e-06,
"loss": 0.4531,
"step": 1040
},
{
"epoch": 0.5630387931034483,
"grad_norm": 2.2517674521156414,
"learning_rate": 4.774333499957488e-06,
"loss": 0.4439,
"step": 1045
},
{
"epoch": 0.5657327586206896,
"grad_norm": 2.3879681903493277,
"learning_rate": 4.727362269755736e-06,
"loss": 0.4507,
"step": 1050
},
{
"epoch": 0.568426724137931,
"grad_norm": 2.2168932530530654,
"learning_rate": 4.68041516023511e-06,
"loss": 0.4436,
"step": 1055
},
{
"epoch": 0.5711206896551724,
"grad_norm": 2.328909950607463,
"learning_rate": 4.633496324878906e-06,
"loss": 0.4408,
"step": 1060
},
{
"epoch": 0.5738146551724138,
"grad_norm": 2.2564887174276183,
"learning_rate": 4.586609914668963e-06,
"loss": 0.4516,
"step": 1065
},
{
"epoch": 0.5765086206896551,
"grad_norm": 2.2979177074885424,
"learning_rate": 4.539760077718416e-06,
"loss": 0.4389,
"step": 1070
},
{
"epoch": 0.5792025862068966,
"grad_norm": 2.2933960847054515,
"learning_rate": 4.492950958904707e-06,
"loss": 0.4266,
"step": 1075
},
{
"epoch": 0.5818965517241379,
"grad_norm": 2.2594325799250594,
"learning_rate": 4.4461866995028776e-06,
"loss": 0.427,
"step": 1080
},
{
"epoch": 0.5845905172413793,
"grad_norm": 2.349659814217747,
"learning_rate": 4.399471436819199e-06,
"loss": 0.4346,
"step": 1085
},
{
"epoch": 0.5872844827586207,
"grad_norm": 2.297930957947952,
"learning_rate": 4.352809303825115e-06,
"loss": 0.4279,
"step": 1090
},
{
"epoch": 0.5899784482758621,
"grad_norm": 2.202712644399629,
"learning_rate": 4.306204428791609e-06,
"loss": 0.4291,
"step": 1095
},
{
"epoch": 0.5926724137931034,
"grad_norm": 2.2128476870439813,
"learning_rate": 4.259660934923965e-06,
"loss": 0.44,
"step": 1100
},
{
"epoch": 0.5953663793103449,
"grad_norm": 2.367627389505961,
"learning_rate": 4.213182939996978e-06,
"loss": 0.4379,
"step": 1105
},
{
"epoch": 0.5980603448275862,
"grad_norm": 2.274117011259563,
"learning_rate": 4.166774555990654e-06,
"loss": 0.4344,
"step": 1110
},
{
"epoch": 0.6007543103448276,
"grad_norm": 2.2261394360036983,
"learning_rate": 4.120439888726407e-06,
"loss": 0.4142,
"step": 1115
},
{
"epoch": 0.603448275862069,
"grad_norm": 2.1852891937100436,
"learning_rate": 4.074183037503827e-06,
"loss": 0.4266,
"step": 1120
},
{
"epoch": 0.6061422413793104,
"grad_norm": 2.3083672939605053,
"learning_rate": 4.028008094737989e-06,
"loss": 0.4394,
"step": 1125
},
{
"epoch": 0.6088362068965517,
"grad_norm": 2.2610041056896963,
"learning_rate": 3.981919145597404e-06,
"loss": 0.4128,
"step": 1130
},
{
"epoch": 0.6115301724137931,
"grad_norm": 2.19751146715402,
"learning_rate": 3.935920267642592e-06,
"loss": 0.4227,
"step": 1135
},
{
"epoch": 0.6142241379310345,
"grad_norm": 2.3415136999781963,
"learning_rate": 3.890015530465342e-06,
"loss": 0.4133,
"step": 1140
},
{
"epoch": 0.6169181034482759,
"grad_norm": 2.291673599344672,
"learning_rate": 3.844208995328659e-06,
"loss": 0.4192,
"step": 1145
},
{
"epoch": 0.6196120689655172,
"grad_norm": 2.2459859353779508,
"learning_rate": 3.7985047148074584e-06,
"loss": 0.4257,
"step": 1150
},
{
"epoch": 0.6223060344827587,
"grad_norm": 2.3753214874892072,
"learning_rate": 3.75290673243004e-06,
"loss": 0.421,
"step": 1155
},
{
"epoch": 0.625,
"grad_norm": 2.181100394703554,
"learning_rate": 3.707419082320336e-06,
"loss": 0.4287,
"step": 1160
},
{
"epoch": 0.6276939655172413,
"grad_norm": 2.242465849693457,
"learning_rate": 3.6620457888410143e-06,
"loss": 0.4143,
"step": 1165
},
{
"epoch": 0.6303879310344828,
"grad_norm": 2.3646959150338813,
"learning_rate": 3.616790866237433e-06,
"loss": 0.4045,
"step": 1170
},
{
"epoch": 0.6330818965517241,
"grad_norm": 2.312802724452316,
"learning_rate": 3.5716583182825023e-06,
"loss": 0.4248,
"step": 1175
},
{
"epoch": 0.6357758620689655,
"grad_norm": 2.208443511882899,
"learning_rate": 3.5266521379224506e-06,
"loss": 0.4135,
"step": 1180
},
{
"epoch": 0.6384698275862069,
"grad_norm": 2.2774985396607046,
"learning_rate": 3.4817763069235747e-06,
"loss": 0.4028,
"step": 1185
},
{
"epoch": 0.6411637931034483,
"grad_norm": 2.3080269121559898,
"learning_rate": 3.4370347955199634e-06,
"loss": 0.4086,
"step": 1190
},
{
"epoch": 0.6438577586206896,
"grad_norm": 2.3130128907712355,
"learning_rate": 3.392431562062238e-06,
"loss": 0.408,
"step": 1195
},
{
"epoch": 0.646551724137931,
"grad_norm": 2.2776700595089676,
"learning_rate": 3.347970552667361e-06,
"loss": 0.4159,
"step": 1200
},
{
"epoch": 0.6492456896551724,
"grad_norm": 2.1524296489308576,
"learning_rate": 3.303655700869507e-06,
"loss": 0.4035,
"step": 1205
},
{
"epoch": 0.6519396551724138,
"grad_norm": 2.2146294105038185,
"learning_rate": 3.259490927272071e-06,
"loss": 0.4012,
"step": 1210
},
{
"epoch": 0.6546336206896551,
"grad_norm": 2.2480654104489752,
"learning_rate": 3.2154801392007883e-06,
"loss": 0.4153,
"step": 1215
},
{
"epoch": 0.6573275862068966,
"grad_norm": 2.169871400965887,
"learning_rate": 3.171627230358063e-06,
"loss": 0.404,
"step": 1220
},
{
"epoch": 0.6600215517241379,
"grad_norm": 2.4015866937415056,
"learning_rate": 3.1279360804784785e-06,
"loss": 0.4063,
"step": 1225
},
{
"epoch": 0.6627155172413793,
"grad_norm": 2.3038799378482557,
"learning_rate": 3.084410554985553e-06,
"loss": 0.3898,
"step": 1230
},
{
"epoch": 0.6654094827586207,
"grad_norm": 2.198625588166285,
"learning_rate": 3.0410545046497553e-06,
"loss": 0.4035,
"step": 1235
},
{
"epoch": 0.6681034482758621,
"grad_norm": 2.1950219963512176,
"learning_rate": 2.9978717652478343e-06,
"loss": 0.3902,
"step": 1240
},
{
"epoch": 0.6707974137931034,
"grad_norm": 2.247458718435766,
"learning_rate": 2.954866157223445e-06,
"loss": 0.4082,
"step": 1245
},
{
"epoch": 0.6734913793103449,
"grad_norm": 2.2241261994844588,
"learning_rate": 2.9120414853491574e-06,
"loss": 0.404,
"step": 1250
},
{
"epoch": 0.6761853448275862,
"grad_norm": 2.1606540598223103,
"learning_rate": 2.86940153838984e-06,
"loss": 0.3948,
"step": 1255
},
{
"epoch": 0.6788793103448276,
"grad_norm": 2.0718054651873437,
"learning_rate": 2.826950088767469e-06,
"loss": 0.3927,
"step": 1260
},
{
"epoch": 0.681573275862069,
"grad_norm": 2.227847088159035,
"learning_rate": 2.784690892227363e-06,
"loss": 0.3903,
"step": 1265
},
{
"epoch": 0.6842672413793104,
"grad_norm": 2.207892303296737,
"learning_rate": 2.7426276875059145e-06,
"loss": 0.3955,
"step": 1270
},
{
"epoch": 0.6869612068965517,
"grad_norm": 2.1465153515114093,
"learning_rate": 2.700764195999819e-06,
"loss": 0.3788,
"step": 1275
},
{
"epoch": 0.6896551724137931,
"grad_norm": 2.223157201107058,
"learning_rate": 2.6591041214368383e-06,
"loss": 0.4053,
"step": 1280
},
{
"epoch": 0.6923491379310345,
"grad_norm": 2.392548147708553,
"learning_rate": 2.6176511495481172e-06,
"loss": 0.3834,
"step": 1285
},
{
"epoch": 0.6950431034482759,
"grad_norm": 2.059476074487736,
"learning_rate": 2.5764089477421067e-06,
"loss": 0.3857,
"step": 1290
},
{
"epoch": 0.6977370689655172,
"grad_norm": 2.157455657651667,
"learning_rate": 2.5353811647801107e-06,
"loss": 0.3884,
"step": 1295
},
{
"epoch": 0.7004310344827587,
"grad_norm": 2.307643086382308,
"learning_rate": 2.4945714304534584e-06,
"loss": 0.3815,
"step": 1300
},
{
"epoch": 0.703125,
"grad_norm": 2.26315069416342,
"learning_rate": 2.453983355262382e-06,
"loss": 0.3865,
"step": 1305
},
{
"epoch": 0.7058189655172413,
"grad_norm": 2.332313222729813,
"learning_rate": 2.413620530096592e-06,
"loss": 0.391,
"step": 1310
},
{
"epoch": 0.7085129310344828,
"grad_norm": 2.1418117590999413,
"learning_rate": 2.373486525917575e-06,
"loss": 0.3912,
"step": 1315
},
{
"epoch": 0.7112068965517241,
"grad_norm": 2.178180423311831,
"learning_rate": 2.333584893442675e-06,
"loss": 0.3854,
"step": 1320
},
{
"epoch": 0.7139008620689655,
"grad_norm": 2.151591142836586,
"learning_rate": 2.2939191628309482e-06,
"loss": 0.3815,
"step": 1325
},
{
"epoch": 0.7165948275862069,
"grad_norm": 2.1488408048158916,
"learning_rate": 2.254492843370857e-06,
"loss": 0.3741,
"step": 1330
},
{
"epoch": 0.7192887931034483,
"grad_norm": 2.3225770656541624,
"learning_rate": 2.2153094231697807e-06,
"loss": 0.3865,
"step": 1335
},
{
"epoch": 0.7219827586206896,
"grad_norm": 2.225461569667121,
"learning_rate": 2.1763723688454297e-06,
"loss": 0.389,
"step": 1340
},
{
"epoch": 0.724676724137931,
"grad_norm": 2.310688191216032,
"learning_rate": 2.1376851252191465e-06,
"loss": 0.3905,
"step": 1345
},
{
"epoch": 0.7273706896551724,
"grad_norm": 2.206817710811153,
"learning_rate": 2.09925111501113e-06,
"loss": 0.3705,
"step": 1350
},
{
"epoch": 0.7300646551724138,
"grad_norm": 2.194541840528301,
"learning_rate": 2.061073738537635e-06,
"loss": 0.38,
"step": 1355
},
{
"epoch": 0.7327586206896551,
"grad_norm": 2.1363777762782568,
"learning_rate": 2.0231563734101245e-06,
"loss": 0.3826,
"step": 1360
},
{
"epoch": 0.7354525862068966,
"grad_norm": 2.043722143372559,
"learning_rate": 1.9855023742364647e-06,
"loss": 0.3722,
"step": 1365
},
{
"epoch": 0.7381465517241379,
"grad_norm": 2.296022903294665,
"learning_rate": 1.9481150723241236e-06,
"loss": 0.3836,
"step": 1370
},
{
"epoch": 0.7408405172413793,
"grad_norm": 2.1320085273295333,
"learning_rate": 1.9109977753854496e-06,
"loss": 0.367,
"step": 1375
},
{
"epoch": 0.7435344827586207,
"grad_norm": 2.126131429150438,
"learning_rate": 1.8741537672450406e-06,
"loss": 0.3756,
"step": 1380
},
{
"epoch": 0.7462284482758621,
"grad_norm": 2.3054341669665708,
"learning_rate": 1.8375863075492062e-06,
"loss": 0.3737,
"step": 1385
},
{
"epoch": 0.7489224137931034,
"grad_norm": 2.3340813640902867,
"learning_rate": 1.8012986314775888e-06,
"loss": 0.3694,
"step": 1390
},
{
"epoch": 0.7516163793103449,
"grad_norm": 2.1335614766566544,
"learning_rate": 1.7652939494569428e-06,
"loss": 0.3706,
"step": 1395
},
{
"epoch": 0.7543103448275862,
"grad_norm": 2.135867482259856,
"learning_rate": 1.7295754468771026e-06,
"loss": 0.3826,
"step": 1400
},
{
"epoch": 0.7570043103448276,
"grad_norm": 2.253239028561062,
"learning_rate": 1.6941462838091643e-06,
"loss": 0.3879,
"step": 1405
},
{
"epoch": 0.759698275862069,
"grad_norm": 2.1899554008641613,
"learning_rate": 1.6590095947259083e-06,
"loss": 0.3657,
"step": 1410
},
{
"epoch": 0.7623922413793104,
"grad_norm": 1.9335639886365577,
"learning_rate": 1.6241684882244952e-06,
"loss": 0.3647,
"step": 1415
},
{
"epoch": 0.7650862068965517,
"grad_norm": 2.158271364922754,
"learning_rate": 1.5896260467514335e-06,
"loss": 0.3613,
"step": 1420
},
{
"epoch": 0.7677801724137931,
"grad_norm": 2.283426548356461,
"learning_rate": 1.5553853263298741e-06,
"loss": 0.3804,
"step": 1425
},
{
"epoch": 0.7704741379310345,
"grad_norm": 1.973245710047114,
"learning_rate": 1.521449356289245e-06,
"loss": 0.3616,
"step": 1430
},
{
"epoch": 0.7731681034482759,
"grad_norm": 2.176003470736959,
"learning_rate": 1.4878211389972369e-06,
"loss": 0.3594,
"step": 1435
},
{
"epoch": 0.7758620689655172,
"grad_norm": 2.350333157030792,
"learning_rate": 1.454503649594176e-06,
"loss": 0.3745,
"step": 1440
},
{
"epoch": 0.7785560344827587,
"grad_norm": 2.1046600168472254,
"learning_rate": 1.421499835729812e-06,
"loss": 0.3614,
"step": 1445
},
{
"epoch": 0.78125,
"grad_norm": 2.2403959550973376,
"learning_rate": 1.3888126173025412e-06,
"loss": 0.3667,
"step": 1450
},
{
"epoch": 0.7839439655172413,
"grad_norm": 2.2036204076799244,
"learning_rate": 1.3564448862010653e-06,
"loss": 0.3719,
"step": 1455
},
{
"epoch": 0.7866379310344828,
"grad_norm": 2.1004023468667223,
"learning_rate": 1.3243995060485537e-06,
"loss": 0.3609,
"step": 1460
},
{
"epoch": 0.7893318965517241,
"grad_norm": 2.049485866619644,
"learning_rate": 1.2926793119492848e-06,
"loss": 0.3562,
"step": 1465
},
{
"epoch": 0.7920258620689655,
"grad_norm": 2.2562907662057015,
"learning_rate": 1.2612871102378305e-06,
"loss": 0.3638,
"step": 1470
},
{
"epoch": 0.7947198275862069,
"grad_norm": 2.0015131375954045,
"learning_rate": 1.230225678230766e-06,
"loss": 0.3523,
"step": 1475
},
{
"epoch": 0.7974137931034483,
"grad_norm": 1.9761111123797053,
"learning_rate": 1.1994977639809575e-06,
"loss": 0.3605,
"step": 1480
},
{
"epoch": 0.8001077586206896,
"grad_norm": 2.1818297029398916,
"learning_rate": 1.169106086034446e-06,
"loss": 0.369,
"step": 1485
},
{
"epoch": 0.802801724137931,
"grad_norm": 2.2176123875649782,
"learning_rate": 1.1390533331899235e-06,
"loss": 0.359,
"step": 1490
},
{
"epoch": 0.8054956896551724,
"grad_norm": 2.1415950875401952,
"learning_rate": 1.109342164260853e-06,
"loss": 0.365,
"step": 1495
},
{
"epoch": 0.8081896551724138,
"grad_norm": 1.9579230862394106,
"learning_rate": 1.079975207840247e-06,
"loss": 0.3475,
"step": 1500
},
{
"epoch": 0.8108836206896551,
"grad_norm": 1.9891326864430916,
"learning_rate": 1.050955062068098e-06,
"loss": 0.3636,
"step": 1505
},
{
"epoch": 0.8135775862068966,
"grad_norm": 2.1589113372475826,
"learning_rate": 1.0222842944015327e-06,
"loss": 0.3637,
"step": 1510
},
{
"epoch": 0.8162715517241379,
"grad_norm": 2.2093770653678817,
"learning_rate": 9.939654413876493e-07,
"loss": 0.3704,
"step": 1515
},
{
"epoch": 0.8189655172413793,
"grad_norm": 2.117779906161616,
"learning_rate": 9.660010084391197e-07,
"loss": 0.3549,
"step": 1520
},
{
"epoch": 0.8216594827586207,
"grad_norm": 2.2081164429406623,
"learning_rate": 9.383934696125213e-07,
"loss": 0.3637,
"step": 1525
},
{
"epoch": 0.8243534482758621,
"grad_norm": 2.0797066327192915,
"learning_rate": 9.111452673894589e-07,
"loss": 0.355,
"step": 1530
},
{
"epoch": 0.8270474137931034,
"grad_norm": 1.9884207565802496,
"learning_rate": 8.842588124604695e-07,
"loss": 0.3598,
"step": 1535
},
{
"epoch": 0.8297413793103449,
"grad_norm": 1.9966503677289194,
"learning_rate": 8.577364835117552e-07,
"loss": 0.3503,
"step": 1540
},
{
"epoch": 0.8324353448275862,
"grad_norm": 2.0974426601893006,
"learning_rate": 8.315806270147237e-07,
"loss": 0.3513,
"step": 1545
},
{
"epoch": 0.8351293103448276,
"grad_norm": 2.0409953572157264,
"learning_rate": 8.057935570184e-07,
"loss": 0.353,
"step": 1550
},
{
"epoch": 0.837823275862069,
"grad_norm": 2.05994767546201,
"learning_rate": 7.803775549447017e-07,
"loss": 0.3612,
"step": 1555
},
{
"epoch": 0.8405172413793104,
"grad_norm": 1.9798689534701572,
"learning_rate": 7.553348693865897e-07,
"loss": 0.3433,
"step": 1560
},
{
"epoch": 0.8432112068965517,
"grad_norm": 2.0314728151818557,
"learning_rate": 7.306677159091385e-07,
"loss": 0.3554,
"step": 1565
},
{
"epoch": 0.8459051724137931,
"grad_norm": 2.1770521072409665,
"learning_rate": 7.06378276853516e-07,
"loss": 0.3434,
"step": 1570
},
{
"epoch": 0.8485991379310345,
"grad_norm": 3.199094357987707,
"learning_rate": 6.824687011439168e-07,
"loss": 0.3555,
"step": 1575
},
{
"epoch": 0.8512931034482759,
"grad_norm": 2.0350410942770267,
"learning_rate": 6.589411040974369e-07,
"loss": 0.3455,
"step": 1580
},
{
"epoch": 0.8539870689655172,
"grad_norm": 2.0106939788979994,
"learning_rate": 6.35797567236926e-07,
"loss": 0.342,
"step": 1585
},
{
"epoch": 0.8566810344827587,
"grad_norm": 2.0462922997663333,
"learning_rate": 6.130401381068424e-07,
"loss": 0.3484,
"step": 1590
},
{
"epoch": 0.859375,
"grad_norm": 1.9989302742973973,
"learning_rate": 5.906708300920916e-07,
"loss": 0.358,
"step": 1595
},
{
"epoch": 0.8620689655172413,
"grad_norm": 2.1421705464248997,
"learning_rate": 5.686916222399069e-07,
"loss": 0.3479,
"step": 1600
},
{
"epoch": 0.8647629310344828,
"grad_norm": 1.8665911668349293,
"learning_rate": 5.471044590847569e-07,
"loss": 0.3485,
"step": 1605
},
{
"epoch": 0.8674568965517241,
"grad_norm": 2.252328311927183,
"learning_rate": 5.259112504763115e-07,
"loss": 0.3537,
"step": 1610
},
{
"epoch": 0.8701508620689655,
"grad_norm": 2.242291713625665,
"learning_rate": 5.051138714104726e-07,
"loss": 0.3493,
"step": 1615
},
{
"epoch": 0.8728448275862069,
"grad_norm": 1.9256177965601142,
"learning_rate": 4.847141618634899e-07,
"loss": 0.346,
"step": 1620
},
{
"epoch": 0.8755387931034483,
"grad_norm": 2.0978920858884806,
"learning_rate": 4.647139266291789e-07,
"loss": 0.3447,
"step": 1625
},
{
"epoch": 0.8782327586206896,
"grad_norm": 2.1438665447656424,
"learning_rate": 4.4511493515924373e-07,
"loss": 0.3467,
"step": 1630
},
{
"epoch": 0.880926724137931,
"grad_norm": 1.943275187391926,
"learning_rate": 4.2591892140673383e-07,
"loss": 0.359,
"step": 1635
},
{
"epoch": 0.8836206896551724,
"grad_norm": 1.9691693184683765,
"learning_rate": 4.0712758367263573e-07,
"loss": 0.3453,
"step": 1640
},
{
"epoch": 0.8863146551724138,
"grad_norm": 2.2550989234096432,
"learning_rate": 3.8874258445562694e-07,
"loss": 0.354,
"step": 1645
},
{
"epoch": 0.8890086206896551,
"grad_norm": 1.9743645882114702,
"learning_rate": 3.7076555030498505e-07,
"loss": 0.3545,
"step": 1650
},
{
"epoch": 0.8917025862068966,
"grad_norm": 2.069394313953148,
"learning_rate": 3.531980716766914e-07,
"loss": 0.3465,
"step": 1655
},
{
"epoch": 0.8943965517241379,
"grad_norm": 2.084992853821571,
"learning_rate": 3.3604170279271375e-07,
"loss": 0.347,
"step": 1660
},
{
"epoch": 0.8970905172413793,
"grad_norm": 2.028486932834069,
"learning_rate": 3.1929796150351076e-07,
"loss": 0.3385,
"step": 1665
},
{
"epoch": 0.8997844827586207,
"grad_norm": 1.9042104552013777,
"learning_rate": 3.02968329153735e-07,
"loss": 0.3456,
"step": 1670
},
{
"epoch": 0.9024784482758621,
"grad_norm": 2.138202184025318,
"learning_rate": 2.870542504511864e-07,
"loss": 0.3524,
"step": 1675
},
{
"epoch": 0.9051724137931034,
"grad_norm": 2.0791032572613615,
"learning_rate": 2.7155713333898826e-07,
"loss": 0.3557,
"step": 1680
},
{
"epoch": 0.9078663793103449,
"grad_norm": 2.032552582124559,
"learning_rate": 2.564783488710293e-07,
"loss": 0.3472,
"step": 1685
},
{
"epoch": 0.9105603448275862,
"grad_norm": 2.0702198858374063,
"learning_rate": 2.4181923109066254e-07,
"loss": 0.3423,
"step": 1690
},
{
"epoch": 0.9132543103448276,
"grad_norm": 2.223955152789369,
"learning_rate": 2.2758107691268294e-07,
"loss": 0.353,
"step": 1695
},
{
"epoch": 0.915948275862069,
"grad_norm": 2.151000423198189,
"learning_rate": 2.1376514600858212e-07,
"loss": 0.3446,
"step": 1700
},
{
"epoch": 0.9186422413793104,
"grad_norm": 1.9722858881802758,
"learning_rate": 2.003726606951084e-07,
"loss": 0.3423,
"step": 1705
},
{
"epoch": 0.9213362068965517,
"grad_norm": 2.152676598806774,
"learning_rate": 1.874048058261252e-07,
"loss": 0.3566,
"step": 1710
},
{
"epoch": 0.9240301724137931,
"grad_norm": 2.14241065854355,
"learning_rate": 1.7486272868778299e-07,
"loss": 0.3451,
"step": 1715
},
{
"epoch": 0.9267241379310345,
"grad_norm": 1.9240645550272026,
"learning_rate": 1.62747538897019e-07,
"loss": 0.3526,
"step": 1720
},
{
"epoch": 0.9294181034482759,
"grad_norm": 1.9864527165081682,
"learning_rate": 1.5106030830338791e-07,
"loss": 0.3414,
"step": 1725
},
{
"epoch": 0.9321120689655172,
"grad_norm": 1.891840587890648,
"learning_rate": 1.3980207089423326e-07,
"loss": 0.3507,
"step": 1730
},
{
"epoch": 0.9348060344827587,
"grad_norm": 2.197241548310695,
"learning_rate": 1.2897382270320947e-07,
"loss": 0.3415,
"step": 1735
},
{
"epoch": 0.9375,
"grad_norm": 2.1206142876832708,
"learning_rate": 1.1857652172215905e-07,
"loss": 0.3453,
"step": 1740
},
{
"epoch": 0.9401939655172413,
"grad_norm": 2.0575425778092375,
"learning_rate": 1.0861108781636099e-07,
"loss": 0.3414,
"step": 1745
},
{
"epoch": 0.9428879310344828,
"grad_norm": 2.067217232750268,
"learning_rate": 9.907840264314572e-08,
"loss": 0.3429,
"step": 1750
},
{
"epoch": 0.9455818965517241,
"grad_norm": 2.08954775323305,
"learning_rate": 8.997930957389433e-08,
"loss": 0.3406,
"step": 1755
},
{
"epoch": 0.9482758620689655,
"grad_norm": 2.0413104358527865,
"learning_rate": 8.13146136194265e-08,
"loss": 0.3544,
"step": 1760
},
{
"epoch": 0.9509698275862069,
"grad_norm": 1.9504574949587095,
"learning_rate": 7.308508135877745e-08,
"loss": 0.3515,
"step": 1765
},
{
"epoch": 0.9536637931034483,
"grad_norm": 2.0325177039467266,
"learning_rate": 6.52914408713784e-08,
"loss": 0.3422,
"step": 1770
},
{
"epoch": 0.9563577586206896,
"grad_norm": 2.080402951454278,
"learning_rate": 5.7934381672640206e-08,
"loss": 0.3302,
"step": 1775
},
{
"epoch": 0.959051724137931,
"grad_norm": 1.9103094146698458,
"learning_rate": 5.101455465295557e-08,
"loss": 0.3388,
"step": 1780
},
{
"epoch": 0.9617456896551724,
"grad_norm": 2.0461617336274665,
"learning_rate": 4.453257202011008e-08,
"loss": 0.3437,
"step": 1785
},
{
"epoch": 0.9644396551724138,
"grad_norm": 1.8955751541723638,
"learning_rate": 3.848900724511828e-08,
"loss": 0.3448,
"step": 1790
},
{
"epoch": 0.9671336206896551,
"grad_norm": 1.8502858059698502,
"learning_rate": 3.28843950114921e-08,
"loss": 0.3318,
"step": 1795
},
{
"epoch": 0.9698275862068966,
"grad_norm": 1.9634830726403167,
"learning_rate": 2.771923116793307e-08,
"loss": 0.3506,
"step": 1800
},
{
"epoch": 0.9725215517241379,
"grad_norm": 2.12551984941854,
"learning_rate": 2.299397268446413e-08,
"loss": 0.3425,
"step": 1805
},
{
"epoch": 0.9752155172413793,
"grad_norm": 2.4278727464472136,
"learning_rate": 1.8709037612003044e-08,
"loss": 0.3471,
"step": 1810
},
{
"epoch": 0.9779094827586207,
"grad_norm": 2.191866602098634,
"learning_rate": 1.4864805045373687e-08,
"loss": 0.3384,
"step": 1815
},
{
"epoch": 0.9806034482758621,
"grad_norm": 2.128906961450063,
"learning_rate": 1.1461615089770062e-08,
"loss": 0.349,
"step": 1820
},
{
"epoch": 0.9832974137931034,
"grad_norm": 2.0846719469136916,
"learning_rate": 8.499768830663723e-09,
"loss": 0.3357,
"step": 1825
},
{
"epoch": 0.9859913793103449,
"grad_norm": 2.319036063763146,
"learning_rate": 5.979528307168414e-09,
"loss": 0.3402,
"step": 1830
},
{
"epoch": 0.9886853448275862,
"grad_norm": 2.0237346794749858,
"learning_rate": 3.901116488855827e-09,
"loss": 0.3554,
"step": 1835
},
{
"epoch": 0.9913793103448276,
"grad_norm": 2.007135214089839,
"learning_rate": 2.264717256030835e-09,
"loss": 0.3462,
"step": 1840
},
{
"epoch": 0.994073275862069,
"grad_norm": 1.994084875393067,
"learning_rate": 1.0704753834600567e-09,
"loss": 0.3455,
"step": 1845
},
{
"epoch": 0.9967672413793104,
"grad_norm": 2.1211709513233856,
"learning_rate": 3.184965275676577e-10,
"loss": 0.3438,
"step": 1850
},
{
"epoch": 0.9994612068965517,
"grad_norm": 2.0397937800653443,
"learning_rate": 8.847217084495541e-12,
"loss": 0.3482,
"step": 1855
},
{
"epoch": 1.0,
"eval_runtime": 3.3988,
"eval_samples_per_second": 2.942,
"eval_steps_per_second": 0.883,
"step": 1856
},
{
"epoch": 1.0,
"step": 1856,
"total_flos": 194304320471040.0,
"train_loss": 0.50882549257949,
"train_runtime": 16510.7518,
"train_samples_per_second": 1.799,
"train_steps_per_second": 0.112
}
],
"logging_steps": 5,
"max_steps": 1856,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 194304320471040.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
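
The state above is the standard `trainer_state.json` that the Hugging Face `Trainer` writes alongside a checkpoint: per-step entries in `log_history` carry `loss`, `learning_rate`, and `grad_norm`, while the final two records hold the evaluation and end-of-training summaries. As a minimal sketch (assuming this file is saved locally as `trainer_state.json` and that `matplotlib` is installed), the logged loss curve and learning-rate schedule can be read back and plotted like this:

```python
# Minimal sketch (assumed local file name): load the trainer_state.json shown
# above and plot the logged training loss and learning-rate schedule per step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # assumed local copy of this file
    state = json.load(f)

# Keep only the per-step training entries; the final records carry
# eval_runtime / train_runtime instead of a "loss" field.
train_logs = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
lrs = [e["learning_rate"] for e in train_logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.tight_layout()
plt.show()
```

Plotted this way, the data shows the linear warmup to roughly 1e-5 around step 185 followed by a cosine-style decay to near zero at step 1856, with the training loss falling from about 1.36 to about 0.34 over the single epoch.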