{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9996830427892235,
  "eval_steps": 200,
  "global_step": 1577,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006339144215530904,
      "grad_norm": 0.1844951284226507,
      "learning_rate": 1.2658227848101265e-06,
      "loss": 0.2445,
      "step": 1
    },
    {
      "epoch": 0.003169572107765452,
      "grad_norm": 0.4228474323435137,
      "learning_rate": 6.329113924050633e-06,
      "loss": 0.5751,
      "step": 5
    },
    {
      "epoch": 0.006339144215530904,
      "grad_norm": 0.44943947868261674,
      "learning_rate": 1.2658227848101267e-05,
      "loss": 0.6355,
      "step": 10
    },
    {
      "epoch": 0.009508716323296355,
      "grad_norm": 0.4388677072263688,
      "learning_rate": 1.89873417721519e-05,
      "loss": 0.5644,
      "step": 15
    },
    {
      "epoch": 0.012678288431061807,
      "grad_norm": 0.42271873762000256,
      "learning_rate": 2.5316455696202533e-05,
      "loss": 0.6074,
      "step": 20
    },
    {
      "epoch": 0.01584786053882726,
      "grad_norm": 0.394168038759961,
      "learning_rate": 3.1645569620253167e-05,
      "loss": 0.5618,
      "step": 25
    },
    {
      "epoch": 0.01901743264659271,
      "grad_norm": 0.3572253194331872,
      "learning_rate": 3.79746835443038e-05,
      "loss": 0.3782,
      "step": 30
    },
    {
      "epoch": 0.022187004754358162,
      "grad_norm": 0.412555556410598,
      "learning_rate": 4.430379746835443e-05,
      "loss": 0.5004,
      "step": 35
    },
    {
      "epoch": 0.025356576862123614,
      "grad_norm": 0.3794217410787016,
      "learning_rate": 5.0632911392405066e-05,
      "loss": 0.4466,
      "step": 40
    },
    {
      "epoch": 0.028526148969889066,
      "grad_norm": 0.35048440162730615,
      "learning_rate": 5.69620253164557e-05,
      "loss": 0.4673,
      "step": 45
    },
    {
      "epoch": 0.03169572107765452,
      "grad_norm": 0.5944470916433041,
      "learning_rate": 6.329113924050633e-05,
      "loss": 0.4321,
      "step": 50
    },
    {
      "epoch": 0.03486529318541997,
      "grad_norm": 0.42565747209046223,
      "learning_rate": 6.962025316455697e-05,
      "loss": 0.3458,
      "step": 55
    },
    {
      "epoch": 0.03803486529318542,
      "grad_norm": 0.4086754614850085,
      "learning_rate": 7.59493670886076e-05,
      "loss": 0.3853,
      "step": 60
    },
    {
      "epoch": 0.04120443740095087,
      "grad_norm": 0.4100402315211435,
      "learning_rate": 8.227848101265824e-05,
      "loss": 0.3248,
      "step": 65
    },
    {
      "epoch": 0.044374009508716325,
      "grad_norm": 0.439691902474589,
      "learning_rate": 8.860759493670887e-05,
      "loss": 0.3902,
      "step": 70
    },
    {
      "epoch": 0.04754358161648178,
      "grad_norm": 0.4789983229510449,
      "learning_rate": 9.493670886075949e-05,
      "loss": 0.4375,
      "step": 75
    },
    {
      "epoch": 0.05071315372424723,
      "grad_norm": 0.4326295423185557,
      "learning_rate": 0.00010126582278481013,
      "loss": 0.344,
      "step": 80
    },
    {
      "epoch": 0.05388272583201268,
      "grad_norm": 0.4082745764246507,
      "learning_rate": 0.00010759493670886076,
      "loss": 0.373,
      "step": 85
    },
    {
      "epoch": 0.05705229793977813,
      "grad_norm": 0.44917869809554406,
      "learning_rate": 0.0001139240506329114,
      "loss": 0.3366,
      "step": 90
    },
    {
      "epoch": 0.060221870047543584,
      "grad_norm": 0.4099898854278625,
      "learning_rate": 0.00012025316455696203,
      "loss": 0.3827,
      "step": 95
    },
    {
      "epoch": 0.06339144215530904,
      "grad_norm": 0.5173612120396457,
      "learning_rate": 0.00012658227848101267,
      "loss": 0.3913,
      "step": 100
    },
    {
      "epoch": 0.06656101426307448,
      "grad_norm": 0.4695908910723305,
      "learning_rate": 0.0001329113924050633,
      "loss": 0.3285,
      "step": 105
    },
    {
      "epoch": 0.06973058637083994,
      "grad_norm": 0.34610029250066504,
      "learning_rate": 0.00013924050632911395,
      "loss": 0.3542,
      "step": 110
    },
    {
      "epoch": 0.07290015847860538,
      "grad_norm": 0.3833563232036365,
      "learning_rate": 0.00014556962025316457,
      "loss": 0.3442,
      "step": 115
    },
    {
      "epoch": 0.07606973058637084,
      "grad_norm": 0.38597736664868315,
      "learning_rate": 0.0001518987341772152,
      "loss": 0.3499,
      "step": 120
    },
    {
      "epoch": 0.07923930269413629,
      "grad_norm": 0.4555824320300245,
      "learning_rate": 0.00015822784810126583,
      "loss": 0.3843,
      "step": 125
    },
    {
      "epoch": 0.08240887480190175,
      "grad_norm": 0.44058959604469733,
      "learning_rate": 0.00016455696202531648,
      "loss": 0.3321,
      "step": 130
    },
    {
      "epoch": 0.08557844690966719,
      "grad_norm": 0.37513672150754146,
      "learning_rate": 0.0001708860759493671,
      "loss": 0.3409,
      "step": 135
    },
    {
      "epoch": 0.08874801901743265,
      "grad_norm": 0.3532888739409051,
      "learning_rate": 0.00017721518987341773,
      "loss": 0.3388,
      "step": 140
    },
    {
      "epoch": 0.0919175911251981,
      "grad_norm": 0.31398944959900404,
      "learning_rate": 0.00018354430379746836,
      "loss": 0.3407,
      "step": 145
    },
    {
      "epoch": 0.09508716323296355,
      "grad_norm": 0.48473648286443866,
      "learning_rate": 0.00018987341772151899,
      "loss": 0.4109,
      "step": 150
    },
    {
      "epoch": 0.098256735340729,
      "grad_norm": 0.3832743712760423,
      "learning_rate": 0.00019620253164556964,
      "loss": 0.2894,
      "step": 155
    },
    {
      "epoch": 0.10142630744849446,
      "grad_norm": 0.3576599310136604,
      "learning_rate": 0.00019999901968817678,
      "loss": 0.3685,
      "step": 160
    },
    {
      "epoch": 0.1045958795562599,
      "grad_norm": 0.4041268184733326,
      "learning_rate": 0.0001999879914008964,
      "loss": 0.3103,
      "step": 165
    },
    {
      "epoch": 0.10776545166402536,
      "grad_norm": 0.348710082889974,
      "learning_rate": 0.00019996471079244477,
      "loss": 0.3686,
      "step": 170
    },
    {
      "epoch": 0.1109350237717908,
      "grad_norm": 0.3641139077278622,
      "learning_rate": 0.0001999291807155794,
      "loss": 0.3672,
      "step": 175
    },
    {
      "epoch": 0.11410459587955626,
      "grad_norm": 0.34875291735749603,
      "learning_rate": 0.0001998814055240823,
      "loss": 0.3289,
      "step": 180
    },
    {
      "epoch": 0.11727416798732171,
      "grad_norm": 0.35868082118594846,
      "learning_rate": 0.00019982139107222632,
      "loss": 0.3843,
      "step": 185
    },
    {
      "epoch": 0.12044374009508717,
      "grad_norm": 0.2975053354861811,
      "learning_rate": 0.000199749144714058,
      "loss": 0.3187,
      "step": 190
    },
    {
      "epoch": 0.12361331220285261,
      "grad_norm": 0.3926097041806586,
      "learning_rate": 0.00019966467530249627,
      "loss": 0.3711,
      "step": 195
    },
    {
      "epoch": 0.12678288431061807,
      "grad_norm": 0.39235636818547276,
      "learning_rate": 0.00019956799318824776,
      "loss": 0.3599,
      "step": 200
    },
    {
      "epoch": 0.12678288431061807,
      "eval_loss": 0.31717613339424133,
      "eval_runtime": 878.4135,
      "eval_samples_per_second": 4.554,
      "eval_steps_per_second": 0.569,
      "step": 200
    },
    {
      "epoch": 0.12995245641838352,
      "grad_norm": 0.32366959300654363,
      "learning_rate": 0.00019945911021853818,
      "loss": 0.2671,
      "step": 205
    },
    {
      "epoch": 0.13312202852614896,
      "grad_norm": 0.34183927553766114,
      "learning_rate": 0.00019933803973566102,
      "loss": 0.3491,
      "step": 210
    },
    {
      "epoch": 0.13629160063391443,
      "grad_norm": 0.355629049879592,
      "learning_rate": 0.0001992047965753422,
      "loss": 0.2778,
      "step": 215
    },
    {
      "epoch": 0.13946117274167988,
      "grad_norm": 0.31194706241410036,
      "learning_rate": 0.00019905939706492238,
      "loss": 0.3278,
      "step": 220
    },
    {
      "epoch": 0.14263074484944532,
      "grad_norm": 0.37190501088914274,
      "learning_rate": 0.0001989018590213561,
      "loss": 0.3757,
      "step": 225
    },
    {
      "epoch": 0.14580031695721077,
      "grad_norm": 0.30859177154159206,
      "learning_rate": 0.00019873220174902858,
      "loss": 0.2952,
      "step": 230
    },
    {
      "epoch": 0.14896988906497624,
      "grad_norm": 0.4072493051692793,
      "learning_rate": 0.0001985504460373903,
      "loss": 0.3576,
      "step": 235
    },
    {
      "epoch": 0.15213946117274169,
      "grad_norm": 0.3117614582623609,
      "learning_rate": 0.00019835661415840928,
      "loss": 0.3127,
      "step": 240
    },
    {
      "epoch": 0.15530903328050713,
      "grad_norm": 0.3433870206019631,
      "learning_rate": 0.00019815072986384218,
      "loss": 0.3424,
      "step": 245
    },
    {
      "epoch": 0.15847860538827258,
      "grad_norm": 0.3252374107324197,
      "learning_rate": 0.0001979328183823236,
      "loss": 0.3509,
      "step": 250
    },
    {
      "epoch": 0.16164817749603805,
      "grad_norm": 0.32574757253252834,
      "learning_rate": 0.00019770290641627468,
      "loss": 0.2913,
      "step": 255
    },
    {
      "epoch": 0.1648177496038035,
      "grad_norm": 0.37343408069668577,
      "learning_rate": 0.00019746102213863114,
      "loss": 0.3524,
      "step": 260
    },
    {
      "epoch": 0.16798732171156894,
      "grad_norm": 0.30197216412790706,
      "learning_rate": 0.00019720719518939083,
      "loss": 0.295,
      "step": 265
    },
    {
      "epoch": 0.17115689381933438,
      "grad_norm": 0.37750434171669517,
      "learning_rate": 0.00019694145667198195,
      "loss": 0.3215,
      "step": 270
    },
    {
      "epoch": 0.17432646592709986,
      "grad_norm": 0.3368196048030473,
      "learning_rate": 0.0001966638391494514,
      "loss": 0.35,
      "step": 275
    },
    {
      "epoch": 0.1774960380348653,
      "grad_norm": 0.3232595651729065,
      "learning_rate": 0.0001963743766404749,
      "loss": 0.2637,
      "step": 280
    },
    {
      "epoch": 0.18066561014263074,
      "grad_norm": 0.32199548202560035,
      "learning_rate": 0.00019607310461518818,
      "loss": 0.3262,
      "step": 285
    },
    {
      "epoch": 0.1838351822503962,
      "grad_norm": 0.29117926540088634,
      "learning_rate": 0.0001957600599908406,
      "loss": 0.3129,
      "step": 290
    },
    {
      "epoch": 0.18700475435816163,
      "grad_norm": 0.2836794081153409,
      "learning_rate": 0.00019543528112727146,
      "loss": 0.3207,
      "step": 295
    },
    {
      "epoch": 0.1901743264659271,
      "grad_norm": 0.37478385305484463,
      "learning_rate": 0.0001950988078222093,
      "loss": 0.3503,
      "step": 300
    },
    {
      "epoch": 0.19334389857369255,
      "grad_norm": 0.3323790483161259,
      "learning_rate": 0.00019475068130639543,
      "loss": 0.2873,
      "step": 305
    },
    {
      "epoch": 0.196513470681458,
      "grad_norm": 0.31045326503955184,
      "learning_rate": 0.0001943909442385313,
      "loss": 0.3379,
      "step": 310
    },
    {
      "epoch": 0.19968304278922344,
      "grad_norm": 0.295428110940092,
      "learning_rate": 0.00019401964070005144,
      "loss": 0.2913,
      "step": 315
    },
    {
      "epoch": 0.20285261489698891,
      "grad_norm": 0.31381749704770145,
      "learning_rate": 0.00019363681618972164,
      "loss": 0.3167,
      "step": 320
    },
    {
      "epoch": 0.20602218700475436,
      "grad_norm": 0.3799683908480184,
      "learning_rate": 0.00019324251761806374,
      "loss": 0.3203,
      "step": 325
    },
    {
      "epoch": 0.2091917591125198,
      "grad_norm": 0.25669447806119594,
      "learning_rate": 0.00019283679330160726,
      "loss": 0.2598,
      "step": 330
    },
    {
      "epoch": 0.21236133122028525,
      "grad_norm": 0.3253285501894849,
      "learning_rate": 0.00019241969295696879,
      "loss": 0.321,
      "step": 335
    },
    {
      "epoch": 0.21553090332805072,
      "grad_norm": 0.3015776648780859,
      "learning_rate": 0.0001919912676947598,
      "loss": 0.2912,
      "step": 340
    },
    {
      "epoch": 0.21870047543581617,
      "grad_norm": 0.3548152436637532,
      "learning_rate": 0.00019155157001332374,
      "loss": 0.3398,
      "step": 345
    },
    {
      "epoch": 0.2218700475435816,
      "grad_norm": 0.3562179525646546,
      "learning_rate": 0.00019110065379230289,
      "loss": 0.3575,
      "step": 350
    },
    {
      "epoch": 0.22503961965134706,
      "grad_norm": 0.33759944051182883,
      "learning_rate": 0.00019063857428603615,
      "loss": 0.2644,
      "step": 355
    },
    {
      "epoch": 0.22820919175911253,
      "grad_norm": 0.3478332359179607,
      "learning_rate": 0.00019016538811678823,
      "loss": 0.3421,
      "step": 360
    },
    {
      "epoch": 0.23137876386687797,
      "grad_norm": 0.3107602080624315,
      "learning_rate": 0.0001896811532678113,
      "loss": 0.262,
      "step": 365
    },
    {
      "epoch": 0.23454833597464342,
      "grad_norm": 0.26971775917740104,
      "learning_rate": 0.00018918592907623985,
      "loss": 0.3378,
      "step": 370
    },
    {
      "epoch": 0.23771790808240886,
      "grad_norm": 0.32413332448217697,
      "learning_rate": 0.00018867977622581957,
      "loss": 0.3316,
      "step": 375
    },
    {
      "epoch": 0.24088748019017434,
      "grad_norm": 0.3522975093101741,
      "learning_rate": 0.00018816275673947148,
      "loss": 0.2678,
      "step": 380
    },
    {
      "epoch": 0.24405705229793978,
      "grad_norm": 0.31661852350790726,
      "learning_rate": 0.00018763493397169146,
      "loss": 0.3275,
      "step": 385
    },
    {
      "epoch": 0.24722662440570523,
      "grad_norm": 0.27090727261610936,
      "learning_rate": 0.00018709637260078729,
      "loss": 0.2858,
      "step": 390
    },
    {
      "epoch": 0.25039619651347067,
      "grad_norm": 0.3143474617991223,
      "learning_rate": 0.0001865471386209527,
      "loss": 0.3317,
      "step": 395
    },
    {
      "epoch": 0.25356576862123614,
      "grad_norm": 0.48811153855723693,
      "learning_rate": 0.000185987299334181,
      "loss": 0.3295,
      "step": 400
    },
    {
      "epoch": 0.25356576862123614,
      "eval_loss": 0.29194891452789307,
      "eval_runtime": 872.9978,
      "eval_samples_per_second": 4.582,
      "eval_steps_per_second": 0.573,
      "step": 400
    },
    {
      "epoch": 0.25673534072900156,
      "grad_norm": 0.31755342222995686,
      "learning_rate": 0.00018541692334201771,
      "loss": 0.2643,
      "step": 405
    },
    {
      "epoch": 0.25990491283676703,
      "grad_norm": 0.34778059073770806,
      "learning_rate": 0.0001848360805371544,
      "loss": 0.3339,
      "step": 410
    },
    {
      "epoch": 0.2630744849445325,
      "grad_norm": 0.3183073063986642,
      "learning_rate": 0.00018424484209486416,
      "loss": 0.2673,
      "step": 415
    },
    {
      "epoch": 0.2662440570522979,
      "grad_norm": 0.2788199901083398,
      "learning_rate": 0.00018364328046428,
      "loss": 0.3272,
      "step": 420
    },
    {
      "epoch": 0.2694136291600634,
      "grad_norm": 0.3666143727147526,
      "learning_rate": 0.00018303146935951689,
      "loss": 0.3247,
      "step": 425
    },
    {
      "epoch": 0.27258320126782887,
      "grad_norm": 0.28586548327038175,
      "learning_rate": 0.00018240948375063926,
      "loss": 0.2792,
      "step": 430
    },
    {
      "epoch": 0.2757527733755943,
      "grad_norm": 0.9727255846044429,
      "learning_rate": 0.00018177739985447412,
      "loss": 0.3485,
      "step": 435
    },
    {
      "epoch": 0.27892234548335976,
      "grad_norm": 0.29065854553956355,
      "learning_rate": 0.0001811352951252717,
      "loss": 0.2729,
      "step": 440
    },
    {
      "epoch": 0.2820919175911252,
      "grad_norm": 0.320575993183303,
      "learning_rate": 0.0001804832482452142,
      "loss": 0.3354,
      "step": 445
    },
    {
      "epoch": 0.28526148969889065,
      "grad_norm": 0.34869737354697955,
      "learning_rate": 0.0001798213391147746,
      "loss": 0.3385,
      "step": 450
    },
    {
      "epoch": 0.2884310618066561,
      "grad_norm": 0.31478642211651564,
      "learning_rate": 0.00017914964884292544,
      "loss": 0.3133,
      "step": 455
    },
    {
      "epoch": 0.29160063391442154,
      "grad_norm": 0.36834278711947965,
      "learning_rate": 0.0001784682597372,
      "loss": 0.3593,
      "step": 460
    },
    {
      "epoch": 0.294770206022187,
      "grad_norm": 0.2791902388221146,
      "learning_rate": 0.00017777725529360676,
      "loss": 0.3005,
      "step": 465
    },
    {
      "epoch": 0.2979397781299525,
      "grad_norm": 0.30096452678752406,
      "learning_rate": 0.00017707672018639758,
      "loss": 0.3354,
      "step": 470
    },
    {
      "epoch": 0.3011093502377179,
      "grad_norm": 0.3708048891578612,
      "learning_rate": 0.00017636674025769215,
      "loss": 0.3147,
      "step": 475
    },
    {
      "epoch": 0.30427892234548337,
      "grad_norm": 0.305209122691005,
      "learning_rate": 0.00017564740250695904,
      "loss": 0.2713,
      "step": 480
    },
    {
      "epoch": 0.3074484944532488,
      "grad_norm": 0.3018873391630076,
      "learning_rate": 0.0001749187950803549,
      "loss": 0.3202,
      "step": 485
    },
    {
      "epoch": 0.31061806656101426,
      "grad_norm": 0.3464422287874134,
      "learning_rate": 0.00017418100725992316,
      "loss": 0.3042,
      "step": 490
    },
    {
      "epoch": 0.31378763866877973,
      "grad_norm": 0.31036543367721087,
      "learning_rate": 0.00017343412945265382,
      "loss": 0.3105,
      "step": 495
    },
    {
      "epoch": 0.31695721077654515,
      "grad_norm": 0.3090116757558095,
      "learning_rate": 0.00017267825317940493,
      "loss": 0.3086,
      "step": 500
    },
    {
      "epoch": 0.3201267828843106,
      "grad_norm": 0.32015559999952525,
      "learning_rate": 0.00017191347106368797,
      "loss": 0.2595,
      "step": 505
    },
    {
      "epoch": 0.3232963549920761,
      "grad_norm": 0.28242640929152685,
      "learning_rate": 0.0001711398768203178,
      "loss": 0.3171,
      "step": 510
    },
    {
      "epoch": 0.3264659270998415,
      "grad_norm": 0.3373697781712397,
      "learning_rate": 0.00017035756524392924,
      "loss": 0.2897,
      "step": 515
    },
    {
      "epoch": 0.329635499207607,
      "grad_norm": 0.3187883343723006,
      "learning_rate": 0.0001695666321973609,
      "loss": 0.303,
      "step": 520
    },
    {
      "epoch": 0.3328050713153724,
      "grad_norm": 0.4060972163443389,
      "learning_rate": 0.00016876717459990862,
      "loss": 0.3273,
      "step": 525
    },
    {
      "epoch": 0.3359746434231379,
      "grad_norm": 0.2709960074426642,
      "learning_rate": 0.0001679592904154489,
      "loss": 0.2629,
      "step": 530
    },
    {
      "epoch": 0.33914421553090335,
      "grad_norm": 0.2828719972128079,
      "learning_rate": 0.00016714307864043487,
      "loss": 0.2946,
      "step": 535
    },
    {
      "epoch": 0.34231378763866877,
      "grad_norm": 0.29485357171410065,
      "learning_rate": 0.00016631863929176524,
      "loss": 0.2704,
      "step": 540
    },
    {
      "epoch": 0.34548335974643424,
      "grad_norm": 0.3140677978027709,
      "learning_rate": 0.00016548607339452853,
      "loss": 0.3211,
      "step": 545
    },
    {
      "epoch": 0.3486529318541997,
      "grad_norm": 0.30224374704766904,
      "learning_rate": 0.00016464548296962373,
      "loss": 0.3289,
      "step": 550
    },
    {
      "epoch": 0.3518225039619651,
      "grad_norm": 0.3015178734291492,
      "learning_rate": 0.0001637969710212588,
      "loss": 0.262,
      "step": 555
    },
    {
      "epoch": 0.3549920760697306,
      "grad_norm": 0.3261808476280464,
      "learning_rate": 0.00016294064152432879,
      "loss": 0.3524,
      "step": 560
    },
    {
      "epoch": 0.358161648177496,
      "grad_norm": 0.30420040263110554,
      "learning_rate": 0.00016207659941167485,
      "loss": 0.2888,
      "step": 565
    },
    {
      "epoch": 0.3613312202852615,
      "grad_norm": 0.29855740633395794,
      "learning_rate": 0.00016120495056122622,
      "loss": 0.3075,
      "step": 570
    },
    {
      "epoch": 0.36450079239302696,
      "grad_norm": 0.3775755682614953,
      "learning_rate": 0.00016032580178302583,
      "loss": 0.3452,
      "step": 575
    },
    {
      "epoch": 0.3676703645007924,
      "grad_norm": 0.3189277602131783,
      "learning_rate": 0.00015943926080614235,
      "loss": 0.2643,
      "step": 580
    },
    {
      "epoch": 0.37083993660855785,
      "grad_norm": 0.32115548282274786,
      "learning_rate": 0.00015854543626546915,
      "loss": 0.3126,
      "step": 585
    },
    {
      "epoch": 0.37400950871632327,
      "grad_norm": 0.29230296850863174,
      "learning_rate": 0.00015764443768841234,
      "loss": 0.2949,
      "step": 590
    },
    {
      "epoch": 0.37717908082408874,
      "grad_norm": 0.32187057297721217,
      "learning_rate": 0.0001567363754814696,
      "loss": 0.3166,
      "step": 595
    },
    {
      "epoch": 0.3803486529318542,
      "grad_norm": 0.3766752931165212,
      "learning_rate": 0.0001558213609167012,
      "loss": 0.323,
      "step": 600
    },
    {
      "epoch": 0.3803486529318542,
      "eval_loss": 0.2788923680782318,
      "eval_runtime": 873.5171,
      "eval_samples_per_second": 4.579,
      "eval_steps_per_second": 0.572,
      "step": 600
    },
    {
      "epoch": 0.38351822503961963,
      "grad_norm": 0.31877960462977273,
      "learning_rate": 0.00015489950611809484,
      "loss": 0.2803,
      "step": 605
    },
    {
      "epoch": 0.3866877971473851,
      "grad_norm": 0.2903622851026156,
      "learning_rate": 0.00015397092404782642,
      "loss": 0.3178,
      "step": 610
    },
    {
      "epoch": 0.3898573692551506,
      "grad_norm": 0.2639727101749139,
      "learning_rate": 0.00015303572849241764,
      "loss": 0.2703,
      "step": 615
    },
    {
      "epoch": 0.393026941362916,
      "grad_norm": 0.3491709894849581,
      "learning_rate": 0.00015209403404879303,
      "loss": 0.3049,
      "step": 620
    },
    {
      "epoch": 0.39619651347068147,
      "grad_norm": 0.3651420024997032,
      "learning_rate": 0.00015114595611023744,
      "loss": 0.3265,
      "step": 625
    },
    {
      "epoch": 0.3993660855784469,
      "grad_norm": 0.3071330073578763,
      "learning_rate": 0.0001501916108522558,
      "loss": 0.2645,
      "step": 630
    },
    {
      "epoch": 0.40253565768621236,
      "grad_norm": 0.2739471545543727,
      "learning_rate": 0.00014923111521833758,
      "loss": 0.3035,
      "step": 635
    },
    {
      "epoch": 0.40570522979397783,
      "grad_norm": 0.30630113259525843,
      "learning_rate": 0.00014826458690562642,
      "loss": 0.2606,
      "step": 640
    },
    {
      "epoch": 0.40887480190174325,
      "grad_norm": 0.2988843883769528,
      "learning_rate": 0.00014729214435049793,
      "loss": 0.3111,
      "step": 645
    },
    {
      "epoch": 0.4120443740095087,
      "grad_norm": 0.3110979862585215,
      "learning_rate": 0.0001463139067140468,
      "loss": 0.2948,
      "step": 650
    },
    {
      "epoch": 0.4152139461172742,
      "grad_norm": 0.30767657253531316,
      "learning_rate": 0.0001453299938674849,
      "loss": 0.2638,
      "step": 655
    },
    {
      "epoch": 0.4183835182250396,
      "grad_norm": 0.27014842841388653,
      "learning_rate": 0.00014434052637745257,
      "loss": 0.2819,
      "step": 660
    },
    {
      "epoch": 0.4215530903328051,
      "grad_norm": 0.2739393681355767,
      "learning_rate": 0.00014334562549124467,
      "loss": 0.2466,
      "step": 665
    },
    {
      "epoch": 0.4247226624405705,
      "grad_norm": 0.31758998023523244,
      "learning_rate": 0.00014234541312195323,
      "loss": 0.2873,
      "step": 670
    },
    {
      "epoch": 0.42789223454833597,
      "grad_norm": 0.39847849128188423,
      "learning_rate": 0.00014134001183352832,
      "loss": 0.2979,
      "step": 675
    },
    {
      "epoch": 0.43106180665610144,
      "grad_norm": 0.30950118355401873,
      "learning_rate": 0.00014032954482575937,
      "loss": 0.2617,
      "step": 680
    },
    {
      "epoch": 0.43423137876386686,
      "grad_norm": 0.3260587574739946,
      "learning_rate": 0.0001393141359191787,
      "loss": 0.3109,
      "step": 685
    },
    {
      "epoch": 0.43740095087163233,
      "grad_norm": 0.3114375419997854,
      "learning_rate": 0.00013829390953988853,
      "loss": 0.2845,
      "step": 690
    },
    {
      "epoch": 0.4405705229793978,
      "grad_norm": 0.30019871836883555,
      "learning_rate": 0.00013726899070431423,
      "loss": 0.324,
      "step": 695
    },
    {
      "epoch": 0.4437400950871632,
      "grad_norm": 0.38021042516470643,
      "learning_rate": 0.00013623950500388506,
      "loss": 0.3269,
      "step": 700
    },
    {
      "epoch": 0.4469096671949287,
      "grad_norm": 0.3089060241706131,
      "learning_rate": 0.00013520557858964446,
      "loss": 0.2584,
      "step": 705
    },
    {
      "epoch": 0.4500792393026941,
      "grad_norm": 0.27984586622582663,
      "learning_rate": 0.00013416733815679166,
      "loss": 0.2909,
      "step": 710
    },
    {
      "epoch": 0.4532488114104596,
      "grad_norm": 0.2923559292409706,
      "learning_rate": 0.00013312491092915682,
      "loss": 0.2489,
      "step": 715
    },
    {
      "epoch": 0.45641838351822506,
      "grad_norm": 0.29223045315786345,
      "learning_rate": 0.00013207842464361125,
      "loss": 0.3135,
      "step": 720
    },
    {
      "epoch": 0.4595879556259905,
      "grad_norm": 0.33907899924090856,
      "learning_rate": 0.00013102800753441487,
      "loss": 0.3148,
      "step": 725
    },
    {
      "epoch": 0.46275752773375595,
      "grad_norm": 0.26110455456342696,
      "learning_rate": 0.00012997378831750242,
      "loss": 0.2505,
      "step": 730
    },
    {
      "epoch": 0.4659270998415214,
      "grad_norm": 0.2855563878095534,
      "learning_rate": 0.00012891589617471122,
      "loss": 0.322,
      "step": 735
    },
    {
      "epoch": 0.46909667194928684,
      "grad_norm": 0.27089962197787903,
      "learning_rate": 0.00012785446073795118,
      "loss": 0.2629,
      "step": 740
    },
    {
      "epoch": 0.4722662440570523,
      "grad_norm": 0.2787588891548799,
      "learning_rate": 0.00012678961207332015,
      "loss": 0.3071,
      "step": 745
    },
    {
      "epoch": 0.4754358161648177,
      "grad_norm": 0.35249049637057156,
      "learning_rate": 0.00012572148066516584,
      "loss": 0.3265,
      "step": 750
    },
    {
      "epoch": 0.4786053882725832,
      "grad_norm": 0.33307560406452336,
      "learning_rate": 0.00012465019740009662,
      "loss": 0.2403,
      "step": 755
    },
    {
      "epoch": 0.48177496038034867,
      "grad_norm": 0.3035753509057755,
      "learning_rate": 0.00012357589355094275,
      "loss": 0.3057,
      "step": 760
    },
    {
      "epoch": 0.4849445324881141,
      "grad_norm": 0.2950972689886197,
      "learning_rate": 0.00012249870076067067,
      "loss": 0.2637,
      "step": 765
    },
    {
      "epoch": 0.48811410459587956,
      "grad_norm": 0.2713040409786771,
      "learning_rate": 0.00012141875102625167,
      "loss": 0.3196,
      "step": 770
    },
    {
      "epoch": 0.49128367670364503,
      "grad_norm": 0.37005187803966516,
      "learning_rate": 0.00012033617668248723,
      "loss": 0.3265,
      "step": 775
    },
    {
      "epoch": 0.49445324881141045,
      "grad_norm": 0.3678796577106568,
      "learning_rate": 0.00011925111038579309,
      "loss": 0.2283,
      "step": 780
    },
    {
      "epoch": 0.4976228209191759,
      "grad_norm": 0.3021844529595635,
      "learning_rate": 0.00011816368509794364,
      "loss": 0.2967,
      "step": 785
    },
    {
      "epoch": 0.5007923930269413,
      "grad_norm": 0.3028161473676034,
      "learning_rate": 0.00011707403406977928,
      "loss": 0.2841,
      "step": 790
    },
    {
      "epoch": 0.5039619651347068,
      "grad_norm": 0.27418964538735746,
      "learning_rate": 0.00011598229082487784,
      "loss": 0.2803,
      "step": 795
    },
    {
      "epoch": 0.5071315372424723,
      "grad_norm": 0.3426638434156249,
      "learning_rate": 0.0001148885891431932,
      "loss": 0.3274,
      "step": 800
    },
    {
      "epoch": 0.5071315372424723,
      "eval_loss": 0.26855266094207764,
      "eval_runtime": 873.628,
      "eval_samples_per_second": 4.579,
      "eval_steps_per_second": 0.572,
      "step": 800
    },
    {
      "epoch": 0.5103011093502378,
      "grad_norm": 0.2681269338020656,
      "learning_rate": 0.00011379306304466198,
      "loss": 0.2381,
      "step": 805
    },
    {
      "epoch": 0.5134706814580031,
      "grad_norm": 0.2987060218422062,
      "learning_rate": 0.00011269584677278102,
      "loss": 0.3076,
      "step": 810
    },
    {
      "epoch": 0.5166402535657686,
      "grad_norm": 0.2804222341073312,
      "learning_rate": 0.00011159707477815755,
      "loss": 0.2395,
      "step": 815
    },
    {
      "epoch": 0.5198098256735341,
      "grad_norm": 0.25835895356413513,
      "learning_rate": 0.00011049688170203383,
      "loss": 0.3041,
      "step": 820
    },
    {
      "epoch": 0.5229793977812995,
      "grad_norm": 0.3313190058494361,
      "learning_rate": 0.00010939540235978845,
      "loss": 0.297,
      "step": 825
    },
    {
      "epoch": 0.526148969889065,
      "grad_norm": 0.2564972143294916,
      "learning_rate": 0.00010829277172441648,
      "loss": 0.2359,
      "step": 830
    },
    {
      "epoch": 0.5293185419968305,
      "grad_norm": 0.31632766018739716,
      "learning_rate": 0.00010718912490998991,
      "loss": 0.3112,
      "step": 835
    },
    {
      "epoch": 0.5324881141045958,
      "grad_norm": 0.2738970193614327,
      "learning_rate": 0.00010608459715510139,
      "loss": 0.2416,
      "step": 840
    },
    {
      "epoch": 0.5356576862123613,
      "grad_norm": 0.35306801364530893,
      "learning_rate": 0.00010497932380629207,
      "loss": 0.3334,
      "step": 845
    },
    {
      "epoch": 0.5388272583201268,
      "grad_norm": 0.3617753781992424,
      "learning_rate": 0.00010387344030146665,
      "loss": 0.3071,
      "step": 850
    },
    {
      "epoch": 0.5419968304278923,
      "grad_norm": 0.284695185318866,
      "learning_rate": 0.0001027670821532971,
      "loss": 0.2516,
      "step": 855
    },
    {
      "epoch": 0.5451664025356577,
      "grad_norm": 0.28641499966999695,
      "learning_rate": 0.00010166038493261722,
      "loss": 0.3268,
      "step": 860
    },
    {
      "epoch": 0.5483359746434231,
      "grad_norm": 0.29940254299061986,
      "learning_rate": 0.00010055348425181,
      "loss": 0.2667,
      "step": 865
    },
    {
      "epoch": 0.5515055467511886,
      "grad_norm": 0.33784906825030664,
      "learning_rate": 9.944651574819003e-05,
      "loss": 0.3006,
      "step": 870
    },
    {
      "epoch": 0.554675118858954,
      "grad_norm": 0.33800198210916443,
      "learning_rate": 9.83396150673828e-05,
      "loss": 0.3009,
      "step": 875
    },
    {
      "epoch": 0.5578446909667195,
      "grad_norm": 0.27814752259908526,
      "learning_rate": 9.72329178467029e-05,
      "loss": 0.25,
      "step": 880
    },
    {
      "epoch": 0.561014263074485,
      "grad_norm": 0.3120985607406773,
      "learning_rate": 9.612655969853336e-05,
      "loss": 0.3079,
      "step": 885
    },
    {
      "epoch": 0.5641838351822503,
      "grad_norm": 0.32270045792226343,
      "learning_rate": 9.502067619370794e-05,
      "loss": 0.2465,
      "step": 890
    },
    {
      "epoch": 0.5673534072900158,
      "grad_norm": 0.2522429392869884,
      "learning_rate": 9.391540284489862e-05,
      "loss": 0.3049,
      "step": 895
    },
    {
      "epoch": 0.5705229793977813,
      "grad_norm": 0.32479021947356745,
      "learning_rate": 9.281087509001011e-05,
      "loss": 0.3109,
      "step": 900
    },
    {
      "epoch": 0.5736925515055468,
      "grad_norm": 0.3071871099500722,
      "learning_rate": 9.170722827558358e-05,
      "loss": 0.2566,
      "step": 905
    },
    {
      "epoch": 0.5768621236133122,
      "grad_norm": 0.2808358292017096,
      "learning_rate": 9.060459764021156e-05,
      "loss": 0.2981,
      "step": 910
    },
    {
      "epoch": 0.5800316957210776,
      "grad_norm": 0.36613518181258947,
      "learning_rate": 8.950311829796619e-05,
      "loss": 0.2812,
      "step": 915
    },
    {
      "epoch": 0.5832012678288431,
      "grad_norm": 0.29120302112196544,
      "learning_rate": 8.840292522184247e-05,
      "loss": 0.2958,
      "step": 920
    },
    {
      "epoch": 0.5863708399366085,
      "grad_norm": 0.3008146054202439,
      "learning_rate": 8.730415322721897e-05,
      "loss": 0.3119,
      "step": 925
    },
    {
      "epoch": 0.589540412044374,
      "grad_norm": 0.30809505125548203,
      "learning_rate": 8.620693695533803e-05,
      "loss": 0.2603,
      "step": 930
    },
    {
      "epoch": 0.5927099841521395,
      "grad_norm": 0.3464042931932695,
      "learning_rate": 8.511141085680683e-05,
      "loss": 0.3217,
      "step": 935
    },
    {
      "epoch": 0.595879556259905,
      "grad_norm": 0.28395404105986655,
      "learning_rate": 8.401770917512221e-05,
      "loss": 0.2339,
      "step": 940
    },
    {
      "epoch": 0.5990491283676703,
      "grad_norm": 0.32456815689823176,
      "learning_rate": 8.292596593022075e-05,
      "loss": 0.2761,
      "step": 945
    },
    {
      "epoch": 0.6022187004754358,
      "grad_norm": 0.35814205267620147,
      "learning_rate": 8.183631490205637e-05,
      "loss": 0.3064,
      "step": 950
    },
    {
      "epoch": 0.6053882725832013,
      "grad_norm": 0.3307025804465351,
      "learning_rate": 8.074888961420695e-05,
      "loss": 0.2317,
      "step": 955
    },
    {
      "epoch": 0.6085578446909667,
      "grad_norm": 0.3035093202164917,
      "learning_rate": 7.966382331751277e-05,
      "loss": 0.3024,
      "step": 960
    },
    {
      "epoch": 0.6117274167987322,
      "grad_norm": 0.23483953416505404,
      "learning_rate": 7.858124897374837e-05,
      "loss": 0.2616,
      "step": 965
    },
    {
      "epoch": 0.6148969889064976,
      "grad_norm": 0.24795445024402282,
      "learning_rate": 7.750129923932939e-05,
      "loss": 0.2889,
      "step": 970
    },
    {
      "epoch": 0.618066561014263,
      "grad_norm": 0.39470726118892546,
      "learning_rate": 7.642410644905726e-05,
      "loss": 0.3255,
      "step": 975
    },
    {
      "epoch": 0.6212361331220285,
      "grad_norm": 0.28578857562483734,
      "learning_rate": 7.534980259990341e-05,
      "loss": 0.2177,
      "step": 980
    },
    {
      "epoch": 0.624405705229794,
      "grad_norm": 0.293120691065387,
      "learning_rate": 7.427851933483418e-05,
      "loss": 0.3008,
      "step": 985
    },
    {
      "epoch": 0.6275752773375595,
      "grad_norm": 0.28050824031198807,
      "learning_rate": 7.321038792667987e-05,
      "loss": 0.2617,
      "step": 990
    },
    {
      "epoch": 0.6307448494453248,
      "grad_norm": 0.3421819179459905,
      "learning_rate": 7.214553926204883e-05,
      "loss": 0.2827,
      "step": 995
    },
    {
      "epoch": 0.6339144215530903,
      "grad_norm": 0.3825000717076991,
      "learning_rate": 7.108410382528879e-05,
      "loss": 0.3171,
      "step": 1000
    },
    {
      "epoch": 0.6339144215530903,
      "eval_loss": 0.2597305178642273,
      "eval_runtime": 873.3574,
      "eval_samples_per_second": 4.58,
      "eval_steps_per_second": 0.573,
      "step": 1000
    },
    {
      "epoch": 0.6370839936608558,
      "grad_norm": 0.293460396656183,
      "learning_rate": 7.002621168249759e-05,
      "loss": 0.2297,
      "step": 1005
    },
    {
      "epoch": 0.6402535657686212,
      "grad_norm": 0.3006160040194,
      "learning_rate": 6.897199246558514e-05,
      "loss": 0.2956,
      "step": 1010
    },
    {
      "epoch": 0.6434231378763867,
      "grad_norm": 0.2791223126874652,
      "learning_rate": 6.792157535638874e-05,
      "loss": 0.2496,
      "step": 1015
    },
    {
      "epoch": 0.6465927099841522,
      "grad_norm": 0.2894662197144813,
      "learning_rate": 6.687508907084319e-05,
      "loss": 0.2866,
      "step": 1020
    },
    {
      "epoch": 0.6497622820919176,
      "grad_norm": 0.33156274133370534,
      "learning_rate": 6.583266184320836e-05,
      "loss": 0.32,
      "step": 1025
    },
    {
      "epoch": 0.652931854199683,
      "grad_norm": 0.3447301699746775,
      "learning_rate": 6.479442141035556e-05,
      "loss": 0.2555,
      "step": 1030
    },
    {
      "epoch": 0.6561014263074485,
      "grad_norm": 0.3019937172628048,
      "learning_rate": 6.376049499611496e-05,
      "loss": 0.2632,
      "step": 1035
    },
    {
      "epoch": 0.659270998415214,
      "grad_norm": 0.25047087286035274,
      "learning_rate": 6.273100929568578e-05,
      "loss": 0.2472,
      "step": 1040
    },
    {
      "epoch": 0.6624405705229794,
      "grad_norm": 0.31801398649186896,
      "learning_rate": 6.170609046011151e-05,
      "loss": 0.2793,
      "step": 1045
    },
    {
      "epoch": 0.6656101426307448,
      "grad_norm": 0.3464523898432614,
      "learning_rate": 6.068586408082133e-05,
      "loss": 0.3138,
      "step": 1050
    },
    {
      "epoch": 0.6687797147385103,
      "grad_norm": 0.2919062799416737,
      "learning_rate": 5.9670455174240614e-05,
      "loss": 0.2427,
      "step": 1055
    },
    {
      "epoch": 0.6719492868462758,
      "grad_norm": 0.29267872629520425,
      "learning_rate": 5.865998816647171e-05,
      "loss": 0.3038,
      "step": 1060
    },
    {
      "epoch": 0.6751188589540412,
      "grad_norm": 0.27361822239828004,
      "learning_rate": 5.765458687804679e-05,
      "loss": 0.2566,
      "step": 1065
    },
    {
      "epoch": 0.6782884310618067,
      "grad_norm": 0.3050132066017946,
      "learning_rate": 5.665437450875534e-05,
      "loss": 0.2752,
      "step": 1070
    },
    {
      "epoch": 0.6814580031695721,
      "grad_norm": 0.3580338711915158,
      "learning_rate": 5.565947362254746e-05,
      "loss": 0.3331,
      "step": 1075
    },
    {
      "epoch": 0.6846275752773375,
      "grad_norm": 0.26747930377415474,
      "learning_rate": 5.467000613251516e-05,
      "loss": 0.2429,
      "step": 1080
    },
    {
      "epoch": 0.687797147385103,
      "grad_norm": 0.32226567868782413,
      "learning_rate": 5.368609328595323e-05,
      "loss": 0.3208,
      "step": 1085
    },
    {
      "epoch": 0.6909667194928685,
      "grad_norm": 0.27314417996148593,
      "learning_rate": 5.270785564950208e-05,
      "loss": 0.2351,
      "step": 1090
    },
    {
      "epoch": 0.694136291600634,
      "grad_norm": 0.31179553442460595,
      "learning_rate": 5.1735413094373594e-05,
      "loss": 0.2791,
      "step": 1095
    },
    {
      "epoch": 0.6973058637083994,
      "grad_norm": 0.2983027582550753,
      "learning_rate": 5.0768884781662465e-05,
      "loss": 0.3123,
      "step": 1100
    },
    {
      "epoch": 0.7004754358161648,
      "grad_norm": 0.268619063810808,
      "learning_rate": 4.9808389147744195e-05,
      "loss": 0.2675,
      "step": 1105
    },
    {
      "epoch": 0.7036450079239303,
      "grad_norm": 0.34151620569667657,
      "learning_rate": 4.885404388976261e-05,
      "loss": 0.3171,
      "step": 1110
    },
    {
      "epoch": 0.7068145800316957,
      "grad_norm": 0.25963093128586956,
      "learning_rate": 4.790596595120699e-05,
      "loss": 0.2533,
      "step": 1115
    },
    {
      "epoch": 0.7099841521394612,
      "grad_norm": 0.3373621924020373,
      "learning_rate": 4.696427150758238e-05,
      "loss": 0.3017,
      "step": 1120
    },
    {
      "epoch": 0.7131537242472267,
      "grad_norm": 0.32633352666577314,
      "learning_rate": 4.6029075952173596e-05,
      "loss": 0.3052,
      "step": 1125
    },
    {
      "epoch": 0.716323296354992,
      "grad_norm": 0.24971258370165642,
      "learning_rate": 4.510049388190518e-05,
      "loss": 0.2044,
      "step": 1130
    },
    {
      "epoch": 0.7194928684627575,
      "grad_norm": 0.29602844393415106,
      "learning_rate": 4.417863908329884e-05,
      "loss": 0.2959,
      "step": 1135
    },
    {
      "epoch": 0.722662440570523,
      "grad_norm": 0.23146594836780063,
      "learning_rate": 4.32636245185304e-05,
      "loss": 0.2252,
      "step": 1140
    },
    {
      "epoch": 0.7258320126782885,
      "grad_norm": 0.2744736835188008,
      "learning_rate": 4.235556231158765e-05,
      "loss": 0.2884,
      "step": 1145
    },
    {
      "epoch": 0.7290015847860539,
      "grad_norm": 0.27538990975844047,
      "learning_rate": 4.145456373453087e-05,
      "loss": 0.2981,
      "step": 1150
    },
    {
      "epoch": 0.7321711568938193,
      "grad_norm": 0.3032208366026702,
      "learning_rate": 4.0560739193857625e-05,
      "loss": 0.2158,
      "step": 1155
    },
    {
      "epoch": 0.7353407290015848,
      "grad_norm": 0.27204457210068295,
      "learning_rate": 3.96741982169742e-05,
      "loss": 0.3028,
      "step": 1160
    },
    {
      "epoch": 0.7385103011093502,
      "grad_norm": 0.28301662262727184,
      "learning_rate": 3.8795049438773825e-05,
      "loss": 0.2946,
      "step": 1165
    },
    {
      "epoch": 0.7416798732171157,
      "grad_norm": 0.2884264535746388,
      "learning_rate": 3.7923400588325155e-05,
      "loss": 0.3015,
      "step": 1170
    },
    {
      "epoch": 0.7448494453248812,
      "grad_norm": 0.3186549926460967,
      "learning_rate": 3.7059358475671224e-05,
      "loss": 0.2773,
      "step": 1175
    },
    {
      "epoch": 0.7480190174326465,
      "grad_norm": 0.2997708530371057,
      "learning_rate": 3.6203028978741226e-05,
      "loss": 0.2469,
      "step": 1180
    },
    {
      "epoch": 0.751188589540412,
      "grad_norm": 0.32430776300917263,
      "learning_rate": 3.535451703037626e-05,
      "loss": 0.2726,
      "step": 1185
    },
    {
      "epoch": 0.7543581616481775,
      "grad_norm": 0.2946578935656507,
      "learning_rate": 3.45139266054715e-05,
      "loss": 0.2645,
      "step": 1190
    },
    {
      "epoch": 0.757527733755943,
      "grad_norm": 0.26638481808591286,
      "learning_rate": 3.368136070823478e-05,
      "loss": 0.2465,
      "step": 1195
    },
    {
      "epoch": 0.7606973058637084,
      "grad_norm": 0.3677636374426017,
      "learning_rate": 3.285692135956515e-05,
      "loss": 0.3034,
      "step": 1200
    },
    {
      "epoch": 0.7606973058637084,
      "eval_loss": 0.2539891302585602,
      "eval_runtime": 873.4669,
      "eval_samples_per_second": 4.579,
      "eval_steps_per_second": 0.572,
      "step": 1200
    },
    {
      "epoch": 0.7638668779714739,
      "grad_norm": 0.29762017072344943,
      "learning_rate": 3.2040709584551095e-05,
      "loss": 0.2547,
      "step": 1205
    },
    {
      "epoch": 0.7670364500792393,
      "grad_norm": 0.35066724794986226,
      "learning_rate": 3.123282540009139e-05,
      "loss": 0.3043,
      "step": 1210
    },
    {
      "epoch": 0.7702060221870047,
      "grad_norm": 0.27108651599825634,
      "learning_rate": 3.0433367802639112e-05,
      "loss": 0.2195,
      "step": 1215
    },
    {
      "epoch": 0.7733755942947702,
      "grad_norm": 0.24030479810127725,
      "learning_rate": 2.9642434756070793e-05,
      "loss": 0.2545,
      "step": 1220
    },
    {
      "epoch": 0.7765451664025357,
      "grad_norm": 0.288327556838552,
      "learning_rate": 2.8860123179682242e-05,
      "loss": 0.2942,
      "step": 1225
    },
    {
      "epoch": 0.7797147385103012,
      "grad_norm": 0.29997783643544385,
      "learning_rate": 2.8086528936312073e-05,
      "loss": 0.2407,
      "step": 1230
    },
    {
      "epoch": 0.7828843106180665,
      "grad_norm": 0.2665313932594352,
      "learning_rate": 2.7321746820595086e-05,
      "loss": 0.2863,
      "step": 1235
    },
    {
      "epoch": 0.786053882725832,
      "grad_norm": 0.24138106294481415,
      "learning_rate": 2.6565870547346196e-05,
      "loss": 0.2443,
      "step": 1240
    },
    {
      "epoch": 0.7892234548335975,
      "grad_norm": 0.27410565336257203,
      "learning_rate": 2.5818992740076873e-05,
      "loss": 0.2714,
      "step": 1245
    },
    {
      "epoch": 0.7923930269413629,
      "grad_norm": 0.3607807135248553,
      "learning_rate": 2.508120491964512e-05,
      "loss": 0.3131,
      "step": 1250
    },
    {
      "epoch": 0.7955625990491284,
      "grad_norm": 0.2752324746545014,
      "learning_rate": 2.435259749304096e-05,
      "loss": 0.2352,
      "step": 1255
    },
    {
      "epoch": 0.7987321711568938,
      "grad_norm": 0.33701412326580854,
      "learning_rate": 2.3633259742307844e-05,
      "loss": 0.3121,
      "step": 1260
    },
    {
      "epoch": 0.8019017432646592,
      "grad_norm": 0.2719696587030905,
      "learning_rate": 2.292327981360245e-05,
      "loss": 0.2569,
      "step": 1265
    },
    {
      "epoch": 0.8050713153724247,
      "grad_norm": 0.321470064394813,
      "learning_rate": 2.222274470639324e-05,
      "loss": 0.2903,
      "step": 1270
    },
    {
      "epoch": 0.8082408874801902,
      "grad_norm": 0.33376441935823614,
      "learning_rate": 2.1531740262800004e-05,
      "loss": 0.2712,
      "step": 1275
    },
    {
      "epoch": 0.8114104595879557,
      "grad_norm": 0.3559808478292093,
      "learning_rate": 2.0850351157074598e-05,
      "loss": 0.2485,
      "step": 1280
    },
    {
      "epoch": 0.8145800316957211,
      "grad_norm": 0.3006799560470683,
      "learning_rate": 2.017866088522541e-05,
      "loss": 0.2735,
      "step": 1285
    },
    {
      "epoch": 0.8177496038034865,
      "grad_norm": 0.27868991819615774,
      "learning_rate": 1.951675175478579e-05,
      "loss": 0.2479,
      "step": 1290
    },
    {
      "epoch": 0.820919175911252,
      "grad_norm": 0.30796745550467525,
      "learning_rate": 1.8864704874728346e-05,
      "loss": 0.2693,
      "step": 1295
    },
    {
      "epoch": 0.8240887480190174,
      "grad_norm": 0.327384705590186,
      "learning_rate": 1.822260014552587e-05,
      "loss": 0.2787,
      "step": 1300
    },
    {
      "epoch": 0.8272583201267829,
      "grad_norm": 0.2993843751525639,
      "learning_rate": 1.7590516249360754e-05,
      "loss": 0.2455,
      "step": 1305
    },
    {
      "epoch": 0.8304278922345484,
      "grad_norm": 0.2979918507317238,
      "learning_rate": 1.6968530640483127e-05,
      "loss": 0.2889,
      "step": 1310
    },
    {
      "epoch": 0.8335974643423137,
      "grad_norm": 0.2942240760065363,
      "learning_rate": 1.6356719535720056e-05,
      "loss": 0.2557,
      "step": 1315
    },
    {
      "epoch": 0.8367670364500792,
      "grad_norm": 0.31698805935759067,
      "learning_rate": 1.5755157905135843e-05,
      "loss": 0.2842,
      "step": 1320
    },
    {
      "epoch": 0.8399366085578447,
      "grad_norm": 0.3795639487558114,
      "learning_rate": 1.5163919462845622e-05,
      "loss": 0.2979,
      "step": 1325
    },
    {
      "epoch": 0.8431061806656102,
      "grad_norm": 0.2933950396246441,
      "learning_rate": 1.4583076657982297e-05,
      "loss": 0.2291,
      "step": 1330
    },
    {
      "epoch": 0.8462757527733756,
      "grad_norm": 0.25934135222761445,
      "learning_rate": 1.401270066581899e-05,
      "loss": 0.2981,
      "step": 1335
    },
    {
      "epoch": 0.849445324881141,
      "grad_norm": 0.2512793866151091,
      "learning_rate": 1.3452861379047287e-05,
      "loss": 0.2299,
      "step": 1340
    },
    {
      "epoch": 0.8526148969889065,
      "grad_norm": 0.27890392188122143,
      "learning_rate": 1.2903627399212747e-05,
      "loss": 0.2714,
      "step": 1345
    },
    {
      "epoch": 0.8557844690966719,
      "grad_norm": 0.3540435753559853,
      "learning_rate": 1.2365066028308547e-05,
      "loss": 0.3208,
      "step": 1350
    },
    {
      "epoch": 0.8589540412044374,
      "grad_norm": 0.3170188652169802,
      "learning_rate": 1.183724326052854e-05,
      "loss": 0.261,
      "step": 1355
    },
    {
      "epoch": 0.8621236133122029,
      "grad_norm": 0.287259110452561,
      "learning_rate": 1.1320223774180428e-05,
      "loss": 0.2918,
      "step": 1360
    },
    {
      "epoch": 0.8652931854199684,
      "grad_norm": 0.3145063929825825,
      "learning_rate": 1.0814070923760178e-05,
      "loss": 0.2562,
      "step": 1365
    },
    {
      "epoch": 0.8684627575277337,
      "grad_norm": 0.29883537499670176,
      "learning_rate": 1.0318846732188737e-05,
      "loss": 0.2585,
      "step": 1370
    },
    {
      "epoch": 0.8716323296354992,
      "grad_norm": 0.33602754178177113,
      "learning_rate": 9.834611883211797e-06,
      "loss": 0.303,
      "step": 1375
    },
    {
      "epoch": 0.8748019017432647,
      "grad_norm": 0.27917699955310804,
      "learning_rate": 9.361425713963878e-06,
      "loss": 0.2399,
      "step": 1380
    },
    {
      "epoch": 0.8779714738510301,
      "grad_norm": 0.29322424380757633,
      "learning_rate": 8.899346207697134e-06,
      "loss": 0.3192,
      "step": 1385
    },
    {
      "epoch": 0.8811410459587956,
      "grad_norm": 0.32716078301472046,
      "learning_rate": 8.448429986676298e-06,
      "loss": 0.256,
      "step": 1390
    },
    {
      "epoch": 0.884310618066561,
      "grad_norm": 0.28468261231564157,
      "learning_rate": 8.00873230524023e-06,
      "loss": 0.2864,
      "step": 1395
    },
    {
      "epoch": 0.8874801901743264,
      "grad_norm": 0.3481974787604397,
      "learning_rate": 7.580307043031232e-06,
      "loss": 0.265,
      "step": 1400
    },
    {
      "epoch": 0.8874801901743264,
      "eval_loss": 0.25099214911460876,
      "eval_runtime": 873.7854,
      "eval_samples_per_second": 4.578,
      "eval_steps_per_second": 0.572,
      "step": 1400
    },
    {
      "epoch": 0.8906497622820919,
      "grad_norm": 0.2756744352775957,
      "learning_rate": 7.163206698392744e-06,
      "loss": 0.2392,
      "step": 1405
    },
    {
      "epoch": 0.8938193343898574,
      "grad_norm": 0.3070714015760399,
      "learning_rate": 6.757482381936264e-06,
      "loss": 0.2722,
      "step": 1410
    },
    {
      "epoch": 0.8969889064976229,
      "grad_norm": 0.2719682030351016,
      "learning_rate": 6.36318381027835e-06,
      "loss": 0.2553,
      "step": 1415
    },
    {
      "epoch": 0.9001584786053882,
      "grad_norm": 0.30754515515844727,
      "learning_rate": 5.980359299948568e-06,
      "loss": 0.2763,
      "step": 1420
    },
    {
      "epoch": 0.9033280507131537,
      "grad_norm": 0.3599613866897873,
      "learning_rate": 5.609055761468707e-06,
      "loss": 0.2987,
      "step": 1425
    },
    {
      "epoch": 0.9064976228209192,
      "grad_norm": 0.26662442413818216,
      "learning_rate": 5.249318693604577e-06,
      "loss": 0.2632,
      "step": 1430
    },
    {
      "epoch": 0.9096671949286846,
      "grad_norm": 0.2965993748242227,
      "learning_rate": 4.901192177790692e-06,
      "loss": 0.2799,
      "step": 1435
    },
    {
      "epoch": 0.9128367670364501,
      "grad_norm": 0.2923839300339188,
      "learning_rate": 4.564718872728568e-06,
      "loss": 0.2464,
      "step": 1440
    },
    {
      "epoch": 0.9160063391442155,
      "grad_norm": 0.3004256474409844,
      "learning_rate": 4.2399400091594154e-06,
      "loss": 0.2775,
      "step": 1445
    },
    {
      "epoch": 0.919175911251981,
      "grad_norm": 0.30636844288189197,
      "learning_rate": 3.926895384811835e-06,
      "loss": 0.2917,
      "step": 1450
    },
    {
      "epoch": 0.9223454833597464,
      "grad_norm": 0.27018058178290905,
      "learning_rate": 3.625623359525099e-06,
      "loss": 0.2522,
      "step": 1455
    },
    {
      "epoch": 0.9255150554675119,
      "grad_norm": 0.3069766309513976,
      "learning_rate": 3.33616085054862e-06,
      "loss": 0.2722,
      "step": 1460
    },
    {
      "epoch": 0.9286846275752774,
      "grad_norm": 0.2673579253849767,
      "learning_rate": 3.0585433280180707e-06,
      "loss": 0.2561,
      "step": 1465
    },
    {
      "epoch": 0.9318541996830428,
      "grad_norm": 0.2688001276727079,
      "learning_rate": 2.792804810609173e-06,
      "loss": 0.2718,
      "step": 1470
    },
    {
      "epoch": 0.9350237717908082,
      "grad_norm": 0.3331860222359942,
      "learning_rate": 2.538977861368874e-06,
      "loss": 0.3163,
      "step": 1475
    },
    {
      "epoch": 0.9381933438985737,
      "grad_norm": 0.2668325932813764,
      "learning_rate": 2.2970935837253182e-06,
      "loss": 0.2393,
      "step": 1480
    },
    {
      "epoch": 0.9413629160063391,
      "grad_norm": 0.3285498156618503,
      "learning_rate": 2.0671816176764058e-06,
      "loss": 0.2862,
      "step": 1485
    },
    {
      "epoch": 0.9445324881141046,
      "grad_norm": 0.36573862269188245,
      "learning_rate": 1.8492701361578324e-06,
      "loss": 0.2447,
      "step": 1490
    },
    {
      "epoch": 0.9477020602218701,
      "grad_norm": 0.2864139944423568,
      "learning_rate": 1.6433858415907278e-06,
      "loss": 0.2777,
      "step": 1495
    },
    {
      "epoch": 0.9508716323296355,
      "grad_norm": 0.323741034773291,
      "learning_rate": 1.4495539626097288e-06,
      "loss": 0.3086,
      "step": 1500
    },
    {
      "epoch": 0.9540412044374009,
      "grad_norm": 0.2857388007026186,
      "learning_rate": 1.2677982509714415e-06,
      "loss": 0.2175,
      "step": 1505
    },
    {
      "epoch": 0.9572107765451664,
      "grad_norm": 0.2813011213045847,
      "learning_rate": 1.0981409786439355e-06,
      "loss": 0.2882,
      "step": 1510
    },
    {
      "epoch": 0.9603803486529319,
      "grad_norm": 0.27685594779071976,
      "learning_rate": 9.40602935077639e-07,
      "loss": 0.23,
      "step": 1515
    },
    {
      "epoch": 0.9635499207606973,
      "grad_norm": 0.278082958417837,
      "learning_rate": 7.952034246577977e-07,
      "loss": 0.2814,
      "step": 1520
    },
    {
      "epoch": 0.9667194928684627,
      "grad_norm": 0.332411253150925,
      "learning_rate": 6.619602643389899e-07,
      "loss": 0.2772,
      "step": 1525
    },
    {
      "epoch": 0.9698890649762282,
      "grad_norm": 0.28541188324654354,
      "learning_rate": 5.408897814618175e-07,
      "loss": 0.2456,
      "step": 1530
    },
    {
      "epoch": 0.9730586370839936,
      "grad_norm": 0.289051402982161,
      "learning_rate": 4.320068117522835e-07,
      "loss": 0.2659,
      "step": 1535
    },
    {
      "epoch": 0.9762282091917591,
      "grad_norm": 0.2896831822321737,
      "learning_rate": 3.35324697503725e-07,
      "loss": 0.2721,
      "step": 1540
    },
    {
      "epoch": 0.9793977812995246,
      "grad_norm": 0.31950029347694936,
      "learning_rate": 2.508552859419977e-07,
      "loss": 0.2622,
      "step": 1545
    },
    {
      "epoch": 0.9825673534072901,
      "grad_norm": 0.33661523682392047,
      "learning_rate": 1.7860892777367133e-07,
      "loss": 0.2731,
      "step": 1550
    },
    {
      "epoch": 0.9857369255150554,
      "grad_norm": 0.2522879758084615,
      "learning_rate": 1.1859447591769934e-07,
      "loss": 0.2291,
      "step": 1555
    },
    {
      "epoch": 0.9889064976228209,
      "grad_norm": 0.2923729662272973,
      "learning_rate": 7.081928442057573e-08,
      "loss": 0.2972,
      "step": 1560
    },
    {
      "epoch": 0.9920760697305864,
      "grad_norm": 0.24814229174923821,
      "learning_rate": 3.5289207555233573e-08,
      "loss": 0.2586,
      "step": 1565
    },
    {
      "epoch": 0.9952456418383518,
      "grad_norm": 0.24322900794711846,
      "learning_rate": 1.2008599103618956e-08,
      "loss": 0.2751,
      "step": 1570
    },
    {
      "epoch": 0.9984152139461173,
      "grad_norm": 0.4374899080765362,
      "learning_rate": 9.803118232398768e-10,
      "loss": 0.2981,
      "step": 1575
    },
    {
      "epoch": 0.9996830427892235,
      "step": 1577,
      "total_flos": 8013042675351552.0,
      "train_loss": 0.3010184336819827,
      "train_runtime": 18281.3669,
      "train_samples_per_second": 1.381,
      "train_steps_per_second": 0.086
    }
  ],
  "logging_steps": 5,
  "max_steps": 1577,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8013042675351552.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}