{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9978308026030369,
  "eval_steps": 58,
  "global_step": 230,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004338394793926247,
      "grad_norm": 0.2975526452064514,
      "learning_rate": 4.000000000000001e-06,
      "loss": 3.4415,
      "step": 1
    },
    {
      "epoch": 0.004338394793926247,
      "eval_loss": 4.956099033355713,
      "eval_runtime": 43.9816,
      "eval_samples_per_second": 8.845,
      "eval_steps_per_second": 2.228,
      "step": 1
    },
    {
      "epoch": 0.008676789587852495,
      "grad_norm": 0.41739514470100403,
      "learning_rate": 8.000000000000001e-06,
      "loss": 3.5934,
      "step": 2
    },
    {
      "epoch": 0.013015184381778741,
      "grad_norm": 0.47740957140922546,
      "learning_rate": 1.2e-05,
      "loss": 3.7866,
      "step": 3
    },
    {
      "epoch": 0.01735357917570499,
      "grad_norm": 0.5908945798873901,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 3.8979,
      "step": 4
    },
    {
      "epoch": 0.021691973969631236,
      "grad_norm": 0.602057933807373,
      "learning_rate": 2e-05,
      "loss": 3.8257,
      "step": 5
    },
    {
      "epoch": 0.026030368763557483,
      "grad_norm": 0.7276618480682373,
      "learning_rate": 2.4e-05,
      "loss": 4.0716,
      "step": 6
    },
    {
      "epoch": 0.03036876355748373,
      "grad_norm": 0.7895606160163879,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 4.1982,
      "step": 7
    },
    {
      "epoch": 0.03470715835140998,
      "grad_norm": 0.9524717926979065,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 4.1916,
      "step": 8
    },
    {
      "epoch": 0.039045553145336226,
      "grad_norm": 0.9786620736122131,
      "learning_rate": 3.6e-05,
      "loss": 4.0969,
      "step": 9
    },
    {
      "epoch": 0.04338394793926247,
      "grad_norm": 1.0913058519363403,
      "learning_rate": 4e-05,
      "loss": 4.3758,
      "step": 10
    },
    {
      "epoch": 0.04772234273318872,
      "grad_norm": 1.2519152164459229,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 4.3174,
      "step": 11
    },
    {
      "epoch": 0.052060737527114966,
      "grad_norm": 1.4428540468215942,
      "learning_rate": 4.8e-05,
      "loss": 4.2927,
      "step": 12
    },
    {
      "epoch": 0.05639913232104121,
      "grad_norm": 1.557953953742981,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 4.3237,
      "step": 13
    },
    {
      "epoch": 0.06073752711496746,
      "grad_norm": 1.791407585144043,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 4.3922,
      "step": 14
    },
    {
      "epoch": 0.0650759219088937,
      "grad_norm": 1.829128623008728,
      "learning_rate": 6e-05,
      "loss": 4.2218,
      "step": 15
    },
    {
      "epoch": 0.06941431670281996,
      "grad_norm": 1.8731590509414673,
      "learning_rate": 6.400000000000001e-05,
      "loss": 4.294,
      "step": 16
    },
    {
      "epoch": 0.0737527114967462,
      "grad_norm": 2.140212297439575,
      "learning_rate": 6.800000000000001e-05,
      "loss": 4.3197,
      "step": 17
    },
    {
      "epoch": 0.07809110629067245,
      "grad_norm": 2.5610997676849365,
      "learning_rate": 7.2e-05,
      "loss": 4.3203,
      "step": 18
    },
    {
      "epoch": 0.0824295010845987,
      "grad_norm": 2.5937764644622803,
      "learning_rate": 7.6e-05,
      "loss": 4.2128,
      "step": 19
    },
    {
      "epoch": 0.08676789587852494,
      "grad_norm": 1.9964145421981812,
      "learning_rate": 8e-05,
      "loss": 4.1141,
      "step": 20
    },
    {
      "epoch": 0.0911062906724512,
      "grad_norm": 1.9274357557296753,
      "learning_rate": 8.4e-05,
      "loss": 3.9559,
      "step": 21
    },
    {
      "epoch": 0.09544468546637744,
      "grad_norm": 2.1689515113830566,
      "learning_rate": 8.800000000000001e-05,
      "loss": 4.0459,
      "step": 22
    },
    {
      "epoch": 0.09978308026030369,
      "grad_norm": 2.417027235031128,
      "learning_rate": 9.200000000000001e-05,
      "loss": 3.8996,
      "step": 23
    },
    {
      "epoch": 0.10412147505422993,
      "grad_norm": 2.925503969192505,
      "learning_rate": 9.6e-05,
      "loss": 4.0918,
      "step": 24
    },
    {
      "epoch": 0.10845986984815618,
      "grad_norm": 4.8928961753845215,
      "learning_rate": 0.0001,
      "loss": 4.332,
      "step": 25
    },
    {
      "epoch": 0.11279826464208242,
      "grad_norm": 3.4765207767486572,
      "learning_rate": 0.00010400000000000001,
      "loss": 3.7618,
      "step": 26
    },
    {
      "epoch": 0.11713665943600868,
      "grad_norm": 3.5958409309387207,
      "learning_rate": 0.00010800000000000001,
      "loss": 3.7645,
      "step": 27
    },
    {
      "epoch": 0.12147505422993492,
      "grad_norm": 3.053165912628174,
      "learning_rate": 0.00011200000000000001,
      "loss": 3.6536,
      "step": 28
    },
    {
      "epoch": 0.12581344902386118,
      "grad_norm": 2.3347203731536865,
      "learning_rate": 0.000116,
      "loss": 3.8871,
      "step": 29
    },
    {
      "epoch": 0.1301518438177874,
      "grad_norm": 1.4141613245010376,
      "learning_rate": 0.00012,
      "loss": 3.6367,
      "step": 30
    },
    {
      "epoch": 0.13449023861171366,
      "grad_norm": 1.1042953729629517,
      "learning_rate": 0.000124,
      "loss": 3.4273,
      "step": 31
    },
    {
      "epoch": 0.13882863340563992,
      "grad_norm": 0.9391370415687561,
      "learning_rate": 0.00012800000000000002,
      "loss": 3.635,
      "step": 32
    },
    {
      "epoch": 0.14316702819956617,
      "grad_norm": 1.028341293334961,
      "learning_rate": 0.000132,
      "loss": 3.845,
      "step": 33
    },
    {
      "epoch": 0.1475054229934924,
      "grad_norm": 1.0668063163757324,
      "learning_rate": 0.00013600000000000003,
      "loss": 3.732,
      "step": 34
    },
    {
      "epoch": 0.15184381778741865,
      "grad_norm": 1.0369871854782104,
      "learning_rate": 0.00014,
      "loss": 3.6734,
      "step": 35
    },
    {
      "epoch": 0.1561822125813449,
      "grad_norm": 1.0699695348739624,
      "learning_rate": 0.000144,
      "loss": 3.5469,
      "step": 36
    },
    {
      "epoch": 0.16052060737527116,
      "grad_norm": 1.1715625524520874,
      "learning_rate": 0.000148,
      "loss": 3.5759,
      "step": 37
    },
    {
      "epoch": 0.1648590021691974,
      "grad_norm": 1.2680530548095703,
      "learning_rate": 0.000152,
      "loss": 3.7013,
      "step": 38
    },
    {
      "epoch": 0.16919739696312364,
      "grad_norm": 1.2043352127075195,
      "learning_rate": 0.00015600000000000002,
      "loss": 3.74,
      "step": 39
    },
    {
      "epoch": 0.1735357917570499,
      "grad_norm": 1.342244029045105,
      "learning_rate": 0.00016,
      "loss": 3.7761,
      "step": 40
    },
    {
      "epoch": 0.17787418655097614,
      "grad_norm": 1.4112831354141235,
      "learning_rate": 0.000164,
      "loss": 3.6449,
      "step": 41
    },
    {
      "epoch": 0.1822125813449024,
      "grad_norm": 1.3947268724441528,
      "learning_rate": 0.000168,
      "loss": 3.6043,
      "step": 42
    },
    {
      "epoch": 0.18655097613882862,
      "grad_norm": 1.5763946771621704,
      "learning_rate": 0.000172,
      "loss": 3.4768,
      "step": 43
    },
    {
      "epoch": 0.19088937093275488,
      "grad_norm": 1.9006760120391846,
      "learning_rate": 0.00017600000000000002,
      "loss": 3.6424,
      "step": 44
    },
    {
      "epoch": 0.19522776572668113,
      "grad_norm": 2.0071113109588623,
      "learning_rate": 0.00018,
      "loss": 3.7499,
      "step": 45
    },
    {
      "epoch": 0.19956616052060738,
      "grad_norm": 2.002067804336548,
      "learning_rate": 0.00018400000000000003,
      "loss": 3.6082,
      "step": 46
    },
    {
      "epoch": 0.2039045553145336,
      "grad_norm": 2.4698357582092285,
      "learning_rate": 0.000188,
      "loss": 3.7604,
      "step": 47
    },
    {
      "epoch": 0.20824295010845986,
      "grad_norm": 3.051906108856201,
      "learning_rate": 0.000192,
      "loss": 3.7546,
      "step": 48
    },
    {
      "epoch": 0.21258134490238612,
      "grad_norm": 3.100890636444092,
      "learning_rate": 0.000196,
      "loss": 3.6134,
      "step": 49
    },
    {
      "epoch": 0.21691973969631237,
      "grad_norm": 4.4481425285339355,
      "learning_rate": 0.0002,
      "loss": 3.355,
      "step": 50
    },
    {
      "epoch": 0.22125813449023862,
      "grad_norm": 4.157866954803467,
      "learning_rate": 0.00019998476951563915,
      "loss": 3.5229,
      "step": 51
    },
    {
      "epoch": 0.22559652928416485,
      "grad_norm": 5.159533500671387,
      "learning_rate": 0.0001999390827019096,
      "loss": 3.9098,
      "step": 52
    },
    {
      "epoch": 0.2299349240780911,
      "grad_norm": 4.372255325317383,
      "learning_rate": 0.0001998629534754574,
      "loss": 3.8285,
      "step": 53
    },
    {
      "epoch": 0.23427331887201736,
      "grad_norm": 2.7389180660247803,
      "learning_rate": 0.00019975640502598244,
      "loss": 3.5892,
      "step": 54
    },
    {
      "epoch": 0.2386117136659436,
      "grad_norm": 1.623792290687561,
      "learning_rate": 0.00019961946980917456,
      "loss": 3.6253,
      "step": 55
    },
    {
      "epoch": 0.24295010845986983,
      "grad_norm": 1.0698862075805664,
      "learning_rate": 0.00019945218953682734,
      "loss": 3.4747,
      "step": 56
    },
    {
      "epoch": 0.2472885032537961,
      "grad_norm": 1.0480446815490723,
      "learning_rate": 0.00019925461516413223,
      "loss": 3.5076,
      "step": 57
    },
    {
      "epoch": 0.25162689804772237,
      "grad_norm": 1.1356984376907349,
      "learning_rate": 0.00019902680687415705,
      "loss": 3.4875,
      "step": 58
    },
    {
      "epoch": 0.25162689804772237,
      "eval_loss": 3.5454585552215576,
      "eval_runtime": 43.9485,
      "eval_samples_per_second": 8.851,
      "eval_steps_per_second": 2.23,
      "step": 58
    },
    {
      "epoch": 0.2559652928416486,
      "grad_norm": 1.147985816001892,
      "learning_rate": 0.00019876883405951377,
      "loss": 3.5368,
      "step": 59
    },
    {
      "epoch": 0.2603036876355748,
      "grad_norm": 1.167962670326233,
      "learning_rate": 0.00019848077530122083,
      "loss": 3.4885,
      "step": 60
    },
    {
      "epoch": 0.2646420824295011,
      "grad_norm": 1.1241693496704102,
      "learning_rate": 0.00019816271834476642,
      "loss": 3.5335,
      "step": 61
    },
    {
      "epoch": 0.26898047722342733,
      "grad_norm": 1.0841178894042969,
      "learning_rate": 0.00019781476007338058,
      "loss": 3.5822,
      "step": 62
    },
    {
      "epoch": 0.27331887201735355,
      "grad_norm": 1.1276164054870605,
      "learning_rate": 0.00019743700647852354,
      "loss": 3.4757,
      "step": 63
    },
    {
      "epoch": 0.27765726681127983,
      "grad_norm": 1.192659854888916,
      "learning_rate": 0.00019702957262759965,
      "loss": 3.4212,
      "step": 64
    },
    {
      "epoch": 0.28199566160520606,
      "grad_norm": 1.2061688899993896,
      "learning_rate": 0.00019659258262890683,
      "loss": 3.4564,
      "step": 65
    },
    {
      "epoch": 0.28633405639913234,
      "grad_norm": 1.4012079238891602,
      "learning_rate": 0.0001961261695938319,
      "loss": 3.423,
      "step": 66
    },
    {
      "epoch": 0.29067245119305857,
      "grad_norm": 1.3591368198394775,
      "learning_rate": 0.00019563047559630357,
      "loss": 3.5284,
      "step": 67
    },
    {
      "epoch": 0.2950108459869848,
      "grad_norm": 1.3555010557174683,
      "learning_rate": 0.00019510565162951537,
      "loss": 3.4406,
      "step": 68
    },
    {
      "epoch": 0.2993492407809111,
      "grad_norm": 1.4745391607284546,
      "learning_rate": 0.0001945518575599317,
      "loss": 3.3899,
      "step": 69
    },
    {
      "epoch": 0.3036876355748373,
      "grad_norm": 1.6432572603225708,
      "learning_rate": 0.00019396926207859084,
      "loss": 3.4343,
      "step": 70
    },
    {
      "epoch": 0.3080260303687636,
      "grad_norm": 1.9187488555908203,
      "learning_rate": 0.00019335804264972018,
      "loss": 3.5881,
      "step": 71
    },
    {
      "epoch": 0.3123644251626898,
      "grad_norm": 2.1937949657440186,
      "learning_rate": 0.00019271838545667876,
      "loss": 3.3765,
      "step": 72
    },
    {
      "epoch": 0.31670281995661603,
      "grad_norm": 2.376640558242798,
      "learning_rate": 0.00019205048534524406,
      "loss": 3.2758,
      "step": 73
    },
    {
      "epoch": 0.3210412147505423,
      "grad_norm": 3.0442004203796387,
      "learning_rate": 0.0001913545457642601,
      "loss": 3.5366,
      "step": 74
    },
    {
      "epoch": 0.32537960954446854,
      "grad_norm": 3.6359612941741943,
      "learning_rate": 0.000190630778703665,
      "loss": 3.0313,
      "step": 75
    },
    {
      "epoch": 0.3297180043383948,
      "grad_norm": 4.367193698883057,
      "learning_rate": 0.0001898794046299167,
      "loss": 3.3864,
      "step": 76
    },
    {
      "epoch": 0.33405639913232105,
      "grad_norm": 5.261653900146484,
      "learning_rate": 0.0001891006524188368,
      "loss": 3.5411,
      "step": 77
    },
    {
      "epoch": 0.3383947939262473,
      "grad_norm": 5.341316223144531,
      "learning_rate": 0.00018829475928589271,
      "loss": 3.843,
      "step": 78
    },
    {
      "epoch": 0.34273318872017355,
      "grad_norm": 2.9030559062957764,
      "learning_rate": 0.00018746197071393958,
      "loss": 3.4254,
      "step": 79
    },
    {
      "epoch": 0.3470715835140998,
      "grad_norm": 1.4780312776565552,
      "learning_rate": 0.00018660254037844388,
      "loss": 3.4877,
      "step": 80
    },
    {
      "epoch": 0.351409978308026,
      "grad_norm": 1.0593628883361816,
      "learning_rate": 0.00018571673007021123,
      "loss": 3.3987,
      "step": 81
    },
    {
      "epoch": 0.3557483731019523,
      "grad_norm": 0.9910492897033691,
      "learning_rate": 0.0001848048096156426,
      "loss": 3.4944,
      "step": 82
    },
    {
      "epoch": 0.3600867678958785,
      "grad_norm": 1.004767656326294,
      "learning_rate": 0.00018386705679454242,
      "loss": 3.4143,
      "step": 83
    },
    {
      "epoch": 0.3644251626898048,
      "grad_norm": 1.012804627418518,
      "learning_rate": 0.00018290375725550417,
      "loss": 3.4504,
      "step": 84
    },
    {
      "epoch": 0.368763557483731,
      "grad_norm": 1.0758857727050781,
      "learning_rate": 0.0001819152044288992,
      "loss": 3.5181,
      "step": 85
    },
    {
      "epoch": 0.37310195227765725,
      "grad_norm": 1.0776313543319702,
      "learning_rate": 0.00018090169943749476,
      "loss": 3.4293,
      "step": 86
    },
    {
      "epoch": 0.3774403470715835,
      "grad_norm": 1.0856565237045288,
      "learning_rate": 0.00017986355100472928,
      "loss": 3.3012,
      "step": 87
    },
    {
      "epoch": 0.38177874186550975,
      "grad_norm": 1.146246075630188,
      "learning_rate": 0.00017880107536067218,
      "loss": 3.5778,
      "step": 88
    },
    {
      "epoch": 0.38611713665943603,
      "grad_norm": 1.1812922954559326,
      "learning_rate": 0.0001777145961456971,
      "loss": 3.2835,
      "step": 89
    },
    {
      "epoch": 0.39045553145336226,
      "grad_norm": 1.3535960912704468,
      "learning_rate": 0.0001766044443118978,
      "loss": 3.1863,
      "step": 90
    },
    {
      "epoch": 0.3947939262472885,
      "grad_norm": 1.312524437904358,
      "learning_rate": 0.00017547095802227723,
      "loss": 3.3794,
      "step": 91
    },
    {
      "epoch": 0.39913232104121477,
      "grad_norm": 1.2628040313720703,
      "learning_rate": 0.00017431448254773944,
      "loss": 3.2127,
      "step": 92
    },
    {
      "epoch": 0.403470715835141,
      "grad_norm": 1.3810231685638428,
      "learning_rate": 0.00017313537016191706,
      "loss": 3.3664,
      "step": 93
    },
    {
      "epoch": 0.4078091106290672,
      "grad_norm": 1.5726513862609863,
      "learning_rate": 0.0001719339800338651,
      "loss": 3.4052,
      "step": 94
    },
    {
      "epoch": 0.4121475054229935,
      "grad_norm": 1.5839647054672241,
      "learning_rate": 0.00017071067811865476,
      "loss": 3.2746,
      "step": 95
    },
    {
      "epoch": 0.4164859002169197,
      "grad_norm": 1.7605924606323242,
      "learning_rate": 0.00016946583704589973,
      "loss": 3.48,
      "step": 96
    },
    {
      "epoch": 0.420824295010846,
      "grad_norm": 2.3345723152160645,
      "learning_rate": 0.00016819983600624986,
      "loss": 3.2033,
      "step": 97
    },
    {
      "epoch": 0.42516268980477223,
      "grad_norm": 1.9480637311935425,
      "learning_rate": 0.00016691306063588583,
      "loss": 3.3796,
      "step": 98
    },
    {
      "epoch": 0.42950108459869846,
      "grad_norm": 2.3618791103363037,
      "learning_rate": 0.00016560590289905073,
      "loss": 3.1398,
      "step": 99
    },
    {
      "epoch": 0.43383947939262474,
      "grad_norm": 3.546729326248169,
      "learning_rate": 0.00016427876096865394,
      "loss": 3.0558,
      "step": 100
    },
    {
      "epoch": 0.43817787418655096,
      "grad_norm": 1.5932743549346924,
      "learning_rate": 0.00016293203910498376,
      "loss": 3.3932,
      "step": 101
    },
    {
      "epoch": 0.44251626898047725,
      "grad_norm": 2.039661407470703,
      "learning_rate": 0.0001615661475325658,
      "loss": 3.4066,
      "step": 102
    },
    {
      "epoch": 0.44685466377440347,
      "grad_norm": 1.742119312286377,
      "learning_rate": 0.00016018150231520486,
      "loss": 3.2823,
      "step": 103
    },
    {
      "epoch": 0.4511930585683297,
      "grad_norm": 1.5700186491012573,
      "learning_rate": 0.00015877852522924732,
      "loss": 3.3591,
      "step": 104
    },
    {
      "epoch": 0.455531453362256,
      "grad_norm": 1.136389970779419,
      "learning_rate": 0.0001573576436351046,
      "loss": 3.4663,
      "step": 105
    },
    {
      "epoch": 0.4598698481561822,
      "grad_norm": 0.8537334203720093,
      "learning_rate": 0.0001559192903470747,
      "loss": 3.4367,
      "step": 106
    },
    {
      "epoch": 0.4642082429501085,
      "grad_norm": 0.8642299175262451,
      "learning_rate": 0.00015446390350150273,
      "loss": 3.287,
      "step": 107
    },
    {
      "epoch": 0.4685466377440347,
      "grad_norm": 0.9279235601425171,
      "learning_rate": 0.0001529919264233205,
      "loss": 3.2911,
      "step": 108
    },
    {
      "epoch": 0.47288503253796094,
      "grad_norm": 0.9121331572532654,
      "learning_rate": 0.00015150380749100545,
      "loss": 3.3101,
      "step": 109
    },
    {
      "epoch": 0.4772234273318872,
      "grad_norm": 0.9868795275688171,
      "learning_rate": 0.00015000000000000001,
      "loss": 3.3431,
      "step": 110
    },
    {
      "epoch": 0.48156182212581344,
      "grad_norm": 1.0646886825561523,
      "learning_rate": 0.00014848096202463372,
      "loss": 3.3876,
      "step": 111
    },
    {
      "epoch": 0.48590021691973967,
      "grad_norm": 1.0819416046142578,
      "learning_rate": 0.00014694715627858908,
      "loss": 3.2128,
      "step": 112
    },
    {
      "epoch": 0.49023861171366595,
      "grad_norm": 1.0728636980056763,
      "learning_rate": 0.00014539904997395468,
      "loss": 3.2076,
      "step": 113
    },
    {
      "epoch": 0.4945770065075922,
      "grad_norm": 1.1562669277191162,
      "learning_rate": 0.00014383711467890774,
      "loss": 3.2825,
      "step": 114
    },
    {
      "epoch": 0.49891540130151846,
      "grad_norm": 1.1967557668685913,
      "learning_rate": 0.00014226182617406996,
      "loss": 3.2185,
      "step": 115
    },
    {
      "epoch": 0.5032537960954447,
      "grad_norm": 1.3139584064483643,
      "learning_rate": 0.00014067366430758004,
      "loss": 3.1373,
      "step": 116
    },
    {
      "epoch": 0.5032537960954447,
      "eval_loss": 3.2728052139282227,
      "eval_runtime": 43.9403,
      "eval_samples_per_second": 8.853,
      "eval_steps_per_second": 2.23,
      "step": 116
    },
    {
      "epoch": 0.5075921908893709,
      "grad_norm": 1.3170753717422485,
      "learning_rate": 0.00013907311284892736,
      "loss": 2.9572,
      "step": 117
    },
    {
      "epoch": 0.5119305856832972,
      "grad_norm": 1.5243107080459595,
      "learning_rate": 0.00013746065934159123,
      "loss": 3.3082,
      "step": 118
    },
    {
      "epoch": 0.5162689804772235,
      "grad_norm": 1.5845880508422852,
      "learning_rate": 0.00013583679495453,
      "loss": 3.4819,
      "step": 119
    },
    {
      "epoch": 0.5206073752711496,
      "grad_norm": 1.66307532787323,
      "learning_rate": 0.00013420201433256689,
      "loss": 3.1131,
      "step": 120
    },
    {
      "epoch": 0.5249457700650759,
      "grad_norm": 1.6470588445663452,
      "learning_rate": 0.00013255681544571568,
      "loss": 3.2847,
      "step": 121
    },
    {
      "epoch": 0.5292841648590022,
      "grad_norm": 2.1118075847625732,
      "learning_rate": 0.00013090169943749476,
      "loss": 3.4669,
      "step": 122
    },
    {
      "epoch": 0.5336225596529284,
      "grad_norm": 2.056396722793579,
      "learning_rate": 0.00012923717047227368,
      "loss": 3.1136,
      "step": 123
    },
    {
      "epoch": 0.5379609544468547,
      "grad_norm": 2.2389657497406006,
      "learning_rate": 0.0001275637355816999,
      "loss": 2.9323,
      "step": 124
    },
    {
      "epoch": 0.5422993492407809,
      "grad_norm": 2.863621711730957,
      "learning_rate": 0.00012588190451025207,
      "loss": 2.9585,
      "step": 125
    },
    {
      "epoch": 0.5466377440347071,
      "grad_norm": 0.8712321519851685,
      "learning_rate": 0.00012419218955996676,
      "loss": 3.1439,
      "step": 126
    },
    {
      "epoch": 0.5509761388286334,
      "grad_norm": 1.0713740587234497,
      "learning_rate": 0.0001224951054343865,
      "loss": 3.2213,
      "step": 127
    },
    {
      "epoch": 0.5553145336225597,
      "grad_norm": 1.104315996170044,
      "learning_rate": 0.00012079116908177593,
      "loss": 3.4522,
      "step": 128
    },
    {
      "epoch": 0.559652928416486,
      "grad_norm": 1.0883917808532715,
      "learning_rate": 0.00011908089953765449,
      "loss": 3.3503,
      "step": 129
    },
    {
      "epoch": 0.5639913232104121,
      "grad_norm": 1.0000834465026855,
      "learning_rate": 0.00011736481776669306,
      "loss": 3.4036,
      "step": 130
    },
    {
      "epoch": 0.5683297180043384,
      "grad_norm": 0.8869354128837585,
      "learning_rate": 0.0001156434465040231,
      "loss": 3.2749,
      "step": 131
    },
    {
      "epoch": 0.5726681127982647,
      "grad_norm": 0.8651937246322632,
      "learning_rate": 0.00011391731009600654,
      "loss": 3.3679,
      "step": 132
    },
    {
      "epoch": 0.5770065075921909,
      "grad_norm": 0.9174556136131287,
      "learning_rate": 0.00011218693434051475,
      "loss": 3.311,
      "step": 133
    },
    {
      "epoch": 0.5813449023861171,
      "grad_norm": 0.930533230304718,
      "learning_rate": 0.00011045284632676536,
      "loss": 3.3761,
      "step": 134
    },
    {
      "epoch": 0.5856832971800434,
      "grad_norm": 0.9851680994033813,
      "learning_rate": 0.00010871557427476583,
      "loss": 3.2752,
      "step": 135
    },
    {
      "epoch": 0.5900216919739696,
      "grad_norm": 0.9633740782737732,
      "learning_rate": 0.00010697564737441252,
      "loss": 3.2373,
      "step": 136
    },
    {
      "epoch": 0.5943600867678959,
      "grad_norm": 1.132585048675537,
      "learning_rate": 0.0001052335956242944,
      "loss": 3.2323,
      "step": 137
    },
    {
      "epoch": 0.5986984815618221,
      "grad_norm": 1.1232091188430786,
      "learning_rate": 0.00010348994967025012,
      "loss": 3.2874,
      "step": 138
    },
    {
      "epoch": 0.6030368763557483,
      "grad_norm": 1.2559125423431396,
      "learning_rate": 0.00010174524064372837,
      "loss": 3.2367,
      "step": 139
    },
    {
      "epoch": 0.6073752711496746,
      "grad_norm": 1.2623041868209839,
      "learning_rate": 0.0001,
      "loss": 3.2243,
      "step": 140
    },
    {
      "epoch": 0.6117136659436009,
      "grad_norm": 1.3554457426071167,
      "learning_rate": 9.825475935627165e-05,
      "loss": 3.4802,
      "step": 141
    },
    {
      "epoch": 0.6160520607375272,
      "grad_norm": 1.4170132875442505,
      "learning_rate": 9.651005032974994e-05,
      "loss": 3.354,
      "step": 142
    },
    {
      "epoch": 0.6203904555314533,
      "grad_norm": 1.4309097528457642,
      "learning_rate": 9.476640437570562e-05,
      "loss": 3.1352,
      "step": 143
    },
    {
      "epoch": 0.6247288503253796,
      "grad_norm": 1.5829153060913086,
      "learning_rate": 9.302435262558747e-05,
      "loss": 3.2455,
      "step": 144
    },
    {
      "epoch": 0.6290672451193059,
      "grad_norm": 1.8210502862930298,
      "learning_rate": 9.128442572523417e-05,
      "loss": 3.2991,
      "step": 145
    },
    {
      "epoch": 0.6334056399132321,
      "grad_norm": 1.842761516571045,
      "learning_rate": 8.954715367323468e-05,
      "loss": 3.2255,
      "step": 146
    },
    {
      "epoch": 0.6377440347071583,
      "grad_norm": 1.9258646965026855,
      "learning_rate": 8.781306565948528e-05,
      "loss": 3.1397,
      "step": 147
    },
    {
      "epoch": 0.6420824295010846,
      "grad_norm": 2.1189215183258057,
      "learning_rate": 8.608268990399349e-05,
      "loss": 3.0414,
      "step": 148
    },
    {
      "epoch": 0.6464208242950108,
      "grad_norm": 2.4063761234283447,
      "learning_rate": 8.435655349597689e-05,
      "loss": 2.8524,
      "step": 149
    },
    {
      "epoch": 0.6507592190889371,
      "grad_norm": 3.6420836448669434,
      "learning_rate": 8.263518223330697e-05,
      "loss": 3.0156,
      "step": 150
    },
    {
      "epoch": 0.6550976138828634,
      "grad_norm": 0.7080674171447754,
      "learning_rate": 8.091910046234552e-05,
      "loss": 3.1636,
      "step": 151
    },
    {
      "epoch": 0.6594360086767896,
      "grad_norm": 0.798520565032959,
      "learning_rate": 7.920883091822408e-05,
      "loss": 3.212,
      "step": 152
    },
    {
      "epoch": 0.6637744034707158,
      "grad_norm": 0.8640486001968384,
      "learning_rate": 7.750489456561352e-05,
      "loss": 3.1644,
      "step": 153
    },
    {
      "epoch": 0.6681127982646421,
      "grad_norm": 0.870906412601471,
      "learning_rate": 7.580781044003324e-05,
      "loss": 3.1876,
      "step": 154
    },
    {
      "epoch": 0.6724511930585684,
      "grad_norm": 0.8581348061561584,
      "learning_rate": 7.411809548974792e-05,
      "loss": 3.2739,
      "step": 155
    },
    {
      "epoch": 0.6767895878524945,
      "grad_norm": 0.8691614270210266,
      "learning_rate": 7.243626441830009e-05,
      "loss": 3.2444,
      "step": 156
    },
    {
      "epoch": 0.6811279826464208,
      "grad_norm": 0.9455673098564148,
      "learning_rate": 7.076282952772633e-05,
      "loss": 3.3004,
      "step": 157
    },
    {
      "epoch": 0.6854663774403471,
      "grad_norm": 0.8873337507247925,
      "learning_rate": 6.909830056250527e-05,
      "loss": 3.1778,
      "step": 158
    },
    {
      "epoch": 0.6898047722342733,
      "grad_norm": 0.910775363445282,
      "learning_rate": 6.744318455428436e-05,
      "loss": 3.1346,
      "step": 159
    },
    {
      "epoch": 0.6941431670281996,
      "grad_norm": 0.9872409105300903,
      "learning_rate": 6.579798566743314e-05,
      "loss": 3.1665,
      "step": 160
    },
    {
      "epoch": 0.6984815618221258,
      "grad_norm": 1.0516481399536133,
      "learning_rate": 6.416320504546997e-05,
      "loss": 3.3064,
      "step": 161
    },
    {
      "epoch": 0.702819956616052,
      "grad_norm": 1.0263571739196777,
      "learning_rate": 6.25393406584088e-05,
      "loss": 3.3698,
      "step": 162
    },
    {
      "epoch": 0.7071583514099783,
      "grad_norm": 1.1050878763198853,
      "learning_rate": 6.092688715107264e-05,
      "loss": 3.2436,
      "step": 163
    },
    {
      "epoch": 0.7114967462039046,
      "grad_norm": 1.1121841669082642,
      "learning_rate": 5.9326335692419995e-05,
      "loss": 2.9433,
      "step": 164
    },
    {
      "epoch": 0.7158351409978309,
      "grad_norm": 1.2424358129501343,
      "learning_rate": 5.773817382593008e-05,
      "loss": 3.3616,
      "step": 165
    },
    {
      "epoch": 0.720173535791757,
      "grad_norm": 1.1899327039718628,
      "learning_rate": 5.616288532109225e-05,
      "loss": 3.0392,
      "step": 166
    },
    {
      "epoch": 0.7245119305856833,
      "grad_norm": 1.3395730257034302,
      "learning_rate": 5.4600950026045326e-05,
      "loss": 3.0905,
      "step": 167
    },
    {
      "epoch": 0.7288503253796096,
      "grad_norm": 1.4268842935562134,
      "learning_rate": 5.305284372141095e-05,
      "loss": 3.1247,
      "step": 168
    },
    {
      "epoch": 0.7331887201735358,
      "grad_norm": 1.5514875650405884,
      "learning_rate": 5.15190379753663e-05,
      "loss": 3.3772,
      "step": 169
    },
    {
      "epoch": 0.737527114967462,
      "grad_norm": 1.8371058702468872,
      "learning_rate": 5.000000000000002e-05,
      "loss": 2.9262,
      "step": 170
    },
    {
      "epoch": 0.7418655097613883,
      "grad_norm": 1.7641676664352417,
      "learning_rate": 4.8496192508994576e-05,
      "loss": 3.0113,
      "step": 171
    },
    {
      "epoch": 0.7462039045553145,
      "grad_norm": 1.8325039148330688,
      "learning_rate": 4.700807357667952e-05,
      "loss": 3.0551,
      "step": 172
    },
    {
      "epoch": 0.7505422993492408,
      "grad_norm": 1.9740185737609863,
      "learning_rate": 4.5536096498497295e-05,
      "loss": 3.1479,
      "step": 173
    },
    {
      "epoch": 0.754880694143167,
      "grad_norm": 2.327420234680176,
      "learning_rate": 4.4080709652925336e-05,
      "loss": 3.2149,
      "step": 174
    },
    {
      "epoch": 0.754880694143167,
      "eval_loss": 3.163884162902832,
      "eval_runtime": 43.9914,
      "eval_samples_per_second": 8.843,
      "eval_steps_per_second": 2.228,
      "step": 174
    },
    {
      "epoch": 0.7592190889370932,
      "grad_norm": 3.558817148208618,
      "learning_rate": 4.264235636489542e-05,
      "loss": 3.0184,
      "step": 175
    },
    {
      "epoch": 0.7635574837310195,
      "grad_norm": 0.48007091879844666,
      "learning_rate": 4.12214747707527e-05,
      "loss": 3.0708,
      "step": 176
    },
    {
      "epoch": 0.7678958785249458,
      "grad_norm": 0.606035590171814,
      "learning_rate": 3.981849768479517e-05,
      "loss": 3.1584,
      "step": 177
    },
    {
      "epoch": 0.7722342733188721,
      "grad_norm": 0.6647348999977112,
      "learning_rate": 3.843385246743417e-05,
      "loss": 3.3003,
      "step": 178
    },
    {
      "epoch": 0.7765726681127982,
      "grad_norm": 0.6956514120101929,
      "learning_rate": 3.7067960895016275e-05,
      "loss": 3.2168,
      "step": 179
    },
    {
      "epoch": 0.7809110629067245,
      "grad_norm": 0.7575390338897705,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 3.2576,
      "step": 180
    },
    {
      "epoch": 0.7852494577006508,
      "grad_norm": 0.8106915950775146,
      "learning_rate": 3.439409710094929e-05,
      "loss": 3.124,
      "step": 181
    },
    {
      "epoch": 0.789587852494577,
      "grad_norm": 0.873997688293457,
      "learning_rate": 3.308693936411421e-05,
      "loss": 3.0977,
      "step": 182
    },
    {
      "epoch": 0.7939262472885033,
      "grad_norm": 0.9612168073654175,
      "learning_rate": 3.1800163993750166e-05,
      "loss": 3.435,
      "step": 183
    },
    {
      "epoch": 0.7982646420824295,
      "grad_norm": 0.9549990892410278,
      "learning_rate": 3.053416295410026e-05,
      "loss": 3.2216,
      "step": 184
    },
    {
      "epoch": 0.8026030368763557,
      "grad_norm": 0.9309582710266113,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 3.0546,
      "step": 185
    },
    {
      "epoch": 0.806941431670282,
      "grad_norm": 1.0800687074661255,
      "learning_rate": 2.8066019966134904e-05,
      "loss": 3.2283,
      "step": 186
    },
    {
      "epoch": 0.8112798264642083,
      "grad_norm": 1.0009733438491821,
      "learning_rate": 2.6864629838082956e-05,
      "loss": 3.1638,
      "step": 187
    },
    {
      "epoch": 0.8156182212581344,
      "grad_norm": 1.1134998798370361,
      "learning_rate": 2.5685517452260567e-05,
      "loss": 3.2642,
      "step": 188
    },
    {
      "epoch": 0.8199566160520607,
      "grad_norm": 1.1395593881607056,
      "learning_rate": 2.45290419777228e-05,
      "loss": 3.0712,
      "step": 189
    },
    {
      "epoch": 0.824295010845987,
      "grad_norm": 1.1547160148620605,
      "learning_rate": 2.339555568810221e-05,
      "loss": 3.2101,
      "step": 190
    },
    {
      "epoch": 0.8286334056399133,
      "grad_norm": 1.2223323583602905,
      "learning_rate": 2.2285403854302912e-05,
      "loss": 3.1109,
      "step": 191
    },
    {
      "epoch": 0.8329718004338394,
      "grad_norm": 1.4417051076889038,
      "learning_rate": 2.119892463932781e-05,
      "loss": 3.2497,
      "step": 192
    },
    {
      "epoch": 0.8373101952277657,
      "grad_norm": 1.3542780876159668,
      "learning_rate": 2.013644899527074e-05,
      "loss": 3.23,
      "step": 193
    },
    {
      "epoch": 0.841648590021692,
      "grad_norm": 1.5529882907867432,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 3.2404,
      "step": 194
    },
    {
      "epoch": 0.8459869848156182,
      "grad_norm": 1.63187575340271,
      "learning_rate": 1.808479557110081e-05,
      "loss": 3.0776,
      "step": 195
    },
    {
      "epoch": 0.8503253796095445,
      "grad_norm": 1.6470518112182617,
      "learning_rate": 1.7096242744495837e-05,
      "loss": 3.1312,
      "step": 196
    },
    {
      "epoch": 0.8546637744034707,
      "grad_norm": 1.8358676433563232,
      "learning_rate": 1.6132943205457606e-05,
      "loss": 3.0136,
      "step": 197
    },
    {
      "epoch": 0.8590021691973969,
      "grad_norm": 2.2392208576202393,
      "learning_rate": 1.5195190384357404e-05,
      "loss": 3.0873,
      "step": 198
    },
    {
      "epoch": 0.8633405639913232,
      "grad_norm": 2.3587329387664795,
      "learning_rate": 1.4283269929788779e-05,
      "loss": 3.1336,
      "step": 199
    },
    {
      "epoch": 0.8676789587852495,
      "grad_norm": 3.4689748287200928,
      "learning_rate": 1.339745962155613e-05,
      "loss": 3.2269,
      "step": 200
    },
    {
      "epoch": 0.8720173535791758,
      "grad_norm": 0.4676075577735901,
      "learning_rate": 1.2538029286060426e-05,
      "loss": 3.2194,
      "step": 201
    },
    {
      "epoch": 0.8763557483731019,
      "grad_norm": 0.5948041081428528,
      "learning_rate": 1.1705240714107302e-05,
      "loss": 3.2006,
      "step": 202
    },
    {
      "epoch": 0.8806941431670282,
      "grad_norm": 0.6200747489929199,
      "learning_rate": 1.0899347581163221e-05,
      "loss": 3.1966,
      "step": 203
    },
    {
      "epoch": 0.8850325379609545,
      "grad_norm": 0.6264815926551819,
      "learning_rate": 1.0120595370083318e-05,
      "loss": 3.1552,
      "step": 204
    },
    {
      "epoch": 0.8893709327548807,
      "grad_norm": 0.6958035230636597,
      "learning_rate": 9.369221296335006e-06,
      "loss": 3.21,
      "step": 205
    },
    {
      "epoch": 0.8937093275488069,
      "grad_norm": 0.7550477981567383,
      "learning_rate": 8.645454235739903e-06,
      "loss": 3.2491,
      "step": 206
    },
    {
      "epoch": 0.8980477223427332,
      "grad_norm": 0.78013014793396,
      "learning_rate": 7.949514654755962e-06,
      "loss": 3.158,
      "step": 207
    },
    {
      "epoch": 0.9023861171366594,
      "grad_norm": 0.786949098110199,
      "learning_rate": 7.281614543321269e-06,
      "loss": 3.2446,
      "step": 208
    },
    {
      "epoch": 0.9067245119305857,
      "grad_norm": 0.8102577924728394,
      "learning_rate": 6.6419573502798374e-06,
      "loss": 3.2238,
      "step": 209
    },
    {
      "epoch": 0.911062906724512,
      "grad_norm": 0.8839837908744812,
      "learning_rate": 6.030737921409169e-06,
      "loss": 3.1035,
      "step": 210
    },
    {
      "epoch": 0.9154013015184381,
      "grad_norm": 0.9286414980888367,
      "learning_rate": 5.448142440068316e-06,
      "loss": 3.2198,
      "step": 211
    },
    {
      "epoch": 0.9197396963123644,
      "grad_norm": 1.031367540359497,
      "learning_rate": 4.8943483704846475e-06,
      "loss": 3.207,
      "step": 212
    },
    {
      "epoch": 0.9240780911062907,
      "grad_norm": 1.1086468696594238,
      "learning_rate": 4.369524403696457e-06,
      "loss": 3.2715,
      "step": 213
    },
    {
      "epoch": 0.928416485900217,
      "grad_norm": 1.0586810111999512,
      "learning_rate": 3.873830406168111e-06,
      "loss": 3.2091,
      "step": 214
    },
    {
      "epoch": 0.9327548806941431,
      "grad_norm": 1.0433012247085571,
      "learning_rate": 3.40741737109318e-06,
      "loss": 3.11,
      "step": 215
    },
    {
      "epoch": 0.9370932754880694,
      "grad_norm": 1.214693546295166,
      "learning_rate": 2.970427372400353e-06,
      "loss": 3.1984,
      "step": 216
    },
    {
      "epoch": 0.9414316702819957,
      "grad_norm": 1.3140201568603516,
      "learning_rate": 2.5629935214764865e-06,
      "loss": 3.1381,
      "step": 217
    },
    {
      "epoch": 0.9457700650759219,
      "grad_norm": 1.439610242843628,
      "learning_rate": 2.1852399266194314e-06,
      "loss": 3.2828,
      "step": 218
    },
    {
      "epoch": 0.9501084598698482,
      "grad_norm": 1.447763204574585,
      "learning_rate": 1.8372816552336026e-06,
      "loss": 3.1896,
      "step": 219
    },
    {
      "epoch": 0.9544468546637744,
      "grad_norm": 1.5651224851608276,
      "learning_rate": 1.5192246987791981e-06,
      "loss": 3.0176,
      "step": 220
    },
    {
      "epoch": 0.9587852494577006,
      "grad_norm": 1.7138340473175049,
      "learning_rate": 1.231165940486234e-06,
      "loss": 3.0649,
      "step": 221
    },
    {
      "epoch": 0.9631236442516269,
      "grad_norm": 1.7278990745544434,
      "learning_rate": 9.731931258429638e-07,
      "loss": 2.901,
      "step": 222
    },
    {
      "epoch": 0.9674620390455532,
      "grad_norm": 1.8585275411605835,
      "learning_rate": 7.453848358678017e-07,
      "loss": 2.9228,
      "step": 223
    },
    {
      "epoch": 0.9718004338394793,
      "grad_norm": 2.438549757003784,
      "learning_rate": 5.478104631726711e-07,
      "loss": 2.9518,
      "step": 224
    },
    {
      "epoch": 0.9761388286334056,
      "grad_norm": 3.6199636459350586,
      "learning_rate": 3.805301908254455e-07,
      "loss": 2.9323,
      "step": 225
    },
    {
      "epoch": 0.9804772234273319,
      "grad_norm": 0.5968942046165466,
      "learning_rate": 2.4359497401758024e-07,
      "loss": 3.0911,
      "step": 226
    },
    {
      "epoch": 0.9848156182212582,
      "grad_norm": 0.8684574365615845,
      "learning_rate": 1.3704652454261668e-07,
      "loss": 3.2219,
      "step": 227
    },
    {
      "epoch": 0.9891540130151844,
      "grad_norm": 1.1482932567596436,
      "learning_rate": 6.09172980904238e-08,
      "loss": 3.0405,
      "step": 228
    },
    {
      "epoch": 0.9934924078091106,
      "grad_norm": 1.4431898593902588,
      "learning_rate": 1.5230484360873044e-08,
      "loss": 3.1692,
      "step": 229
    },
    {
      "epoch": 0.9978308026030369,
      "grad_norm": 1.78467857837677,
      "learning_rate": 0.0,
      "loss": 2.932,
      "step": 230
    }
  ],
  "logging_steps": 1,
  "max_steps": 230,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 58,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.142510989790413e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}