{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9968051118210862,
  "eval_steps": 59,
  "global_step": 234,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004259850905218318,
      "grad_norm": 0.42660918831825256,
      "learning_rate": 2e-05,
      "loss": 1.4194,
      "step": 1
    },
    {
      "epoch": 0.004259850905218318,
      "eval_loss": 1.3981385231018066,
      "eval_runtime": 17.5749,
      "eval_samples_per_second": 22.475,
      "eval_steps_per_second": 2.845,
      "step": 1
    },
    {
      "epoch": 0.008519701810436636,
      "grad_norm": 0.38132771849632263,
      "learning_rate": 4e-05,
      "loss": 1.4291,
      "step": 2
    },
    {
      "epoch": 0.012779552715654952,
      "grad_norm": 0.4677501916885376,
      "learning_rate": 6e-05,
      "loss": 1.606,
      "step": 3
    },
    {
      "epoch": 0.01703940362087327,
      "grad_norm": 0.4839603900909424,
      "learning_rate": 8e-05,
      "loss": 1.5193,
      "step": 4
    },
    {
      "epoch": 0.021299254526091587,
      "grad_norm": 0.52900630235672,
      "learning_rate": 0.0001,
      "loss": 1.7253,
      "step": 5
    },
    {
      "epoch": 0.025559105431309903,
      "grad_norm": 0.4611320495605469,
      "learning_rate": 0.00012,
      "loss": 1.4042,
      "step": 6
    },
    {
      "epoch": 0.029818956336528223,
      "grad_norm": 0.5078997611999512,
      "learning_rate": 0.00014,
      "loss": 1.8641,
      "step": 7
    },
    {
      "epoch": 0.03407880724174654,
      "grad_norm": 0.5692968368530273,
      "learning_rate": 0.00016,
      "loss": 1.0603,
      "step": 8
    },
    {
      "epoch": 0.038338658146964855,
      "grad_norm": 0.5424911379814148,
      "learning_rate": 0.00018,
      "loss": 0.9217,
      "step": 9
    },
    {
      "epoch": 0.042598509052183174,
      "grad_norm": 0.6595712304115295,
      "learning_rate": 0.0002,
      "loss": 1.0443,
      "step": 10
    },
    {
      "epoch": 0.046858359957401494,
      "grad_norm": 0.552948534488678,
      "learning_rate": 0.00019999016517595753,
      "loss": 0.9727,
      "step": 11
    },
    {
      "epoch": 0.051118210862619806,
      "grad_norm": 0.523713231086731,
      "learning_rate": 0.00019996066263830531,
      "loss": 1.0042,
      "step": 12
    },
    {
      "epoch": 0.055378061767838126,
      "grad_norm": 0.3326718807220459,
      "learning_rate": 0.0001999114981900887,
      "loss": 0.6851,
      "step": 13
    },
    {
      "epoch": 0.059637912673056445,
      "grad_norm": 0.40246546268463135,
      "learning_rate": 0.00019984268150178167,
      "loss": 0.6865,
      "step": 14
    },
    {
      "epoch": 0.06389776357827476,
      "grad_norm": 0.3299888074398041,
      "learning_rate": 0.00019975422610938462,
      "loss": 0.6413,
      "step": 15
    },
    {
      "epoch": 0.06815761448349308,
      "grad_norm": 0.321532666683197,
      "learning_rate": 0.00019964614941176195,
      "loss": 0.6425,
      "step": 16
    },
    {
      "epoch": 0.0724174653887114,
      "grad_norm": 0.30551549792289734,
      "learning_rate": 0.0001995184726672197,
      "loss": 0.6573,
      "step": 17
    },
    {
      "epoch": 0.07667731629392971,
      "grad_norm": 0.3162730932235718,
      "learning_rate": 0.00019937122098932428,
      "loss": 0.7957,
      "step": 18
    },
    {
      "epoch": 0.08093716719914804,
      "grad_norm": 0.2646523714065552,
      "learning_rate": 0.00019920442334196248,
      "loss": 0.6842,
      "step": 19
    },
    {
      "epoch": 0.08519701810436635,
      "grad_norm": 0.35260164737701416,
      "learning_rate": 0.00019901811253364456,
      "loss": 0.7045,
      "step": 20
    },
    {
      "epoch": 0.08945686900958466,
      "grad_norm": 0.36994901299476624,
      "learning_rate": 0.00019881232521105089,
      "loss": 0.7506,
      "step": 21
    },
    {
      "epoch": 0.09371671991480299,
      "grad_norm": 0.3145638108253479,
      "learning_rate": 0.0001985871018518236,
      "loss": 0.6522,
      "step": 22
    },
    {
      "epoch": 0.0979765708200213,
      "grad_norm": 0.28740495443344116,
      "learning_rate": 0.00019834248675660486,
      "loss": 0.5763,
      "step": 23
    },
    {
      "epoch": 0.10223642172523961,
      "grad_norm": 0.29527685046195984,
      "learning_rate": 0.00019807852804032305,
      "loss": 0.8533,
      "step": 24
    },
    {
      "epoch": 0.10649627263045794,
      "grad_norm": 0.3023378849029541,
      "learning_rate": 0.00019779527762272877,
      "loss": 0.738,
      "step": 25
    },
    {
      "epoch": 0.11075612353567625,
      "grad_norm": 0.2749658524990082,
      "learning_rate": 0.00019749279121818235,
      "loss": 0.6354,
      "step": 26
    },
    {
      "epoch": 0.11501597444089456,
      "grad_norm": 0.3914307951927185,
      "learning_rate": 0.0001971711283246951,
      "loss": 0.8604,
      "step": 27
    },
    {
      "epoch": 0.11927582534611289,
      "grad_norm": 0.47873714566230774,
      "learning_rate": 0.00019683035221222618,
      "loss": 0.7972,
      "step": 28
    },
    {
      "epoch": 0.1235356762513312,
      "grad_norm": 0.22174575924873352,
      "learning_rate": 0.0001964705299102376,
      "loss": 0.4385,
      "step": 29
    },
    {
      "epoch": 0.12779552715654952,
      "grad_norm": 0.244963139295578,
      "learning_rate": 0.00019609173219450998,
      "loss": 0.7168,
      "step": 30
    },
    {
      "epoch": 0.13205537806176784,
      "grad_norm": 0.32758575677871704,
      "learning_rate": 0.0001956940335732209,
      "loss": 0.7231,
      "step": 31
    },
    {
      "epoch": 0.13631522896698617,
      "grad_norm": 0.21992172300815582,
      "learning_rate": 0.00019527751227228963,
      "loss": 0.662,
      "step": 32
    },
    {
      "epoch": 0.14057507987220447,
      "grad_norm": 0.2899262309074402,
      "learning_rate": 0.0001948422502199903,
      "loss": 0.4651,
      "step": 33
    },
    {
      "epoch": 0.1448349307774228,
      "grad_norm": 0.23878340423107147,
      "learning_rate": 0.00019438833303083678,
      "loss": 0.5367,
      "step": 34
    },
    {
      "epoch": 0.14909478168264112,
      "grad_norm": 0.20475314557552338,
      "learning_rate": 0.0001939158499887428,
      "loss": 0.4024,
      "step": 35
    },
    {
      "epoch": 0.15335463258785942,
      "grad_norm": 0.25068745017051697,
      "learning_rate": 0.00019342489402945998,
      "loss": 0.6575,
      "step": 36
    },
    {
      "epoch": 0.15761448349307774,
      "grad_norm": 0.3811924159526825,
      "learning_rate": 0.00019291556172229785,
      "loss": 0.6405,
      "step": 37
    },
    {
      "epoch": 0.16187433439829607,
      "grad_norm": 0.2627577483654022,
      "learning_rate": 0.0001923879532511287,
      "loss": 0.6961,
      "step": 38
    },
    {
      "epoch": 0.16613418530351437,
      "grad_norm": 0.32665154337882996,
      "learning_rate": 0.00019184217239468212,
      "loss": 0.6983,
      "step": 39
    },
    {
      "epoch": 0.1703940362087327,
      "grad_norm": 0.24597743153572083,
      "learning_rate": 0.00019127832650613189,
      "loss": 0.54,
      "step": 40
    },
    {
      "epoch": 0.17465388711395102,
      "grad_norm": 0.2611660361289978,
      "learning_rate": 0.00019069652649198005,
      "loss": 0.6281,
      "step": 41
    },
    {
      "epoch": 0.17891373801916932,
      "grad_norm": 0.2969326078891754,
      "learning_rate": 0.0001900968867902419,
      "loss": 0.6817,
      "step": 42
    },
    {
      "epoch": 0.18317358892438765,
      "grad_norm": 0.27561935782432556,
      "learning_rate": 0.00018947952534793661,
      "loss": 0.626,
      "step": 43
    },
    {
      "epoch": 0.18743343982960597,
      "grad_norm": 0.33468887209892273,
      "learning_rate": 0.00018884456359788724,
      "loss": 0.7383,
      "step": 44
    },
    {
      "epoch": 0.19169329073482427,
      "grad_norm": 0.2937297224998474,
      "learning_rate": 0.0001881921264348355,
      "loss": 0.6972,
      "step": 45
    },
    {
      "epoch": 0.1959531416400426,
      "grad_norm": 0.33218011260032654,
      "learning_rate": 0.00018752234219087538,
      "loss": 0.6749,
      "step": 46
    },
    {
      "epoch": 0.20021299254526093,
      "grad_norm": 0.2661404311656952,
      "learning_rate": 0.00018683534261021057,
      "loss": 0.4882,
      "step": 47
    },
    {
      "epoch": 0.20447284345047922,
      "grad_norm": 0.2451002150774002,
      "learning_rate": 0.00018613126282324092,
      "loss": 0.637,
      "step": 48
    },
    {
      "epoch": 0.20873269435569755,
      "grad_norm": 0.27517661452293396,
      "learning_rate": 0.00018541024131998274,
      "loss": 0.5483,
      "step": 49
    },
    {
      "epoch": 0.21299254526091588,
      "grad_norm": 0.24373459815979004,
      "learning_rate": 0.00018467241992282843,
      "loss": 0.5112,
      "step": 50
    },
    {
      "epoch": 0.21725239616613418,
      "grad_norm": 0.3239864408969879,
      "learning_rate": 0.00018391794375865024,
      "loss": 0.8005,
      "step": 51
    },
    {
      "epoch": 0.2215122470713525,
      "grad_norm": 0.29262682795524597,
      "learning_rate": 0.00018314696123025454,
      "loss": 0.6769,
      "step": 52
    },
    {
      "epoch": 0.22577209797657083,
      "grad_norm": 0.28277888894081116,
      "learning_rate": 0.00018235962398719147,
      "loss": 0.6892,
      "step": 53
    },
    {
      "epoch": 0.23003194888178913,
      "grad_norm": 0.41741546988487244,
      "learning_rate": 0.00018155608689592604,
      "loss": 0.6763,
      "step": 54
    },
    {
      "epoch": 0.23429179978700745,
      "grad_norm": 0.2734082043170929,
      "learning_rate": 0.00018073650800937624,
      "loss": 0.697,
      "step": 55
    },
    {
      "epoch": 0.23855165069222578,
      "grad_norm": 0.2646290957927704,
      "learning_rate": 0.00017990104853582493,
      "loss": 0.5936,
      "step": 56
    },
    {
      "epoch": 0.24281150159744408,
      "grad_norm": 0.27723610401153564,
      "learning_rate": 0.00017904987280721035,
      "loss": 0.5875,
      "step": 57
    },
    {
      "epoch": 0.2470713525026624,
      "grad_norm": 0.2668153643608093,
      "learning_rate": 0.000178183148246803,
      "loss": 0.5219,
      "step": 58
    },
    {
      "epoch": 0.25133120340788073,
      "grad_norm": 0.29033368825912476,
      "learning_rate": 0.0001773010453362737,
      "loss": 0.5997,
      "step": 59
    },
    {
      "epoch": 0.25133120340788073,
      "eval_loss": 0.5784963965415955,
      "eval_runtime": 17.4317,
      "eval_samples_per_second": 22.66,
      "eval_steps_per_second": 2.868,
      "step": 59
    },
    {
      "epoch": 0.25559105431309903,
      "grad_norm": 0.2783537209033966,
      "learning_rate": 0.00017640373758216077,
      "loss": 0.483,
      "step": 60
    },
    {
      "epoch": 0.2598509052183174,
      "grad_norm": 0.31082215905189514,
      "learning_rate": 0.0001754914014817416,
      "loss": 0.6473,
      "step": 61
    },
    {
      "epoch": 0.2641107561235357,
      "grad_norm": 0.3206618130207062,
      "learning_rate": 0.00017456421648831655,
      "loss": 0.6289,
      "step": 62
    },
    {
      "epoch": 0.268370607028754,
      "grad_norm": 0.2875254154205322,
      "learning_rate": 0.00017362236497591094,
      "loss": 0.594,
      "step": 63
    },
    {
      "epoch": 0.27263045793397234,
      "grad_norm": 0.22950579226016998,
      "learning_rate": 0.0001726660322034027,
      "loss": 0.3886,
      "step": 64
    },
    {
      "epoch": 0.27689030883919064,
      "grad_norm": 0.24293649196624756,
      "learning_rate": 0.00017169540627808274,
      "loss": 0.6129,
      "step": 65
    },
    {
      "epoch": 0.28115015974440893,
      "grad_norm": 0.2611636519432068,
      "learning_rate": 0.00017071067811865476,
      "loss": 0.6891,
      "step": 66
    },
    {
      "epoch": 0.2854100106496273,
      "grad_norm": 0.284407377243042,
      "learning_rate": 0.00016971204141768233,
      "loss": 0.516,
      "step": 67
    },
    {
      "epoch": 0.2896698615548456,
      "grad_norm": 0.21485944092273712,
      "learning_rate": 0.00016869969260349018,
      "loss": 0.3826,
      "step": 68
    },
    {
      "epoch": 0.2939297124600639,
      "grad_norm": 0.29337963461875916,
      "learning_rate": 0.00016767383080152742,
      "loss": 0.5696,
      "step": 69
    },
    {
      "epoch": 0.29818956336528224,
      "grad_norm": 0.27099764347076416,
      "learning_rate": 0.0001666346577952004,
      "loss": 0.4708,
      "step": 70
    },
    {
      "epoch": 0.30244941427050054,
      "grad_norm": 0.29055824875831604,
      "learning_rate": 0.00016558237798618245,
      "loss": 0.5844,
      "step": 71
    },
    {
      "epoch": 0.30670926517571884,
      "grad_norm": 0.22874757647514343,
      "learning_rate": 0.00016451719835420877,
      "loss": 0.4412,
      "step": 72
    },
    {
      "epoch": 0.3109691160809372,
      "grad_norm": 0.2926221489906311,
      "learning_rate": 0.00016343932841636456,
      "loss": 0.5757,
      "step": 73
    },
    {
      "epoch": 0.3152289669861555,
      "grad_norm": 0.30070438981056213,
      "learning_rate": 0.00016234898018587337,
      "loss": 0.6063,
      "step": 74
    },
    {
      "epoch": 0.3194888178913738,
      "grad_norm": 0.2475481927394867,
      "learning_rate": 0.00016124636813039502,
      "loss": 0.5056,
      "step": 75
    },
    {
      "epoch": 0.32374866879659214,
      "grad_norm": 0.2851349711418152,
      "learning_rate": 0.00016013170912984058,
      "loss": 0.7547,
      "step": 76
    },
    {
      "epoch": 0.32800851970181044,
      "grad_norm": 0.25569260120391846,
      "learning_rate": 0.00015900522243371282,
      "loss": 0.5168,
      "step": 77
    },
    {
      "epoch": 0.33226837060702874,
      "grad_norm": 0.3774610757827759,
      "learning_rate": 0.0001578671296179806,
      "loss": 0.6691,
      "step": 78
    },
    {
      "epoch": 0.3365282215122471,
      "grad_norm": 0.2339468151330948,
      "learning_rate": 0.00015671765454149559,
      "loss": 0.5021,
      "step": 79
    },
    {
      "epoch": 0.3407880724174654,
      "grad_norm": 0.3066350519657135,
      "learning_rate": 0.00015555702330196023,
      "loss": 0.6838,
      "step": 80
    },
    {
      "epoch": 0.3450479233226837,
      "grad_norm": 0.271908164024353,
      "learning_rate": 0.00015438546419145488,
      "loss": 0.4837,
      "step": 81
    },
    {
      "epoch": 0.34930777422790205,
      "grad_norm": 0.304290771484375,
      "learning_rate": 0.00015320320765153367,
      "loss": 0.6768,
      "step": 82
    },
    {
      "epoch": 0.35356762513312034,
      "grad_norm": 0.25685280561447144,
      "learning_rate": 0.00015201048622789747,
      "loss": 0.4335,
      "step": 83
    },
    {
      "epoch": 0.35782747603833864,
      "grad_norm": 0.3003567159175873,
      "learning_rate": 0.00015080753452465296,
      "loss": 0.5836,
      "step": 84
    },
    {
      "epoch": 0.362087326943557,
      "grad_norm": 0.2585873007774353,
      "learning_rate": 0.0001495945891581668,
      "loss": 0.5391,
      "step": 85
    },
    {
      "epoch": 0.3663471778487753,
      "grad_norm": 0.30791282653808594,
      "learning_rate": 0.000148371888710524,
      "loss": 0.5103,
      "step": 86
    },
    {
      "epoch": 0.3706070287539936,
      "grad_norm": 0.23016773164272308,
      "learning_rate": 0.0001471396736825998,
      "loss": 0.4269,
      "step": 87
    },
    {
      "epoch": 0.37486687965921195,
      "grad_norm": 0.3137454390525818,
      "learning_rate": 0.00014589818644675378,
      "loss": 0.5116,
      "step": 88
    },
    {
      "epoch": 0.37912673056443025,
      "grad_norm": 0.28078484535217285,
      "learning_rate": 0.00014464767119915629,
      "loss": 0.4388,
      "step": 89
    },
    {
      "epoch": 0.38338658146964855,
      "grad_norm": 0.3163893222808838,
      "learning_rate": 0.00014338837391175582,
      "loss": 0.6122,
      "step": 90
    },
    {
      "epoch": 0.3876464323748669,
      "grad_norm": 0.34674668312072754,
      "learning_rate": 0.0001421205422838971,
      "loss": 0.7114,
      "step": 91
    },
    {
      "epoch": 0.3919062832800852,
      "grad_norm": 0.2210942953824997,
      "learning_rate": 0.00014084442569359964,
      "loss": 0.3351,
      "step": 92
    },
    {
      "epoch": 0.3961661341853035,
      "grad_norm": 0.30586308240890503,
      "learning_rate": 0.0001395602751485059,
      "loss": 0.4845,
      "step": 93
    },
    {
      "epoch": 0.40042598509052185,
      "grad_norm": 0.2695784568786621,
      "learning_rate": 0.000138268343236509,
      "loss": 0.4992,
      "step": 94
    },
    {
      "epoch": 0.40468583599574015,
      "grad_norm": 0.2989813983440399,
      "learning_rate": 0.00013696888407606952,
      "loss": 0.585,
      "step": 95
    },
    {
      "epoch": 0.40894568690095845,
      "grad_norm": 0.2759920656681061,
      "learning_rate": 0.0001356621532662313,
      "loss": 0.4492,
      "step": 96
    },
    {
      "epoch": 0.4132055378061768,
      "grad_norm": 0.33117353916168213,
      "learning_rate": 0.0001343484078363461,
      "loss": 0.5606,
      "step": 97
    },
    {
      "epoch": 0.4174653887113951,
      "grad_norm": 0.24572253227233887,
      "learning_rate": 0.00013302790619551674,
      "loss": 0.3261,
      "step": 98
    },
    {
      "epoch": 0.4217252396166134,
      "grad_norm": 0.322480171918869,
      "learning_rate": 0.00013170090808176883,
      "loss": 0.5527,
      "step": 99
    },
    {
      "epoch": 0.42598509052183176,
      "grad_norm": 0.3101179301738739,
      "learning_rate": 0.00013036767451096148,
      "loss": 0.5419,
      "step": 100
    },
    {
      "epoch": 0.43024494142705005,
      "grad_norm": 0.3218703269958496,
      "learning_rate": 0.00012902846772544624,
      "loss": 0.5441,
      "step": 101
    },
    {
      "epoch": 0.43450479233226835,
      "grad_norm": 0.26214686036109924,
      "learning_rate": 0.00012768355114248494,
      "loss": 0.5388,
      "step": 102
    },
    {
      "epoch": 0.4387646432374867,
      "grad_norm": 0.421612411737442,
      "learning_rate": 0.00012633318930243648,
      "loss": 0.7557,
      "step": 103
    },
    {
      "epoch": 0.443024494142705,
      "grad_norm": 0.5120344758033752,
      "learning_rate": 0.0001249776478167227,
      "loss": 0.7028,
      "step": 104
    },
    {
      "epoch": 0.4472843450479233,
      "grad_norm": 0.27614736557006836,
      "learning_rate": 0.00012361719331558345,
      "loss": 0.3954,
      "step": 105
    },
    {
      "epoch": 0.45154419595314166,
      "grad_norm": 0.269520103931427,
      "learning_rate": 0.00012225209339563145,
      "loss": 0.4851,
      "step": 106
    },
    {
      "epoch": 0.45580404685835996,
      "grad_norm": 0.2739225924015045,
      "learning_rate": 0.000120882616567217,
      "loss": 0.4907,
      "step": 107
    },
    {
      "epoch": 0.46006389776357826,
      "grad_norm": 0.33920663595199585,
      "learning_rate": 0.00011950903220161285,
      "loss": 0.6288,
      "step": 108
    },
    {
      "epoch": 0.4643237486687966,
      "grad_norm": 0.279832124710083,
      "learning_rate": 0.00011813161047802985,
      "loss": 0.447,
      "step": 109
    },
    {
      "epoch": 0.4685835995740149,
      "grad_norm": 0.31790605187416077,
      "learning_rate": 0.00011675062233047364,
      "loss": 0.5933,
      "step": 110
    },
    {
      "epoch": 0.4728434504792332,
      "grad_norm": 0.24926939606666565,
      "learning_rate": 0.000115366339394453,
      "loss": 0.4061,
      "step": 111
    },
    {
      "epoch": 0.47710330138445156,
      "grad_norm": 0.3327280282974243,
      "learning_rate": 0.00011397903395354996,
      "loss": 0.484,
      "step": 112
    },
    {
      "epoch": 0.48136315228966986,
      "grad_norm": 0.37822094559669495,
      "learning_rate": 0.00011258897888586255,
      "loss": 0.6416,
      "step": 113
    },
    {
      "epoch": 0.48562300319488816,
      "grad_norm": 0.35605669021606445,
      "learning_rate": 0.00011119644761033078,
      "loss": 0.6136,
      "step": 114
    },
    {
      "epoch": 0.4898828541001065,
      "grad_norm": 0.3513132929801941,
      "learning_rate": 0.0001098017140329561,
      "loss": 0.6299,
      "step": 115
    },
    {
      "epoch": 0.4941427050053248,
      "grad_norm": 0.3040708899497986,
      "learning_rate": 0.00010840505249292476,
      "loss": 0.4658,
      "step": 116
    },
    {
      "epoch": 0.4984025559105431,
      "grad_norm": 0.19006308913230896,
      "learning_rate": 0.00010700673770864673,
      "loss": 0.2694,
      "step": 117
    },
    {
      "epoch": 0.5026624068157615,
      "grad_norm": 0.30643633008003235,
      "learning_rate": 0.00010560704472371919,
      "loss": 0.4492,
      "step": 118
    },
    {
      "epoch": 0.5026624068157615,
      "eval_loss": 0.5326976180076599,
      "eval_runtime": 17.5872,
      "eval_samples_per_second": 22.46,
      "eval_steps_per_second": 2.843,
      "step": 118
    },
    {
      "epoch": 0.5069222577209798,
      "grad_norm": 0.3698013722896576,
      "learning_rate": 0.00010420624885282653,
      "loss": 0.6993,
      "step": 119
    },
    {
      "epoch": 0.5111821086261981,
      "grad_norm": 0.2801634967327118,
      "learning_rate": 0.0001028046256275869,
      "loss": 0.4059,
      "step": 120
    },
    {
      "epoch": 0.5154419595314164,
      "grad_norm": 0.2864643931388855,
      "learning_rate": 0.00010140245074235624,
      "loss": 0.5024,
      "step": 121
    },
    {
      "epoch": 0.5197018104366348,
      "grad_norm": 0.30105265974998474,
      "learning_rate": 0.0001,
      "loss": 0.6774,
      "step": 122
    },
    {
      "epoch": 0.5239616613418531,
      "grad_norm": 0.39152050018310547,
      "learning_rate": 9.859754925764378e-05,
      "loss": 0.625,
      "step": 123
    },
    {
      "epoch": 0.5282215122470714,
      "grad_norm": 0.3618883192539215,
      "learning_rate": 9.719537437241312e-05,
      "loss": 0.6978,
      "step": 124
    },
    {
      "epoch": 0.5324813631522897,
      "grad_norm": 0.23670899868011475,
      "learning_rate": 9.579375114717351e-05,
      "loss": 0.3379,
      "step": 125
    },
    {
      "epoch": 0.536741214057508,
      "grad_norm": 0.3124864101409912,
      "learning_rate": 9.439295527628081e-05,
      "loss": 0.525,
      "step": 126
    },
    {
      "epoch": 0.5410010649627263,
      "grad_norm": 0.3667398989200592,
      "learning_rate": 9.299326229135326e-05,
      "loss": 0.6164,
      "step": 127
    },
    {
      "epoch": 0.5452609158679447,
      "grad_norm": 0.2894105613231659,
      "learning_rate": 9.159494750707526e-05,
      "loss": 0.4335,
      "step": 128
    },
    {
      "epoch": 0.549520766773163,
      "grad_norm": 0.30680200457572937,
      "learning_rate": 9.019828596704394e-05,
      "loss": 0.4507,
      "step": 129
    },
    {
      "epoch": 0.5537806176783813,
      "grad_norm": 0.3676758110523224,
      "learning_rate": 8.880355238966923e-05,
      "loss": 0.5955,
      "step": 130
    },
    {
      "epoch": 0.5580404685835996,
      "grad_norm": 0.3194178342819214,
      "learning_rate": 8.741102111413748e-05,
      "loss": 0.5675,
      "step": 131
    },
    {
      "epoch": 0.5623003194888179,
      "grad_norm": 0.29750558733940125,
      "learning_rate": 8.602096604645009e-05,
      "loss": 0.5785,
      "step": 132
    },
    {
      "epoch": 0.5665601703940362,
      "grad_norm": 0.37204545736312866,
      "learning_rate": 8.463366060554698e-05,
      "loss": 0.612,
      "step": 133
    },
    {
      "epoch": 0.5708200212992546,
      "grad_norm": 0.36891940236091614,
      "learning_rate": 8.324937766952638e-05,
      "loss": 0.5463,
      "step": 134
    },
    {
      "epoch": 0.5750798722044729,
      "grad_norm": 0.2863575518131256,
      "learning_rate": 8.186838952197018e-05,
      "loss": 0.4884,
      "step": 135
    },
    {
      "epoch": 0.5793397231096912,
      "grad_norm": 0.354523241519928,
      "learning_rate": 8.049096779838719e-05,
      "loss": 0.7727,
      "step": 136
    },
    {
      "epoch": 0.5835995740149095,
      "grad_norm": 0.30339759588241577,
      "learning_rate": 7.911738343278304e-05,
      "loss": 0.5543,
      "step": 137
    },
    {
      "epoch": 0.5878594249201278,
      "grad_norm": 0.27778202295303345,
      "learning_rate": 7.774790660436858e-05,
      "loss": 0.4716,
      "step": 138
    },
    {
      "epoch": 0.5921192758253461,
      "grad_norm": 0.38618960976600647,
      "learning_rate": 7.63828066844166e-05,
      "loss": 0.6519,
      "step": 139
    },
    {
      "epoch": 0.5963791267305645,
      "grad_norm": 0.3573627769947052,
      "learning_rate": 7.502235218327731e-05,
      "loss": 0.5128,
      "step": 140
    },
    {
      "epoch": 0.6006389776357828,
      "grad_norm": 0.30529165267944336,
      "learning_rate": 7.366681069756352e-05,
      "loss": 0.5184,
      "step": 141
    },
    {
      "epoch": 0.6048988285410011,
      "grad_norm": 0.2819828987121582,
      "learning_rate": 7.231644885751507e-05,
      "loss": 0.4259,
      "step": 142
    },
    {
      "epoch": 0.6091586794462194,
      "grad_norm": 0.32307252287864685,
      "learning_rate": 7.097153227455379e-05,
      "loss": 0.6048,
      "step": 143
    },
    {
      "epoch": 0.6134185303514377,
      "grad_norm": 0.31262722611427307,
      "learning_rate": 6.963232548903853e-05,
      "loss": 0.4834,
      "step": 144
    },
    {
      "epoch": 0.617678381256656,
      "grad_norm": 0.318851500749588,
      "learning_rate": 6.829909191823121e-05,
      "loss": 0.5011,
      "step": 145
    },
    {
      "epoch": 0.6219382321618744,
      "grad_norm": 0.44246405363082886,
      "learning_rate": 6.697209380448333e-05,
      "loss": 0.4384,
      "step": 146
    },
    {
      "epoch": 0.6261980830670927,
      "grad_norm": 0.3459945023059845,
      "learning_rate": 6.565159216365389e-05,
      "loss": 0.5657,
      "step": 147
    },
    {
      "epoch": 0.630457933972311,
      "grad_norm": 0.33843329548835754,
      "learning_rate": 6.43378467337687e-05,
      "loss": 0.5711,
      "step": 148
    },
    {
      "epoch": 0.6347177848775293,
      "grad_norm": 0.3812694549560547,
      "learning_rate": 6.30311159239305e-05,
      "loss": 0.6142,
      "step": 149
    },
    {
      "epoch": 0.6389776357827476,
      "grad_norm": 0.29333916306495667,
      "learning_rate": 6.173165676349103e-05,
      "loss": 0.585,
      "step": 150
    },
    {
      "epoch": 0.6432374866879659,
      "grad_norm": 0.2884041666984558,
      "learning_rate": 6.043972485149414e-05,
      "loss": 0.4866,
      "step": 151
    },
    {
      "epoch": 0.6474973375931843,
      "grad_norm": 0.33954814076423645,
      "learning_rate": 5.9155574306400395e-05,
      "loss": 0.571,
      "step": 152
    },
    {
      "epoch": 0.6517571884984026,
      "grad_norm": 0.33935782313346863,
      "learning_rate": 5.787945771610296e-05,
      "loss": 0.5037,
      "step": 153
    },
    {
      "epoch": 0.6560170394036209,
      "grad_norm": 0.27371054887771606,
      "learning_rate": 5.6611626088244194e-05,
      "loss": 0.3322,
      "step": 154
    },
    {
      "epoch": 0.6602768903088392,
      "grad_norm": 0.30788496136665344,
      "learning_rate": 5.5352328800843724e-05,
      "loss": 0.4454,
      "step": 155
    },
    {
      "epoch": 0.6645367412140575,
      "grad_norm": 0.34366151690483093,
      "learning_rate": 5.410181355324622e-05,
      "loss": 0.5788,
      "step": 156
    },
    {
      "epoch": 0.6687965921192758,
      "grad_norm": 0.33698371052742004,
      "learning_rate": 5.286032631740023e-05,
      "loss": 0.4378,
      "step": 157
    },
    {
      "epoch": 0.6730564430244942,
      "grad_norm": 0.4181162416934967,
      "learning_rate": 5.162811128947602e-05,
      "loss": 0.5367,
      "step": 158
    },
    {
      "epoch": 0.6773162939297125,
      "grad_norm": 0.4480881690979004,
      "learning_rate": 5.0405410841833253e-05,
      "loss": 0.6633,
      "step": 159
    },
    {
      "epoch": 0.6815761448349308,
      "grad_norm": 0.37488028407096863,
      "learning_rate": 4.919246547534708e-05,
      "loss": 0.5402,
      "step": 160
    },
    {
      "epoch": 0.6858359957401491,
      "grad_norm": 0.2964366376399994,
      "learning_rate": 4.7989513772102537e-05,
      "loss": 0.4109,
      "step": 161
    },
    {
      "epoch": 0.6900958466453674,
      "grad_norm": 0.35376259684562683,
      "learning_rate": 4.6796792348466356e-05,
      "loss": 0.636,
      "step": 162
    },
    {
      "epoch": 0.6943556975505857,
      "grad_norm": 0.3158915638923645,
      "learning_rate": 4.561453580854516e-05,
      "loss": 0.4893,
      "step": 163
    },
    {
      "epoch": 0.6986155484558041,
      "grad_norm": 0.420785516500473,
      "learning_rate": 4.444297669803981e-05,
      "loss": 0.7147,
      "step": 164
    },
    {
      "epoch": 0.7028753993610224,
      "grad_norm": 0.3272782564163208,
      "learning_rate": 4.328234545850442e-05,
      "loss": 0.3444,
      "step": 165
    },
    {
      "epoch": 0.7071352502662407,
      "grad_norm": 0.30052492022514343,
      "learning_rate": 4.213287038201943e-05,
      "loss": 0.5209,
      "step": 166
    },
    {
      "epoch": 0.711395101171459,
      "grad_norm": 0.37648481130599976,
      "learning_rate": 4.0994777566287204e-05,
      "loss": 0.684,
      "step": 167
    },
    {
      "epoch": 0.7156549520766773,
      "grad_norm": 0.3135606646537781,
      "learning_rate": 3.9868290870159405e-05,
      "loss": 0.4871,
      "step": 168
    },
    {
      "epoch": 0.7199148029818956,
      "grad_norm": 0.33847576379776,
      "learning_rate": 3.875363186960499e-05,
      "loss": 0.5294,
      "step": 169
    },
    {
      "epoch": 0.724174653887114,
      "grad_norm": 0.3337070047855377,
      "learning_rate": 3.7651019814126654e-05,
      "loss": 0.4425,
      "step": 170
    },
    {
      "epoch": 0.7284345047923323,
      "grad_norm": 0.4173165261745453,
      "learning_rate": 3.6560671583635467e-05,
      "loss": 0.637,
      "step": 171
    },
    {
      "epoch": 0.7326943556975506,
      "grad_norm": 0.41098451614379883,
      "learning_rate": 3.548280164579126e-05,
      "loss": 0.52,
      "step": 172
    },
    {
      "epoch": 0.7369542066027689,
      "grad_norm": 0.3789665699005127,
      "learning_rate": 3.4417622013817595e-05,
      "loss": 0.5995,
      "step": 173
    },
    {
      "epoch": 0.7412140575079872,
      "grad_norm": 0.3996846675872803,
      "learning_rate": 3.336534220479961e-05,
      "loss": 0.6237,
      "step": 174
    },
    {
      "epoch": 0.7454739084132055,
      "grad_norm": 0.3990687131881714,
      "learning_rate": 3.2326169198472556e-05,
      "loss": 0.555,
      "step": 175
    },
    {
      "epoch": 0.7497337593184239,
      "grad_norm": 0.32280924916267395,
      "learning_rate": 3.130030739650983e-05,
      "loss": 0.4742,
      "step": 176
    },
    {
      "epoch": 0.7539936102236422,
      "grad_norm": 0.4192362129688263,
      "learning_rate": 3.0287958582317676e-05,
      "loss": 0.6569,
      "step": 177
    },
    {
      "epoch": 0.7539936102236422,
      "eval_loss": 0.5110090970993042,
      "eval_runtime": 20.8066,
      "eval_samples_per_second": 18.984,
      "eval_steps_per_second": 2.403,
      "step": 177
    },
    {
      "epoch": 0.7582534611288605,
      "grad_norm": 0.35410746932029724,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 0.4828,
      "step": 178
    },
    {
      "epoch": 0.7625133120340788,
      "grad_norm": 0.4463326036930084,
      "learning_rate": 2.8304593721917285e-05,
      "loss": 0.6976,
      "step": 179
    },
    {
      "epoch": 0.7667731629392971,
      "grad_norm": 0.29797378182411194,
      "learning_rate": 2.7333967796597315e-05,
      "loss": 0.564,
      "step": 180
    },
    {
      "epoch": 0.7710330138445154,
      "grad_norm": 0.31337812542915344,
      "learning_rate": 2.6377635024089087e-05,
      "loss": 0.5607,
      "step": 181
    },
    {
      "epoch": 0.7752928647497338,
      "grad_norm": 0.40470513701438904,
      "learning_rate": 2.5435783511683443e-05,
      "loss": 0.6428,
      "step": 182
    },
    {
      "epoch": 0.7795527156549521,
      "grad_norm": 0.413817822933197,
      "learning_rate": 2.450859851825842e-05,
      "loss": 0.7303,
      "step": 183
    },
    {
      "epoch": 0.7838125665601704,
      "grad_norm": 0.2931414842605591,
      "learning_rate": 2.3596262417839255e-05,
      "loss": 0.4051,
      "step": 184
    },
    {
      "epoch": 0.7880724174653887,
      "grad_norm": 0.34086865186691284,
      "learning_rate": 2.26989546637263e-05,
      "loss": 0.4329,
      "step": 185
    },
    {
      "epoch": 0.792332268370607,
      "grad_norm": 0.40336307883262634,
      "learning_rate": 2.181685175319702e-05,
      "loss": 0.5791,
      "step": 186
    },
    {
      "epoch": 0.7965921192758253,
      "grad_norm": 0.30092838406562805,
      "learning_rate": 2.095012719278966e-05,
      "loss": 0.4491,
      "step": 187
    },
    {
      "epoch": 0.8008519701810437,
      "grad_norm": 0.31043168902397156,
      "learning_rate": 2.009895146417512e-05,
      "loss": 0.4681,
      "step": 188
    },
    {
      "epoch": 0.805111821086262,
      "grad_norm": 0.3712119162082672,
      "learning_rate": 1.926349199062376e-05,
      "loss": 0.549,
      "step": 189
    },
    {
      "epoch": 0.8093716719914803,
      "grad_norm": 0.3679051995277405,
      "learning_rate": 1.8443913104073983e-05,
      "loss": 0.4971,
      "step": 190
    },
    {
      "epoch": 0.8136315228966986,
      "grad_norm": 0.3244669735431671,
      "learning_rate": 1.7640376012808536e-05,
      "loss": 0.4732,
      "step": 191
    },
    {
      "epoch": 0.8178913738019169,
      "grad_norm": 0.28653696179389954,
      "learning_rate": 1.6853038769745467e-05,
      "loss": 0.3469,
      "step": 192
    },
    {
      "epoch": 0.8221512247071352,
      "grad_norm": 0.3144218325614929,
      "learning_rate": 1.6082056241349786e-05,
      "loss": 0.5127,
      "step": 193
    },
    {
      "epoch": 0.8264110756123536,
      "grad_norm": 0.3801470994949341,
      "learning_rate": 1.5327580077171587e-05,
      "loss": 0.5178,
      "step": 194
    },
    {
      "epoch": 0.8306709265175719,
      "grad_norm": 0.37223386764526367,
      "learning_rate": 1.4589758680017263e-05,
      "loss": 0.5114,
      "step": 195
    },
    {
      "epoch": 0.8349307774227902,
      "grad_norm": 0.4167802333831787,
      "learning_rate": 1.3868737176759106e-05,
      "loss": 0.6949,
      "step": 196
    },
    {
      "epoch": 0.8391906283280085,
      "grad_norm": 0.620794951915741,
      "learning_rate": 1.3164657389789458e-05,
      "loss": 0.7015,
      "step": 197
    },
    {
      "epoch": 0.8434504792332268,
      "grad_norm": 0.32053133845329285,
      "learning_rate": 1.2477657809124631e-05,
      "loss": 0.5328,
      "step": 198
    },
    {
      "epoch": 0.8477103301384451,
      "grad_norm": 0.41892528533935547,
      "learning_rate": 1.1807873565164506e-05,
      "loss": 0.5929,
      "step": 199
    },
    {
      "epoch": 0.8519701810436635,
      "grad_norm": 0.2980664372444153,
      "learning_rate": 1.1155436402112785e-05,
      "loss": 0.4209,
      "step": 200
    },
    {
      "epoch": 0.8562300319488818,
      "grad_norm": 0.3290930986404419,
      "learning_rate": 1.0520474652063394e-05,
      "loss": 0.4423,
      "step": 201
    },
    {
      "epoch": 0.8604898828541001,
      "grad_norm": 0.3246372640132904,
      "learning_rate": 9.903113209758096e-06,
      "loss": 0.4631,
      "step": 202
    },
    {
      "epoch": 0.8647497337593184,
      "grad_norm": 0.3644905388355255,
      "learning_rate": 9.303473508019944e-06,
      "loss": 0.552,
      "step": 203
    },
    {
      "epoch": 0.8690095846645367,
      "grad_norm": 0.49974295496940613,
      "learning_rate": 8.72167349386811e-06,
      "loss": 0.7516,
      "step": 204
    },
    {
      "epoch": 0.873269435569755,
      "grad_norm": 0.3242340087890625,
      "learning_rate": 8.157827605317892e-06,
      "loss": 0.412,
      "step": 205
    },
    {
      "epoch": 0.8775292864749734,
      "grad_norm": 0.33690881729125977,
      "learning_rate": 7.612046748871327e-06,
      "loss": 0.4841,
      "step": 206
    },
    {
      "epoch": 0.8817891373801917,
      "grad_norm": 0.3246766924858093,
      "learning_rate": 7.084438277702188e-06,
      "loss": 0.4341,
      "step": 207
    },
    {
      "epoch": 0.88604898828541,
      "grad_norm": 0.4262131452560425,
      "learning_rate": 6.5751059705400295e-06,
      "loss": 0.6306,
      "step": 208
    },
    {
      "epoch": 0.8903088391906283,
      "grad_norm": 0.32158687710762024,
      "learning_rate": 6.084150011257239e-06,
      "loss": 0.4687,
      "step": 209
    },
    {
      "epoch": 0.8945686900958466,
      "grad_norm": 0.377208948135376,
      "learning_rate": 5.611666969163243e-06,
      "loss": 0.5849,
      "step": 210
    },
    {
      "epoch": 0.898828541001065,
      "grad_norm": 0.30956804752349854,
      "learning_rate": 5.157749780009735e-06,
      "loss": 0.4355,
      "step": 211
    },
    {
      "epoch": 0.9030883919062833,
      "grad_norm": 0.4885202944278717,
      "learning_rate": 4.722487727710368e-06,
      "loss": 0.6129,
      "step": 212
    },
    {
      "epoch": 0.9073482428115016,
      "grad_norm": 0.3384571075439453,
      "learning_rate": 4.305966426779118e-06,
      "loss": 0.4345,
      "step": 213
    },
    {
      "epoch": 0.9116080937167199,
      "grad_norm": 0.4629303514957428,
      "learning_rate": 3.908267805490051e-06,
      "loss": 0.6158,
      "step": 214
    },
    {
      "epoch": 0.9158679446219382,
      "grad_norm": 0.3206894099712372,
      "learning_rate": 3.529470089762421e-06,
      "loss": 0.4689,
      "step": 215
    },
    {
      "epoch": 0.9201277955271565,
      "grad_norm": 0.41424816846847534,
      "learning_rate": 3.169647787773866e-06,
      "loss": 0.5235,
      "step": 216
    },
    {
      "epoch": 0.9243876464323749,
      "grad_norm": 0.3189912736415863,
      "learning_rate": 2.8288716753049005e-06,
      "loss": 0.4262,
      "step": 217
    },
    {
      "epoch": 0.9286474973375932,
      "grad_norm": 0.3202993869781494,
      "learning_rate": 2.5072087818176382e-06,
      "loss": 0.493,
      "step": 218
    },
    {
      "epoch": 0.9329073482428115,
      "grad_norm": 0.32974228262901306,
      "learning_rate": 2.20472237727124e-06,
      "loss": 0.5512,
      "step": 219
    },
    {
      "epoch": 0.9371671991480298,
      "grad_norm": 0.27346375584602356,
      "learning_rate": 1.921471959676957e-06,
      "loss": 0.4154,
      "step": 220
    },
    {
      "epoch": 0.9414270500532481,
      "grad_norm": 0.29039615392684937,
      "learning_rate": 1.657513243395159e-06,
      "loss": 0.4367,
      "step": 221
    },
    {
      "epoch": 0.9456869009584664,
      "grad_norm": 0.3864074647426605,
      "learning_rate": 1.4128981481764115e-06,
      "loss": 0.5531,
      "step": 222
    },
    {
      "epoch": 0.9499467518636848,
      "grad_norm": 0.4020756185054779,
      "learning_rate": 1.1876747889491223e-06,
      "loss": 0.6631,
      "step": 223
    },
    {
      "epoch": 0.9542066027689031,
      "grad_norm": 0.37475478649139404,
      "learning_rate": 9.818874663554357e-07,
      "loss": 0.4979,
      "step": 224
    },
    {
      "epoch": 0.9584664536741214,
      "grad_norm": 0.3484041690826416,
      "learning_rate": 7.955766580375335e-07,
      "loss": 0.5207,
      "step": 225
    },
    {
      "epoch": 0.9627263045793397,
      "grad_norm": 0.3385999798774719,
      "learning_rate": 6.287790106757396e-07,
      "loss": 0.4832,
      "step": 226
    },
    {
      "epoch": 0.966986155484558,
      "grad_norm": 0.27909693121910095,
      "learning_rate": 4.815273327803182e-07,
      "loss": 0.3506,
      "step": 227
    },
    {
      "epoch": 0.9712460063897763,
      "grad_norm": 0.34196606278419495,
      "learning_rate": 3.5385058823809156e-07,
      "loss": 0.5283,
      "step": 228
    },
    {
      "epoch": 0.9755058572949947,
      "grad_norm": 0.39571547508239746,
      "learning_rate": 2.457738906153972e-07,
      "loss": 0.5099,
      "step": 229
    },
    {
      "epoch": 0.979765708200213,
      "grad_norm": 0.4107287526130676,
      "learning_rate": 1.5731849821833954e-07,
      "loss": 0.5042,
      "step": 230
    },
    {
      "epoch": 0.9840255591054313,
      "grad_norm": 0.3254135251045227,
      "learning_rate": 8.850180991131219e-08,
      "loss": 0.4929,
      "step": 231
    },
    {
      "epoch": 0.9882854100106496,
      "grad_norm": 0.2778495252132416,
      "learning_rate": 3.933736169471347e-08,
      "loss": 0.3571,
      "step": 232
    },
    {
      "epoch": 0.9925452609158679,
      "grad_norm": 0.4703886806964874,
      "learning_rate": 9.834824042498358e-09,
      "loss": 0.7013,
      "step": 233
    },
    {
      "epoch": 0.9968051118210862,
      "grad_norm": 0.3349379599094391,
      "learning_rate": 0.0,
      "loss": 0.4725,
      "step": 234
    }
  ],
  "logging_steps": 1,
  "max_steps": 234,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1191352754700288e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}