dolphin-2.9.4-llama3.1-8b / trainer_state.json
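This is the Trainer state file that Hugging Face Transformers saves alongside a checkpoint; `log_history` holds one entry per logged optimizer step with the epoch fraction, gradient norm, learning rate, and training loss. Below is a minimal sketch (not part of the checkpoint itself) of how such a file can be loaded and summarized, assuming it has been downloaded locally as `trainer_state.json`; the field names used are exactly those that appear in the entries that follow.

# Minimal sketch: load a Transformers trainer_state.json and summarize its log_history.
# Assumes the file shown below was saved locally as "trainer_state.json".
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that contain a training loss (eval entries may lack it).
entries = [e for e in state["log_history"] if "loss" in e]
steps   = [e["step"] for e in entries]
losses  = [e["loss"] for e in entries]
lrs     = [e["learning_rate"] for e in entries]

print(f"global_step={state['global_step']}  epoch={state['epoch']:.4f}")
print(f"loss: first={losses[0]:.4f}  last={losses[-1]:.4f}")
print(f"learning rate range: {min(lrs):.2e} .. {max(lrs):.2e}")

The raw file contents follow.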
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996163752945689,
"eval_steps": 500,
"global_step": 1140,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008768564695566394,
"grad_norm": 3.8354088038954104,
"learning_rate": 5.0000000000000004e-08,
"loss": 0.8827,
"step": 1
},
{
"epoch": 0.0017537129391132788,
"grad_norm": 3.854484535409196,
"learning_rate": 1.0000000000000001e-07,
"loss": 0.8816,
"step": 2
},
{
"epoch": 0.0026305694086699184,
"grad_norm": 3.871894613191576,
"learning_rate": 1.5000000000000002e-07,
"loss": 0.8801,
"step": 3
},
{
"epoch": 0.0035074258782265577,
"grad_norm": 4.015192807591418,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.8778,
"step": 4
},
{
"epoch": 0.004384282347783197,
"grad_norm": 3.8093684146898625,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.8711,
"step": 5
},
{
"epoch": 0.005261138817339837,
"grad_norm": 3.8610474891808035,
"learning_rate": 3.0000000000000004e-07,
"loss": 0.8774,
"step": 6
},
{
"epoch": 0.0061379952868964765,
"grad_norm": 3.7967273935876027,
"learning_rate": 3.5000000000000004e-07,
"loss": 0.8669,
"step": 7
},
{
"epoch": 0.007014851756453115,
"grad_norm": 3.6775126026184703,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.8605,
"step": 8
},
{
"epoch": 0.007891708226009755,
"grad_norm": 3.8340713786963674,
"learning_rate": 4.5000000000000003e-07,
"loss": 0.8735,
"step": 9
},
{
"epoch": 0.008768564695566394,
"grad_norm": 3.7479501504503463,
"learning_rate": 5.000000000000001e-07,
"loss": 0.8843,
"step": 10
},
{
"epoch": 0.009645421165123035,
"grad_norm": 3.6317203672346734,
"learning_rate": 5.5e-07,
"loss": 0.8637,
"step": 11
},
{
"epoch": 0.010522277634679673,
"grad_norm": 3.512911808429478,
"learning_rate": 6.000000000000001e-07,
"loss": 0.8649,
"step": 12
},
{
"epoch": 0.011399134104236312,
"grad_norm": 3.5056527507086486,
"learning_rate": 6.5e-07,
"loss": 0.8514,
"step": 13
},
{
"epoch": 0.012275990573792953,
"grad_norm": 3.150666271402955,
"learning_rate": 7.000000000000001e-07,
"loss": 0.844,
"step": 14
},
{
"epoch": 0.013152847043349592,
"grad_norm": 2.92608322776606,
"learning_rate": 7.5e-07,
"loss": 0.8382,
"step": 15
},
{
"epoch": 0.01402970351290623,
"grad_norm": 3.0202821236842246,
"learning_rate": 8.000000000000001e-07,
"loss": 0.8419,
"step": 16
},
{
"epoch": 0.014906559982462871,
"grad_norm": 2.9419098502173515,
"learning_rate": 8.500000000000001e-07,
"loss": 0.8362,
"step": 17
},
{
"epoch": 0.01578341645201951,
"grad_norm": 2.7926753613205433,
"learning_rate": 9.000000000000001e-07,
"loss": 0.825,
"step": 18
},
{
"epoch": 0.01666027292157615,
"grad_norm": 2.4471605086654096,
"learning_rate": 9.500000000000001e-07,
"loss": 0.7904,
"step": 19
},
{
"epoch": 0.017537129391132788,
"grad_norm": 1.8918627793518321,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.7968,
"step": 20
},
{
"epoch": 0.018413985860689427,
"grad_norm": 1.713937144355921,
"learning_rate": 1.0500000000000001e-06,
"loss": 0.7828,
"step": 21
},
{
"epoch": 0.01929084233024607,
"grad_norm": 1.4451729443975803,
"learning_rate": 1.1e-06,
"loss": 0.78,
"step": 22
},
{
"epoch": 0.020167698799802708,
"grad_norm": 1.0866085026095695,
"learning_rate": 1.1500000000000002e-06,
"loss": 0.7807,
"step": 23
},
{
"epoch": 0.021044555269359347,
"grad_norm": 1.022948274017058,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.758,
"step": 24
},
{
"epoch": 0.021921411738915986,
"grad_norm": 0.976807823206357,
"learning_rate": 1.25e-06,
"loss": 0.7783,
"step": 25
},
{
"epoch": 0.022798268208472625,
"grad_norm": 2.5562950715507275,
"learning_rate": 1.3e-06,
"loss": 0.7815,
"step": 26
},
{
"epoch": 0.023675124678029263,
"grad_norm": 1.7956421603987698,
"learning_rate": 1.3500000000000002e-06,
"loss": 0.759,
"step": 27
},
{
"epoch": 0.024551981147585906,
"grad_norm": 1.3622207205502601,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.7551,
"step": 28
},
{
"epoch": 0.025428837617142545,
"grad_norm": 0.9842354354215974,
"learning_rate": 1.45e-06,
"loss": 0.7625,
"step": 29
},
{
"epoch": 0.026305694086699184,
"grad_norm": 0.7679059075291825,
"learning_rate": 1.5e-06,
"loss": 0.7513,
"step": 30
},
{
"epoch": 0.027182550556255822,
"grad_norm": 0.709914193704945,
"learning_rate": 1.5500000000000002e-06,
"loss": 0.7309,
"step": 31
},
{
"epoch": 0.02805940702581246,
"grad_norm": 0.5711165082308596,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.7358,
"step": 32
},
{
"epoch": 0.0289362634953691,
"grad_norm": 0.6732600160748007,
"learning_rate": 1.6500000000000003e-06,
"loss": 0.746,
"step": 33
},
{
"epoch": 0.029813119964925743,
"grad_norm": 0.519623223105866,
"learning_rate": 1.7000000000000002e-06,
"loss": 0.7408,
"step": 34
},
{
"epoch": 0.03068997643448238,
"grad_norm": 0.4967853550459734,
"learning_rate": 1.75e-06,
"loss": 0.7284,
"step": 35
},
{
"epoch": 0.03156683290403902,
"grad_norm": 0.4558474579400771,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.7337,
"step": 36
},
{
"epoch": 0.03244368937359566,
"grad_norm": 0.5187940265183988,
"learning_rate": 1.85e-06,
"loss": 0.7459,
"step": 37
},
{
"epoch": 0.0333205458431523,
"grad_norm": 0.46649520265418404,
"learning_rate": 1.9000000000000002e-06,
"loss": 0.7238,
"step": 38
},
{
"epoch": 0.03419740231270894,
"grad_norm": 0.4621107554297482,
"learning_rate": 1.9500000000000004e-06,
"loss": 0.7243,
"step": 39
},
{
"epoch": 0.035074258782265576,
"grad_norm": 0.4493723053379801,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7395,
"step": 40
},
{
"epoch": 0.035951115251822215,
"grad_norm": 0.4196555282378131,
"learning_rate": 2.05e-06,
"loss": 0.7371,
"step": 41
},
{
"epoch": 0.036827971721378853,
"grad_norm": 0.3836269605839978,
"learning_rate": 2.1000000000000002e-06,
"loss": 0.7172,
"step": 42
},
{
"epoch": 0.0377048281909355,
"grad_norm": 0.38056806308372326,
"learning_rate": 2.15e-06,
"loss": 0.7163,
"step": 43
},
{
"epoch": 0.03858168466049214,
"grad_norm": 0.3561457145290273,
"learning_rate": 2.2e-06,
"loss": 0.6986,
"step": 44
},
{
"epoch": 0.03945854113004878,
"grad_norm": 0.3723153937166507,
"learning_rate": 2.25e-06,
"loss": 0.7154,
"step": 45
},
{
"epoch": 0.040335397599605416,
"grad_norm": 0.36630666691552083,
"learning_rate": 2.3000000000000004e-06,
"loss": 0.7201,
"step": 46
},
{
"epoch": 0.041212254069162055,
"grad_norm": 0.3482645877468935,
"learning_rate": 2.35e-06,
"loss": 0.7213,
"step": 47
},
{
"epoch": 0.042089110538718694,
"grad_norm": 0.35892687942862245,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.7167,
"step": 48
},
{
"epoch": 0.04296596700827533,
"grad_norm": 0.3353339246028489,
"learning_rate": 2.4500000000000003e-06,
"loss": 0.7154,
"step": 49
},
{
"epoch": 0.04384282347783197,
"grad_norm": 0.3327601533732165,
"learning_rate": 2.5e-06,
"loss": 0.7149,
"step": 50
},
{
"epoch": 0.04471967994738861,
"grad_norm": 0.31047839521651305,
"learning_rate": 2.55e-06,
"loss": 0.7022,
"step": 51
},
{
"epoch": 0.04559653641694525,
"grad_norm": 0.3140715368302216,
"learning_rate": 2.6e-06,
"loss": 0.7024,
"step": 52
},
{
"epoch": 0.04647339288650189,
"grad_norm": 0.3070088967685052,
"learning_rate": 2.6500000000000005e-06,
"loss": 0.7116,
"step": 53
},
{
"epoch": 0.04735024935605853,
"grad_norm": 0.29688015435603987,
"learning_rate": 2.7000000000000004e-06,
"loss": 0.7068,
"step": 54
},
{
"epoch": 0.04822710582561517,
"grad_norm": 0.312569173156887,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.708,
"step": 55
},
{
"epoch": 0.04910396229517181,
"grad_norm": 0.3212155084231398,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.6895,
"step": 56
},
{
"epoch": 0.04998081876472845,
"grad_norm": 0.30141336197411556,
"learning_rate": 2.85e-06,
"loss": 0.714,
"step": 57
},
{
"epoch": 0.05085767523428509,
"grad_norm": 0.2678799864293998,
"learning_rate": 2.9e-06,
"loss": 0.6864,
"step": 58
},
{
"epoch": 0.05173453170384173,
"grad_norm": 0.2763602360222888,
"learning_rate": 2.95e-06,
"loss": 0.6955,
"step": 59
},
{
"epoch": 0.05261138817339837,
"grad_norm": 0.2960116429627635,
"learning_rate": 3e-06,
"loss": 0.69,
"step": 60
},
{
"epoch": 0.053488244642955006,
"grad_norm": 0.3126860845251708,
"learning_rate": 3.05e-06,
"loss": 0.7008,
"step": 61
},
{
"epoch": 0.054365101112511645,
"grad_norm": 0.2684477743603555,
"learning_rate": 3.1000000000000004e-06,
"loss": 0.7065,
"step": 62
},
{
"epoch": 0.055241957582068284,
"grad_norm": 0.2831279869843839,
"learning_rate": 3.1500000000000003e-06,
"loss": 0.6908,
"step": 63
},
{
"epoch": 0.05611881405162492,
"grad_norm": 0.28914936357131454,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.6847,
"step": 64
},
{
"epoch": 0.05699567052118156,
"grad_norm": 0.2664694092243829,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.6975,
"step": 65
},
{
"epoch": 0.0578725269907382,
"grad_norm": 0.2670931319561963,
"learning_rate": 3.3000000000000006e-06,
"loss": 0.6957,
"step": 66
},
{
"epoch": 0.058749383460294846,
"grad_norm": 0.25481964712146327,
"learning_rate": 3.3500000000000005e-06,
"loss": 0.6907,
"step": 67
},
{
"epoch": 0.059626239929851485,
"grad_norm": 0.2917224006438053,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.6889,
"step": 68
},
{
"epoch": 0.060503096399408124,
"grad_norm": 0.27794604488949715,
"learning_rate": 3.45e-06,
"loss": 0.6815,
"step": 69
},
{
"epoch": 0.06137995286896476,
"grad_norm": 0.24963117175569036,
"learning_rate": 3.5e-06,
"loss": 0.6883,
"step": 70
},
{
"epoch": 0.0622568093385214,
"grad_norm": 0.2893133633641976,
"learning_rate": 3.5500000000000003e-06,
"loss": 0.6792,
"step": 71
},
{
"epoch": 0.06313366580807804,
"grad_norm": 0.2826308836822568,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.7028,
"step": 72
},
{
"epoch": 0.06401052227763468,
"grad_norm": 0.2640935466003184,
"learning_rate": 3.65e-06,
"loss": 0.6916,
"step": 73
},
{
"epoch": 0.06488737874719132,
"grad_norm": 0.24415033172628944,
"learning_rate": 3.7e-06,
"loss": 0.6839,
"step": 74
},
{
"epoch": 0.06576423521674796,
"grad_norm": 0.3112401087242733,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.7021,
"step": 75
},
{
"epoch": 0.0666410916863046,
"grad_norm": 0.2875281112172732,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.6929,
"step": 76
},
{
"epoch": 0.06751794815586123,
"grad_norm": 0.2874092373703745,
"learning_rate": 3.85e-06,
"loss": 0.6788,
"step": 77
},
{
"epoch": 0.06839480462541787,
"grad_norm": 0.26681007920352356,
"learning_rate": 3.900000000000001e-06,
"loss": 0.6881,
"step": 78
},
{
"epoch": 0.06927166109497451,
"grad_norm": 0.25207102904583284,
"learning_rate": 3.95e-06,
"loss": 0.6852,
"step": 79
},
{
"epoch": 0.07014851756453115,
"grad_norm": 0.2747607135538642,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6864,
"step": 80
},
{
"epoch": 0.07102537403408779,
"grad_norm": 0.26361955079133653,
"learning_rate": 4.05e-06,
"loss": 0.685,
"step": 81
},
{
"epoch": 0.07190223050364443,
"grad_norm": 0.33310729956901713,
"learning_rate": 4.1e-06,
"loss": 0.6803,
"step": 82
},
{
"epoch": 0.07277908697320107,
"grad_norm": 0.2453664087918243,
"learning_rate": 4.15e-06,
"loss": 0.6761,
"step": 83
},
{
"epoch": 0.07365594344275771,
"grad_norm": 0.2908734202511105,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.6931,
"step": 84
},
{
"epoch": 0.07453279991231436,
"grad_norm": 0.2786719287704165,
"learning_rate": 4.25e-06,
"loss": 0.6874,
"step": 85
},
{
"epoch": 0.075409656381871,
"grad_norm": 0.271512101257661,
"learning_rate": 4.3e-06,
"loss": 0.6775,
"step": 86
},
{
"epoch": 0.07628651285142764,
"grad_norm": 0.2947304767213564,
"learning_rate": 4.350000000000001e-06,
"loss": 0.6865,
"step": 87
},
{
"epoch": 0.07716336932098428,
"grad_norm": 0.25160176616217883,
"learning_rate": 4.4e-06,
"loss": 0.6785,
"step": 88
},
{
"epoch": 0.07804022579054092,
"grad_norm": 0.32459153781403244,
"learning_rate": 4.450000000000001e-06,
"loss": 0.6773,
"step": 89
},
{
"epoch": 0.07891708226009755,
"grad_norm": 0.2487028104553641,
"learning_rate": 4.5e-06,
"loss": 0.6812,
"step": 90
},
{
"epoch": 0.07979393872965419,
"grad_norm": 0.2925038544983962,
"learning_rate": 4.5500000000000005e-06,
"loss": 0.6791,
"step": 91
},
{
"epoch": 0.08067079519921083,
"grad_norm": 0.28005649996035475,
"learning_rate": 4.600000000000001e-06,
"loss": 0.6704,
"step": 92
},
{
"epoch": 0.08154765166876747,
"grad_norm": 0.3264776457957641,
"learning_rate": 4.65e-06,
"loss": 0.6772,
"step": 93
},
{
"epoch": 0.08242450813832411,
"grad_norm": 0.2533079586966528,
"learning_rate": 4.7e-06,
"loss": 0.6792,
"step": 94
},
{
"epoch": 0.08330136460788075,
"grad_norm": 0.25651763696878965,
"learning_rate": 4.75e-06,
"loss": 0.6607,
"step": 95
},
{
"epoch": 0.08417822107743739,
"grad_norm": 0.2546288408258964,
"learning_rate": 4.800000000000001e-06,
"loss": 0.6669,
"step": 96
},
{
"epoch": 0.08505507754699403,
"grad_norm": 0.25215356470309513,
"learning_rate": 4.85e-06,
"loss": 0.6846,
"step": 97
},
{
"epoch": 0.08593193401655067,
"grad_norm": 0.28631928221309494,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.6717,
"step": 98
},
{
"epoch": 0.0868087904861073,
"grad_norm": 0.27212851090592044,
"learning_rate": 4.95e-06,
"loss": 0.6804,
"step": 99
},
{
"epoch": 0.08768564695566394,
"grad_norm": 0.29348118762199116,
"learning_rate": 5e-06,
"loss": 0.669,
"step": 100
},
{
"epoch": 0.08856250342522058,
"grad_norm": 0.30678288402779474,
"learning_rate": 4.999998880733363e-06,
"loss": 0.6631,
"step": 101
},
{
"epoch": 0.08943935989477722,
"grad_norm": 0.3011120934546324,
"learning_rate": 4.999995522934454e-06,
"loss": 0.679,
"step": 102
},
{
"epoch": 0.09031621636433386,
"grad_norm": 0.31706623056013666,
"learning_rate": 4.9999899266062804e-06,
"loss": 0.6723,
"step": 103
},
{
"epoch": 0.0911930728338905,
"grad_norm": 0.3120471729111099,
"learning_rate": 4.999982091753851e-06,
"loss": 0.6613,
"step": 104
},
{
"epoch": 0.09206992930344714,
"grad_norm": 0.2905613969012575,
"learning_rate": 4.999972018384183e-06,
"loss": 0.6611,
"step": 105
},
{
"epoch": 0.09294678577300378,
"grad_norm": 0.28925318733211003,
"learning_rate": 4.999959706506297e-06,
"loss": 0.6695,
"step": 106
},
{
"epoch": 0.09382364224256041,
"grad_norm": 0.28085987028825943,
"learning_rate": 4.999945156131215e-06,
"loss": 0.6502,
"step": 107
},
{
"epoch": 0.09470049871211705,
"grad_norm": 0.30971852568333075,
"learning_rate": 4.9999283672719665e-06,
"loss": 0.672,
"step": 108
},
{
"epoch": 0.0955773551816737,
"grad_norm": 0.32363303577963826,
"learning_rate": 4.999909339943585e-06,
"loss": 0.673,
"step": 109
},
{
"epoch": 0.09645421165123035,
"grad_norm": 0.29549042512555623,
"learning_rate": 4.999888074163108e-06,
"loss": 0.6591,
"step": 110
},
{
"epoch": 0.09733106812078698,
"grad_norm": 0.33514032815726946,
"learning_rate": 4.999864569949576e-06,
"loss": 0.6673,
"step": 111
},
{
"epoch": 0.09820792459034362,
"grad_norm": 0.3092438114721304,
"learning_rate": 4.999838827324036e-06,
"loss": 0.6641,
"step": 112
},
{
"epoch": 0.09908478105990026,
"grad_norm": 0.35403209993563217,
"learning_rate": 4.999810846309539e-06,
"loss": 0.6597,
"step": 113
},
{
"epoch": 0.0999616375294569,
"grad_norm": 0.2964896689419525,
"learning_rate": 4.999780626931136e-06,
"loss": 0.67,
"step": 114
},
{
"epoch": 0.10083849399901354,
"grad_norm": 0.3484706075226941,
"learning_rate": 4.999748169215891e-06,
"loss": 0.6745,
"step": 115
},
{
"epoch": 0.10171535046857018,
"grad_norm": 0.33505074735981694,
"learning_rate": 4.999713473192863e-06,
"loss": 0.6591,
"step": 116
},
{
"epoch": 0.10259220693812682,
"grad_norm": 0.27082614750107925,
"learning_rate": 4.999676538893121e-06,
"loss": 0.6621,
"step": 117
},
{
"epoch": 0.10346906340768346,
"grad_norm": 0.3506965847465109,
"learning_rate": 4.999637366349736e-06,
"loss": 0.6733,
"step": 118
},
{
"epoch": 0.1043459198772401,
"grad_norm": 0.27422374937685745,
"learning_rate": 4.999595955597784e-06,
"loss": 0.655,
"step": 119
},
{
"epoch": 0.10522277634679673,
"grad_norm": 0.33620430443399,
"learning_rate": 4.999552306674345e-06,
"loss": 0.6755,
"step": 120
},
{
"epoch": 0.10609963281635337,
"grad_norm": 0.2837804889330797,
"learning_rate": 4.999506419618502e-06,
"loss": 0.6579,
"step": 121
},
{
"epoch": 0.10697648928591001,
"grad_norm": 0.37952040871876175,
"learning_rate": 4.999458294471342e-06,
"loss": 0.6692,
"step": 122
},
{
"epoch": 0.10785334575546665,
"grad_norm": 0.2690864525050558,
"learning_rate": 4.99940793127596e-06,
"loss": 0.6494,
"step": 123
},
{
"epoch": 0.10873020222502329,
"grad_norm": 0.3635002166658454,
"learning_rate": 4.999355330077449e-06,
"loss": 0.6611,
"step": 124
},
{
"epoch": 0.10960705869457993,
"grad_norm": 0.29302462194523843,
"learning_rate": 4.999300490922911e-06,
"loss": 0.6526,
"step": 125
},
{
"epoch": 0.11048391516413657,
"grad_norm": 0.3058787861740299,
"learning_rate": 4.999243413861447e-06,
"loss": 0.659,
"step": 126
},
{
"epoch": 0.1113607716336932,
"grad_norm": 0.332548080761125,
"learning_rate": 4.9991840989441665e-06,
"loss": 0.6659,
"step": 127
},
{
"epoch": 0.11223762810324985,
"grad_norm": 0.29432766212441813,
"learning_rate": 4.999122546224181e-06,
"loss": 0.6447,
"step": 128
},
{
"epoch": 0.11311448457280648,
"grad_norm": 0.29523416391879537,
"learning_rate": 4.999058755756605e-06,
"loss": 0.6587,
"step": 129
},
{
"epoch": 0.11399134104236312,
"grad_norm": 0.32423165831626255,
"learning_rate": 4.998992727598557e-06,
"loss": 0.6564,
"step": 130
},
{
"epoch": 0.11486819751191976,
"grad_norm": 0.34859884756639065,
"learning_rate": 4.99892446180916e-06,
"loss": 0.653,
"step": 131
},
{
"epoch": 0.1157450539814764,
"grad_norm": 0.30133447855543133,
"learning_rate": 4.99885395844954e-06,
"loss": 0.647,
"step": 132
},
{
"epoch": 0.11662191045103305,
"grad_norm": 0.3600942516700186,
"learning_rate": 4.998781217582827e-06,
"loss": 0.6581,
"step": 133
},
{
"epoch": 0.11749876692058969,
"grad_norm": 0.29960571448156953,
"learning_rate": 4.998706239274153e-06,
"loss": 0.6623,
"step": 134
},
{
"epoch": 0.11837562339014633,
"grad_norm": 0.2992208264370026,
"learning_rate": 4.998629023590656e-06,
"loss": 0.6538,
"step": 135
},
{
"epoch": 0.11925247985970297,
"grad_norm": 0.36522912538035174,
"learning_rate": 4.998549570601475e-06,
"loss": 0.6566,
"step": 136
},
{
"epoch": 0.12012933632925961,
"grad_norm": 0.2988448634710597,
"learning_rate": 4.998467880377754e-06,
"loss": 0.673,
"step": 137
},
{
"epoch": 0.12100619279881625,
"grad_norm": 0.32912250244162505,
"learning_rate": 4.998383952992639e-06,
"loss": 0.6482,
"step": 138
},
{
"epoch": 0.12188304926837289,
"grad_norm": 0.37178534793553225,
"learning_rate": 4.998297788521279e-06,
"loss": 0.6546,
"step": 139
},
{
"epoch": 0.12275990573792953,
"grad_norm": 0.28062782891296695,
"learning_rate": 4.998209387040829e-06,
"loss": 0.6527,
"step": 140
},
{
"epoch": 0.12363676220748616,
"grad_norm": 0.33723394797540485,
"learning_rate": 4.998118748630443e-06,
"loss": 0.6391,
"step": 141
},
{
"epoch": 0.1245136186770428,
"grad_norm": 0.2834572318610097,
"learning_rate": 4.99802587337128e-06,
"loss": 0.6443,
"step": 142
},
{
"epoch": 0.12539047514659943,
"grad_norm": 0.321495289367043,
"learning_rate": 4.997930761346502e-06,
"loss": 0.6507,
"step": 143
},
{
"epoch": 0.12626733161615608,
"grad_norm": 0.3419910878952078,
"learning_rate": 4.997833412641274e-06,
"loss": 0.6543,
"step": 144
},
{
"epoch": 0.1271441880857127,
"grad_norm": 0.28772221770446305,
"learning_rate": 4.9977338273427625e-06,
"loss": 0.6522,
"step": 145
},
{
"epoch": 0.12802104455526936,
"grad_norm": 0.29706932671928316,
"learning_rate": 4.997632005540139e-06,
"loss": 0.6677,
"step": 146
},
{
"epoch": 0.128897901024826,
"grad_norm": 0.29918610448467253,
"learning_rate": 4.997527947324573e-06,
"loss": 0.6475,
"step": 147
},
{
"epoch": 0.12977475749438264,
"grad_norm": 0.33103419851925103,
"learning_rate": 4.997421652789243e-06,
"loss": 0.67,
"step": 148
},
{
"epoch": 0.1306516139639393,
"grad_norm": 0.27012500247528487,
"learning_rate": 4.9973131220293255e-06,
"loss": 0.647,
"step": 149
},
{
"epoch": 0.13152847043349591,
"grad_norm": 0.297677443804652,
"learning_rate": 4.9972023551419995e-06,
"loss": 0.6519,
"step": 150
},
{
"epoch": 0.13240532690305257,
"grad_norm": 0.27386600476743567,
"learning_rate": 4.997089352226448e-06,
"loss": 0.6562,
"step": 151
},
{
"epoch": 0.1332821833726092,
"grad_norm": 0.3025435071675535,
"learning_rate": 4.996974113383854e-06,
"loss": 0.6485,
"step": 152
},
{
"epoch": 0.13415903984216584,
"grad_norm": 0.2928572797854547,
"learning_rate": 4.996856638717406e-06,
"loss": 0.641,
"step": 153
},
{
"epoch": 0.13503589631172247,
"grad_norm": 0.28232417223789874,
"learning_rate": 4.996736928332292e-06,
"loss": 0.6358,
"step": 154
},
{
"epoch": 0.13591275278127912,
"grad_norm": 0.33877806926878856,
"learning_rate": 4.9966149823357e-06,
"loss": 0.6558,
"step": 155
},
{
"epoch": 0.13678960925083575,
"grad_norm": 0.27274924720742,
"learning_rate": 4.996490800836825e-06,
"loss": 0.6553,
"step": 156
},
{
"epoch": 0.1376664657203924,
"grad_norm": 0.3145522020468823,
"learning_rate": 4.996364383946859e-06,
"loss": 0.6458,
"step": 157
},
{
"epoch": 0.13854332218994903,
"grad_norm": 0.28298098932682264,
"learning_rate": 4.996235731778997e-06,
"loss": 0.6467,
"step": 158
},
{
"epoch": 0.13942017865950568,
"grad_norm": 0.3289393703740858,
"learning_rate": 4.996104844448438e-06,
"loss": 0.6522,
"step": 159
},
{
"epoch": 0.1402970351290623,
"grad_norm": 0.3242491154179804,
"learning_rate": 4.995971722072379e-06,
"loss": 0.6579,
"step": 160
},
{
"epoch": 0.14117389159861896,
"grad_norm": 0.350063023556927,
"learning_rate": 4.995836364770018e-06,
"loss": 0.6639,
"step": 161
},
{
"epoch": 0.14205074806817558,
"grad_norm": 0.26800977502782475,
"learning_rate": 4.995698772662558e-06,
"loss": 0.6564,
"step": 162
},
{
"epoch": 0.14292760453773223,
"grad_norm": 0.37123972908338404,
"learning_rate": 4.9955589458732e-06,
"loss": 0.6521,
"step": 163
},
{
"epoch": 0.14380446100728886,
"grad_norm": 0.25568101611736427,
"learning_rate": 4.995416884527147e-06,
"loss": 0.6489,
"step": 164
},
{
"epoch": 0.1446813174768455,
"grad_norm": 0.3502739955437778,
"learning_rate": 4.9952725887516015e-06,
"loss": 0.6389,
"step": 165
},
{
"epoch": 0.14555817394640214,
"grad_norm": 0.2695951493086468,
"learning_rate": 4.99512605867577e-06,
"loss": 0.6409,
"step": 166
},
{
"epoch": 0.1464350304159588,
"grad_norm": 0.33224546665642934,
"learning_rate": 4.994977294430856e-06,
"loss": 0.6478,
"step": 167
},
{
"epoch": 0.14731188688551541,
"grad_norm": 0.26336591640433304,
"learning_rate": 4.994826296150064e-06,
"loss": 0.6416,
"step": 168
},
{
"epoch": 0.14818874335507207,
"grad_norm": 0.3158628283831438,
"learning_rate": 4.9946730639686025e-06,
"loss": 0.6397,
"step": 169
},
{
"epoch": 0.14906559982462872,
"grad_norm": 0.29572803602407627,
"learning_rate": 4.9945175980236745e-06,
"loss": 0.6356,
"step": 170
},
{
"epoch": 0.14994245629418534,
"grad_norm": 0.3344536076519792,
"learning_rate": 4.99435989845449e-06,
"loss": 0.6494,
"step": 171
},
{
"epoch": 0.150819312763742,
"grad_norm": 0.2811402499936693,
"learning_rate": 4.994199965402252e-06,
"loss": 0.6472,
"step": 172
},
{
"epoch": 0.15169616923329862,
"grad_norm": 0.30351530565920815,
"learning_rate": 4.994037799010168e-06,
"loss": 0.6514,
"step": 173
},
{
"epoch": 0.15257302570285527,
"grad_norm": 0.2667020904201129,
"learning_rate": 4.993873399423445e-06,
"loss": 0.642,
"step": 174
},
{
"epoch": 0.1534498821724119,
"grad_norm": 0.3062654941965369,
"learning_rate": 4.993706766789287e-06,
"loss": 0.6398,
"step": 175
},
{
"epoch": 0.15432673864196855,
"grad_norm": 0.28228507467929365,
"learning_rate": 4.993537901256898e-06,
"loss": 0.6446,
"step": 176
},
{
"epoch": 0.15520359511152518,
"grad_norm": 0.3157908119401443,
"learning_rate": 4.993366802977486e-06,
"loss": 0.645,
"step": 177
},
{
"epoch": 0.15608045158108183,
"grad_norm": 0.29612114085869035,
"learning_rate": 4.993193472104253e-06,
"loss": 0.6379,
"step": 178
},
{
"epoch": 0.15695730805063846,
"grad_norm": 0.31715005105530436,
"learning_rate": 4.9930179087924e-06,
"loss": 0.6446,
"step": 179
},
{
"epoch": 0.1578341645201951,
"grad_norm": 0.3010974405602859,
"learning_rate": 4.992840113199131e-06,
"loss": 0.6273,
"step": 180
},
{
"epoch": 0.15871102098975173,
"grad_norm": 0.3097310667014726,
"learning_rate": 4.992660085483645e-06,
"loss": 0.6477,
"step": 181
},
{
"epoch": 0.15958787745930839,
"grad_norm": 0.25428924204211556,
"learning_rate": 4.992477825807142e-06,
"loss": 0.6562,
"step": 182
},
{
"epoch": 0.160464733928865,
"grad_norm": 0.30870425916577926,
"learning_rate": 4.992293334332821e-06,
"loss": 0.6528,
"step": 183
},
{
"epoch": 0.16134159039842166,
"grad_norm": 0.2915653234864446,
"learning_rate": 4.992106611225875e-06,
"loss": 0.6491,
"step": 184
},
{
"epoch": 0.1622184468679783,
"grad_norm": 0.3032380988277513,
"learning_rate": 4.991917656653501e-06,
"loss": 0.6523,
"step": 185
},
{
"epoch": 0.16309530333753494,
"grad_norm": 0.2986663700583823,
"learning_rate": 4.991726470784891e-06,
"loss": 0.6333,
"step": 186
},
{
"epoch": 0.16397215980709157,
"grad_norm": 0.28321065505069615,
"learning_rate": 4.9915330537912346e-06,
"loss": 0.6411,
"step": 187
},
{
"epoch": 0.16484901627664822,
"grad_norm": 0.358610834369166,
"learning_rate": 4.99133740584572e-06,
"loss": 0.6404,
"step": 188
},
{
"epoch": 0.16572587274620484,
"grad_norm": 0.30976208589225795,
"learning_rate": 4.991139527123534e-06,
"loss": 0.6405,
"step": 189
},
{
"epoch": 0.1666027292157615,
"grad_norm": 0.34149502314365515,
"learning_rate": 4.990939417801859e-06,
"loss": 0.6384,
"step": 190
},
{
"epoch": 0.16747958568531812,
"grad_norm": 0.2959951500432587,
"learning_rate": 4.9907370780598754e-06,
"loss": 0.6469,
"step": 191
},
{
"epoch": 0.16835644215487477,
"grad_norm": 0.3302476980977895,
"learning_rate": 4.990532508078761e-06,
"loss": 0.6359,
"step": 192
},
{
"epoch": 0.1692332986244314,
"grad_norm": 0.3944297035939378,
"learning_rate": 4.990325708041691e-06,
"loss": 0.6502,
"step": 193
},
{
"epoch": 0.17011015509398805,
"grad_norm": 0.360231124267091,
"learning_rate": 4.990116678133836e-06,
"loss": 0.6424,
"step": 194
},
{
"epoch": 0.1709870115635447,
"grad_norm": 0.33832741778437936,
"learning_rate": 4.989905418542366e-06,
"loss": 0.6352,
"step": 195
},
{
"epoch": 0.17186386803310133,
"grad_norm": 0.36238295597291414,
"learning_rate": 4.989691929456443e-06,
"loss": 0.6499,
"step": 196
},
{
"epoch": 0.17274072450265798,
"grad_norm": 0.32684488652867627,
"learning_rate": 4.98947621106723e-06,
"loss": 0.6475,
"step": 197
},
{
"epoch": 0.1736175809722146,
"grad_norm": 0.2757346118610075,
"learning_rate": 4.989258263567884e-06,
"loss": 0.6355,
"step": 198
},
{
"epoch": 0.17449443744177126,
"grad_norm": 0.29755713041423115,
"learning_rate": 4.989038087153556e-06,
"loss": 0.6336,
"step": 199
},
{
"epoch": 0.17537129391132789,
"grad_norm": 0.29151765698243737,
"learning_rate": 4.988815682021398e-06,
"loss": 0.6471,
"step": 200
},
{
"epoch": 0.17624815038088454,
"grad_norm": 0.28111823253643253,
"learning_rate": 4.988591048370552e-06,
"loss": 0.6407,
"step": 201
},
{
"epoch": 0.17712500685044116,
"grad_norm": 0.2656165957748681,
"learning_rate": 4.988364186402159e-06,
"loss": 0.6326,
"step": 202
},
{
"epoch": 0.17800186331999782,
"grad_norm": 0.3028986715129606,
"learning_rate": 4.988135096319355e-06,
"loss": 0.6348,
"step": 203
},
{
"epoch": 0.17887871978955444,
"grad_norm": 0.29924585956112065,
"learning_rate": 4.987903778327269e-06,
"loss": 0.6488,
"step": 204
},
{
"epoch": 0.1797555762591111,
"grad_norm": 0.2747438588784908,
"learning_rate": 4.987670232633027e-06,
"loss": 0.6353,
"step": 205
},
{
"epoch": 0.18063243272866772,
"grad_norm": 0.30887265845064044,
"learning_rate": 4.987434459445748e-06,
"loss": 0.6428,
"step": 206
},
{
"epoch": 0.18150928919822437,
"grad_norm": 0.3193061834187564,
"learning_rate": 4.987196458976548e-06,
"loss": 0.6467,
"step": 207
},
{
"epoch": 0.182386145667781,
"grad_norm": 0.2769424032566695,
"learning_rate": 4.9869562314385335e-06,
"loss": 0.6407,
"step": 208
},
{
"epoch": 0.18326300213733765,
"grad_norm": 0.3406015148633883,
"learning_rate": 4.986713777046809e-06,
"loss": 0.6443,
"step": 209
},
{
"epoch": 0.18413985860689427,
"grad_norm": 0.271878066659463,
"learning_rate": 4.986469096018472e-06,
"loss": 0.6328,
"step": 210
},
{
"epoch": 0.18501671507645093,
"grad_norm": 0.2987491049335003,
"learning_rate": 4.9862221885726115e-06,
"loss": 0.6478,
"step": 211
},
{
"epoch": 0.18589357154600755,
"grad_norm": 0.3087618217189243,
"learning_rate": 4.985973054930313e-06,
"loss": 0.6363,
"step": 212
},
{
"epoch": 0.1867704280155642,
"grad_norm": 0.28612704652497223,
"learning_rate": 4.985721695314653e-06,
"loss": 0.6409,
"step": 213
},
{
"epoch": 0.18764728448512083,
"grad_norm": 0.26033127989473615,
"learning_rate": 4.985468109950704e-06,
"loss": 0.6495,
"step": 214
},
{
"epoch": 0.18852414095467748,
"grad_norm": 0.29345494621139656,
"learning_rate": 4.985212299065528e-06,
"loss": 0.648,
"step": 215
},
{
"epoch": 0.1894009974242341,
"grad_norm": 0.30811406203792147,
"learning_rate": 4.984954262888182e-06,
"loss": 0.639,
"step": 216
},
{
"epoch": 0.19027785389379076,
"grad_norm": 0.3312828084167346,
"learning_rate": 4.9846940016497146e-06,
"loss": 0.6403,
"step": 217
},
{
"epoch": 0.1911547103633474,
"grad_norm": 0.29106752415257064,
"learning_rate": 4.984431515583169e-06,
"loss": 0.6457,
"step": 218
},
{
"epoch": 0.19203156683290404,
"grad_norm": 0.2950307203873666,
"learning_rate": 4.984166804923576e-06,
"loss": 0.6366,
"step": 219
},
{
"epoch": 0.1929084233024607,
"grad_norm": 0.33001978484003053,
"learning_rate": 4.983899869907963e-06,
"loss": 0.6519,
"step": 220
},
{
"epoch": 0.19378527977201732,
"grad_norm": 0.25712182858786903,
"learning_rate": 4.983630710775346e-06,
"loss": 0.6302,
"step": 221
},
{
"epoch": 0.19466213624157397,
"grad_norm": 0.33700258932320354,
"learning_rate": 4.983359327766735e-06,
"loss": 0.6382,
"step": 222
},
{
"epoch": 0.1955389927111306,
"grad_norm": 0.3195952299259763,
"learning_rate": 4.983085721125128e-06,
"loss": 0.6408,
"step": 223
},
{
"epoch": 0.19641584918068725,
"grad_norm": 0.2820582636542398,
"learning_rate": 4.982809891095519e-06,
"loss": 0.6196,
"step": 224
},
{
"epoch": 0.19729270565024387,
"grad_norm": 0.30343326038998625,
"learning_rate": 4.982531837924887e-06,
"loss": 0.6361,
"step": 225
},
{
"epoch": 0.19816956211980052,
"grad_norm": 0.2724213298701267,
"learning_rate": 4.9822515618622055e-06,
"loss": 0.6455,
"step": 226
},
{
"epoch": 0.19904641858935715,
"grad_norm": 0.28433275446155476,
"learning_rate": 4.9819690631584375e-06,
"loss": 0.6329,
"step": 227
},
{
"epoch": 0.1999232750589138,
"grad_norm": 0.2641523923467397,
"learning_rate": 4.981684342066536e-06,
"loss": 0.6301,
"step": 228
},
{
"epoch": 0.20080013152847043,
"grad_norm": 0.29243768749633176,
"learning_rate": 4.9813973988414454e-06,
"loss": 0.6369,
"step": 229
},
{
"epoch": 0.20167698799802708,
"grad_norm": 0.27139535071517695,
"learning_rate": 4.981108233740096e-06,
"loss": 0.6279,
"step": 230
},
{
"epoch": 0.2025538444675837,
"grad_norm": 0.27525475223350887,
"learning_rate": 4.980816847021412e-06,
"loss": 0.6429,
"step": 231
},
{
"epoch": 0.20343070093714036,
"grad_norm": 0.3427701449667448,
"learning_rate": 4.980523238946304e-06,
"loss": 0.6438,
"step": 232
},
{
"epoch": 0.20430755740669698,
"grad_norm": 0.2574596630900604,
"learning_rate": 4.980227409777673e-06,
"loss": 0.6278,
"step": 233
},
{
"epoch": 0.20518441387625364,
"grad_norm": 0.3069435432493287,
"learning_rate": 4.9799293597804086e-06,
"loss": 0.645,
"step": 234
},
{
"epoch": 0.20606127034581026,
"grad_norm": 0.2861360169316533,
"learning_rate": 4.979629089221387e-06,
"loss": 0.646,
"step": 235
},
{
"epoch": 0.2069381268153669,
"grad_norm": 0.258606470239814,
"learning_rate": 4.9793265983694775e-06,
"loss": 0.638,
"step": 236
},
{
"epoch": 0.20781498328492354,
"grad_norm": 0.2852233202848665,
"learning_rate": 4.9790218874955325e-06,
"loss": 0.6233,
"step": 237
},
{
"epoch": 0.2086918397544802,
"grad_norm": 0.27593128237727194,
"learning_rate": 4.978714956872394e-06,
"loss": 0.64,
"step": 238
},
{
"epoch": 0.20956869622403682,
"grad_norm": 0.2721892419938629,
"learning_rate": 4.978405806774892e-06,
"loss": 0.6242,
"step": 239
},
{
"epoch": 0.21044555269359347,
"grad_norm": 0.26477694173686633,
"learning_rate": 4.978094437479843e-06,
"loss": 0.6409,
"step": 240
},
{
"epoch": 0.2113224091631501,
"grad_norm": 0.29511740452877416,
"learning_rate": 4.977780849266054e-06,
"loss": 0.6397,
"step": 241
},
{
"epoch": 0.21219926563270675,
"grad_norm": 0.3137075106480887,
"learning_rate": 4.977465042414314e-06,
"loss": 0.6185,
"step": 242
},
{
"epoch": 0.2130761221022634,
"grad_norm": 0.2841757272525764,
"learning_rate": 4.9771470172073985e-06,
"loss": 0.6394,
"step": 243
},
{
"epoch": 0.21395297857182002,
"grad_norm": 0.289636229771129,
"learning_rate": 4.976826773930076e-06,
"loss": 0.6314,
"step": 244
},
{
"epoch": 0.21482983504137668,
"grad_norm": 0.30163996035868273,
"learning_rate": 4.976504312869093e-06,
"loss": 0.6347,
"step": 245
},
{
"epoch": 0.2157066915109333,
"grad_norm": 0.261372963985366,
"learning_rate": 4.976179634313187e-06,
"loss": 0.6378,
"step": 246
},
{
"epoch": 0.21658354798048995,
"grad_norm": 0.3277256326536918,
"learning_rate": 4.97585273855308e-06,
"loss": 0.6326,
"step": 247
},
{
"epoch": 0.21746040445004658,
"grad_norm": 0.2609300415027874,
"learning_rate": 4.975523625881478e-06,
"loss": 0.643,
"step": 248
},
{
"epoch": 0.21833726091960323,
"grad_norm": 0.360435554160976,
"learning_rate": 4.975192296593072e-06,
"loss": 0.6301,
"step": 249
},
{
"epoch": 0.21921411738915986,
"grad_norm": 0.33545569496984357,
"learning_rate": 4.97485875098454e-06,
"loss": 0.6263,
"step": 250
},
{
"epoch": 0.2200909738587165,
"grad_norm": 0.3109257543138659,
"learning_rate": 4.974522989354544e-06,
"loss": 0.6409,
"step": 251
},
{
"epoch": 0.22096783032827313,
"grad_norm": 0.324992218124581,
"learning_rate": 4.974185012003727e-06,
"loss": 0.634,
"step": 252
},
{
"epoch": 0.2218446867978298,
"grad_norm": 0.32486130027399085,
"learning_rate": 4.97384481923472e-06,
"loss": 0.6164,
"step": 253
},
{
"epoch": 0.2227215432673864,
"grad_norm": 0.37258515700556377,
"learning_rate": 4.973502411352136e-06,
"loss": 0.6387,
"step": 254
},
{
"epoch": 0.22359839973694307,
"grad_norm": 0.29043553996012594,
"learning_rate": 4.97315778866257e-06,
"loss": 0.6287,
"step": 255
},
{
"epoch": 0.2244752562064997,
"grad_norm": 0.36257038619483317,
"learning_rate": 4.972810951474605e-06,
"loss": 0.6343,
"step": 256
},
{
"epoch": 0.22535211267605634,
"grad_norm": 0.2772793728031826,
"learning_rate": 4.972461900098801e-06,
"loss": 0.6289,
"step": 257
},
{
"epoch": 0.22622896914561297,
"grad_norm": 0.35920004083908574,
"learning_rate": 4.972110634847703e-06,
"loss": 0.6532,
"step": 258
},
{
"epoch": 0.22710582561516962,
"grad_norm": 0.29471007707943336,
"learning_rate": 4.97175715603584e-06,
"loss": 0.6431,
"step": 259
},
{
"epoch": 0.22798268208472625,
"grad_norm": 0.3052965075835166,
"learning_rate": 4.971401463979722e-06,
"loss": 0.6373,
"step": 260
},
{
"epoch": 0.2288595385542829,
"grad_norm": 0.27702925326859024,
"learning_rate": 4.971043558997839e-06,
"loss": 0.6254,
"step": 261
},
{
"epoch": 0.22973639502383952,
"grad_norm": 0.30905022457424325,
"learning_rate": 4.9706834414106645e-06,
"loss": 0.6377,
"step": 262
},
{
"epoch": 0.23061325149339618,
"grad_norm": 0.2820956276882666,
"learning_rate": 4.970321111540652e-06,
"loss": 0.6303,
"step": 263
},
{
"epoch": 0.2314901079629528,
"grad_norm": 0.3394900289735489,
"learning_rate": 4.969956569712238e-06,
"loss": 0.6394,
"step": 264
},
{
"epoch": 0.23236696443250945,
"grad_norm": 0.26647926556067275,
"learning_rate": 4.969589816251837e-06,
"loss": 0.6202,
"step": 265
},
{
"epoch": 0.2332438209020661,
"grad_norm": 0.3281231898594553,
"learning_rate": 4.9692208514878445e-06,
"loss": 0.6343,
"step": 266
},
{
"epoch": 0.23412067737162273,
"grad_norm": 0.32675488207496506,
"learning_rate": 4.968849675750638e-06,
"loss": 0.6106,
"step": 267
},
{
"epoch": 0.23499753384117938,
"grad_norm": 0.28838375524590465,
"learning_rate": 4.9684762893725715e-06,
"loss": 0.6191,
"step": 268
},
{
"epoch": 0.235874390310736,
"grad_norm": 0.3568027126734991,
"learning_rate": 4.968100692687981e-06,
"loss": 0.6492,
"step": 269
},
{
"epoch": 0.23675124678029266,
"grad_norm": 0.28443576918161984,
"learning_rate": 4.967722886033181e-06,
"loss": 0.6332,
"step": 270
},
{
"epoch": 0.2376281032498493,
"grad_norm": 0.34347891151295074,
"learning_rate": 4.967342869746463e-06,
"loss": 0.6302,
"step": 271
},
{
"epoch": 0.23850495971940594,
"grad_norm": 0.26856199334324765,
"learning_rate": 4.9669606441681005e-06,
"loss": 0.6253,
"step": 272
},
{
"epoch": 0.23938181618896257,
"grad_norm": 0.28792821400673596,
"learning_rate": 4.966576209640344e-06,
"loss": 0.617,
"step": 273
},
{
"epoch": 0.24025867265851922,
"grad_norm": 0.2749481611356667,
"learning_rate": 4.966189566507418e-06,
"loss": 0.6386,
"step": 274
},
{
"epoch": 0.24113552912807584,
"grad_norm": 0.2499995559979677,
"learning_rate": 4.965800715115531e-06,
"loss": 0.6281,
"step": 275
},
{
"epoch": 0.2420123855976325,
"grad_norm": 0.2802197876098476,
"learning_rate": 4.965409655812865e-06,
"loss": 0.6356,
"step": 276
},
{
"epoch": 0.24288924206718912,
"grad_norm": 0.27112050232805884,
"learning_rate": 4.965016388949579e-06,
"loss": 0.6366,
"step": 277
},
{
"epoch": 0.24376609853674577,
"grad_norm": 0.28745747065199806,
"learning_rate": 4.96462091487781e-06,
"loss": 0.6245,
"step": 278
},
{
"epoch": 0.2446429550063024,
"grad_norm": 0.29635776688822807,
"learning_rate": 4.96422323395167e-06,
"loss": 0.6413,
"step": 279
},
{
"epoch": 0.24551981147585905,
"grad_norm": 0.3376283192201481,
"learning_rate": 4.963823346527249e-06,
"loss": 0.6322,
"step": 280
},
{
"epoch": 0.24639666794541568,
"grad_norm": 0.30520044326595835,
"learning_rate": 4.96342125296261e-06,
"loss": 0.6173,
"step": 281
},
{
"epoch": 0.24727352441497233,
"grad_norm": 0.34476437566601653,
"learning_rate": 4.963016953617794e-06,
"loss": 0.6172,
"step": 282
},
{
"epoch": 0.24815038088452895,
"grad_norm": 0.2611205789369605,
"learning_rate": 4.962610448854816e-06,
"loss": 0.6246,
"step": 283
},
{
"epoch": 0.2490272373540856,
"grad_norm": 0.3294938430549001,
"learning_rate": 4.962201739037665e-06,
"loss": 0.632,
"step": 284
},
{
"epoch": 0.24990409382364223,
"grad_norm": 0.2716869569081184,
"learning_rate": 4.961790824532306e-06,
"loss": 0.6285,
"step": 285
},
{
"epoch": 0.25078095029319886,
"grad_norm": 0.33415021484488,
"learning_rate": 4.961377705706677e-06,
"loss": 0.6295,
"step": 286
},
{
"epoch": 0.2516578067627555,
"grad_norm": 0.3077857421614378,
"learning_rate": 4.960962382930691e-06,
"loss": 0.6273,
"step": 287
},
{
"epoch": 0.25253466323231216,
"grad_norm": 0.3027918805177667,
"learning_rate": 4.960544856576232e-06,
"loss": 0.629,
"step": 288
},
{
"epoch": 0.2534115197018688,
"grad_norm": 0.2916258020649895,
"learning_rate": 4.960125127017159e-06,
"loss": 0.6427,
"step": 289
},
{
"epoch": 0.2542883761714254,
"grad_norm": 0.3152484231550671,
"learning_rate": 4.959703194629304e-06,
"loss": 0.6348,
"step": 290
},
{
"epoch": 0.25516523264098206,
"grad_norm": 0.32915709407999866,
"learning_rate": 4.959279059790471e-06,
"loss": 0.632,
"step": 291
},
{
"epoch": 0.2560420891105387,
"grad_norm": 0.2817567268029023,
"learning_rate": 4.958852722880435e-06,
"loss": 0.6112,
"step": 292
},
{
"epoch": 0.25691894558009537,
"grad_norm": 0.3538236182060425,
"learning_rate": 4.958424184280946e-06,
"loss": 0.6241,
"step": 293
},
{
"epoch": 0.257795802049652,
"grad_norm": 0.2864183700965389,
"learning_rate": 4.957993444375719e-06,
"loss": 0.6277,
"step": 294
},
{
"epoch": 0.2586726585192086,
"grad_norm": 0.33515303575483923,
"learning_rate": 4.95756050355045e-06,
"loss": 0.6277,
"step": 295
},
{
"epoch": 0.2595495149887653,
"grad_norm": 0.31975746198582533,
"learning_rate": 4.957125362192794e-06,
"loss": 0.6114,
"step": 296
},
{
"epoch": 0.2604263714583219,
"grad_norm": 0.34329553758734277,
"learning_rate": 4.956688020692386e-06,
"loss": 0.6457,
"step": 297
},
{
"epoch": 0.2613032279278786,
"grad_norm": 0.3122307785419701,
"learning_rate": 4.956248479440827e-06,
"loss": 0.6272,
"step": 298
},
{
"epoch": 0.2621800843974352,
"grad_norm": 0.3126439049869492,
"learning_rate": 4.955806738831687e-06,
"loss": 0.634,
"step": 299
},
{
"epoch": 0.26305694086699183,
"grad_norm": 0.30725526373905826,
"learning_rate": 4.955362799260507e-06,
"loss": 0.6269,
"step": 300
},
{
"epoch": 0.2639337973365485,
"grad_norm": 0.2952615284346605,
"learning_rate": 4.954916661124797e-06,
"loss": 0.6129,
"step": 301
},
{
"epoch": 0.26481065380610513,
"grad_norm": 0.3284069744839045,
"learning_rate": 4.954468324824035e-06,
"loss": 0.613,
"step": 302
},
{
"epoch": 0.26568751027566173,
"grad_norm": 0.34051928196991404,
"learning_rate": 4.954017790759666e-06,
"loss": 0.6192,
"step": 303
},
{
"epoch": 0.2665643667452184,
"grad_norm": 0.30608255552211977,
"learning_rate": 4.953565059335104e-06,
"loss": 0.6244,
"step": 304
},
{
"epoch": 0.26744122321477504,
"grad_norm": 0.31501722301988566,
"learning_rate": 4.953110130955733e-06,
"loss": 0.6236,
"step": 305
},
{
"epoch": 0.2683180796843317,
"grad_norm": 0.2978345978834651,
"learning_rate": 4.9526530060289e-06,
"loss": 0.6254,
"step": 306
},
{
"epoch": 0.2691949361538883,
"grad_norm": 0.2935986604058687,
"learning_rate": 4.952193684963922e-06,
"loss": 0.6113,
"step": 307
},
{
"epoch": 0.27007179262344494,
"grad_norm": 0.294670736028252,
"learning_rate": 4.95173216817208e-06,
"loss": 0.6335,
"step": 308
},
{
"epoch": 0.2709486490930016,
"grad_norm": 0.2746280487759909,
"learning_rate": 4.951268456066623e-06,
"loss": 0.6211,
"step": 309
},
{
"epoch": 0.27182550556255825,
"grad_norm": 0.2823209312944346,
"learning_rate": 4.950802549062764e-06,
"loss": 0.621,
"step": 310
},
{
"epoch": 0.27270236203211484,
"grad_norm": 0.2811005060766513,
"learning_rate": 4.950334447577685e-06,
"loss": 0.6291,
"step": 311
},
{
"epoch": 0.2735792185016715,
"grad_norm": 0.31377780747479117,
"learning_rate": 4.9498641520305264e-06,
"loss": 0.6308,
"step": 312
},
{
"epoch": 0.27445607497122815,
"grad_norm": 0.263859895152384,
"learning_rate": 4.949391662842401e-06,
"loss": 0.6238,
"step": 313
},
{
"epoch": 0.2753329314407848,
"grad_norm": 0.3124591272767995,
"learning_rate": 4.948916980436379e-06,
"loss": 0.6254,
"step": 314
},
{
"epoch": 0.27620978791034145,
"grad_norm": 0.2762091249470148,
"learning_rate": 4.948440105237499e-06,
"loss": 0.6297,
"step": 315
},
{
"epoch": 0.27708664437989805,
"grad_norm": 0.30510467983773004,
"learning_rate": 4.947961037672761e-06,
"loss": 0.6301,
"step": 316
},
{
"epoch": 0.2779635008494547,
"grad_norm": 0.2894218681866538,
"learning_rate": 4.947479778171127e-06,
"loss": 0.6215,
"step": 317
},
{
"epoch": 0.27884035731901136,
"grad_norm": 0.278604444379188,
"learning_rate": 4.946996327163526e-06,
"loss": 0.6193,
"step": 318
},
{
"epoch": 0.279717213788568,
"grad_norm": 0.29226196825962947,
"learning_rate": 4.946510685082844e-06,
"loss": 0.6205,
"step": 319
},
{
"epoch": 0.2805940702581246,
"grad_norm": 0.2956824922950759,
"learning_rate": 4.946022852363932e-06,
"loss": 0.6238,
"step": 320
},
{
"epoch": 0.28147092672768126,
"grad_norm": 0.28796938907697983,
"learning_rate": 4.945532829443604e-06,
"loss": 0.6176,
"step": 321
},
{
"epoch": 0.2823477831972379,
"grad_norm": 0.2688847498978228,
"learning_rate": 4.945040616760629e-06,
"loss": 0.6178,
"step": 322
},
{
"epoch": 0.28322463966679456,
"grad_norm": 0.3167327299209847,
"learning_rate": 4.944546214755744e-06,
"loss": 0.6315,
"step": 323
},
{
"epoch": 0.28410149613635116,
"grad_norm": 0.28346482132020456,
"learning_rate": 4.9440496238716415e-06,
"loss": 0.6281,
"step": 324
},
{
"epoch": 0.2849783526059078,
"grad_norm": 0.2862108698161924,
"learning_rate": 4.943550844552978e-06,
"loss": 0.6445,
"step": 325
},
{
"epoch": 0.28585520907546447,
"grad_norm": 0.3168994194030117,
"learning_rate": 4.943049877246363e-06,
"loss": 0.6336,
"step": 326
},
{
"epoch": 0.2867320655450211,
"grad_norm": 0.3098419113094991,
"learning_rate": 4.942546722400373e-06,
"loss": 0.6194,
"step": 327
},
{
"epoch": 0.2876089220145777,
"grad_norm": 0.3076330226750193,
"learning_rate": 4.942041380465539e-06,
"loss": 0.6332,
"step": 328
},
{
"epoch": 0.28848577848413437,
"grad_norm": 0.3073675940253473,
"learning_rate": 4.941533851894349e-06,
"loss": 0.6329,
"step": 329
},
{
"epoch": 0.289362634953691,
"grad_norm": 0.27407015238515836,
"learning_rate": 4.9410241371412525e-06,
"loss": 0.6292,
"step": 330
},
{
"epoch": 0.2902394914232477,
"grad_norm": 0.3233677059379673,
"learning_rate": 4.9405122366626545e-06,
"loss": 0.6407,
"step": 331
},
{
"epoch": 0.2911163478928043,
"grad_norm": 0.3056326849325438,
"learning_rate": 4.939998150916917e-06,
"loss": 0.6314,
"step": 332
},
{
"epoch": 0.2919932043623609,
"grad_norm": 0.3140138519054107,
"learning_rate": 4.93948188036436e-06,
"loss": 0.6583,
"step": 333
},
{
"epoch": 0.2928700608319176,
"grad_norm": 0.2967689552064628,
"learning_rate": 4.938963425467258e-06,
"loss": 0.6349,
"step": 334
},
{
"epoch": 0.29374691730147423,
"grad_norm": 0.35320572702474673,
"learning_rate": 4.938442786689843e-06,
"loss": 0.6248,
"step": 335
},
{
"epoch": 0.29462377377103083,
"grad_norm": 0.2958836632865014,
"learning_rate": 4.9379199644983025e-06,
"loss": 0.6255,
"step": 336
},
{
"epoch": 0.2955006302405875,
"grad_norm": 0.3054952399371344,
"learning_rate": 4.937394959360777e-06,
"loss": 0.6119,
"step": 337
},
{
"epoch": 0.29637748671014413,
"grad_norm": 0.34308383177638463,
"learning_rate": 4.9368677717473645e-06,
"loss": 0.6468,
"step": 338
},
{
"epoch": 0.2972543431797008,
"grad_norm": 0.2648620374237178,
"learning_rate": 4.936338402130115e-06,
"loss": 0.6203,
"step": 339
},
{
"epoch": 0.29813119964925744,
"grad_norm": 0.2976099930186866,
"learning_rate": 4.935806850983034e-06,
"loss": 0.6348,
"step": 340
},
{
"epoch": 0.29900805611881404,
"grad_norm": 0.285144357181017,
"learning_rate": 4.935273118782078e-06,
"loss": 0.6115,
"step": 341
},
{
"epoch": 0.2998849125883707,
"grad_norm": 0.3079688238524965,
"learning_rate": 4.934737206005159e-06,
"loss": 0.6254,
"step": 342
},
{
"epoch": 0.30076176905792734,
"grad_norm": 0.27719094781494596,
"learning_rate": 4.93419911313214e-06,
"loss": 0.6386,
"step": 343
},
{
"epoch": 0.301638625527484,
"grad_norm": 0.29796636665366355,
"learning_rate": 4.933658840644837e-06,
"loss": 0.6268,
"step": 344
},
{
"epoch": 0.3025154819970406,
"grad_norm": 0.27509893042636935,
"learning_rate": 4.933116389027017e-06,
"loss": 0.621,
"step": 345
},
{
"epoch": 0.30339233846659724,
"grad_norm": 0.31224342373584874,
"learning_rate": 4.932571758764398e-06,
"loss": 0.6312,
"step": 346
},
{
"epoch": 0.3042691949361539,
"grad_norm": 0.2689144896057607,
"learning_rate": 4.93202495034465e-06,
"loss": 0.6115,
"step": 347
},
{
"epoch": 0.30514605140571055,
"grad_norm": 0.2558266510993566,
"learning_rate": 4.931475964257391e-06,
"loss": 0.6245,
"step": 348
},
{
"epoch": 0.30602290787526715,
"grad_norm": 0.25500762407211314,
"learning_rate": 4.930924800994192e-06,
"loss": 0.6091,
"step": 349
},
{
"epoch": 0.3068997643448238,
"grad_norm": 0.2717131638453367,
"learning_rate": 4.9303714610485705e-06,
"loss": 0.6281,
"step": 350
},
{
"epoch": 0.30777662081438045,
"grad_norm": 0.2729400616989181,
"learning_rate": 4.929815944915997e-06,
"loss": 0.6083,
"step": 351
},
{
"epoch": 0.3086534772839371,
"grad_norm": 0.26000631857019024,
"learning_rate": 4.929258253093885e-06,
"loss": 0.6198,
"step": 352
},
{
"epoch": 0.3095303337534937,
"grad_norm": 0.2740884453189882,
"learning_rate": 4.9286983860816e-06,
"loss": 0.6338,
"step": 353
},
{
"epoch": 0.31040719022305036,
"grad_norm": 0.27150990388252366,
"learning_rate": 4.928136344380457e-06,
"loss": 0.6162,
"step": 354
},
{
"epoch": 0.311284046692607,
"grad_norm": 0.26286571771385,
"learning_rate": 4.9275721284937115e-06,
"loss": 0.629,
"step": 355
},
{
"epoch": 0.31216090316216366,
"grad_norm": 0.27510252961865267,
"learning_rate": 4.9270057389265734e-06,
"loss": 0.633,
"step": 356
},
{
"epoch": 0.31303775963172026,
"grad_norm": 0.2825214790660817,
"learning_rate": 4.926437176186193e-06,
"loss": 0.6263,
"step": 357
},
{
"epoch": 0.3139146161012769,
"grad_norm": 0.29292375908331497,
"learning_rate": 4.92586644078167e-06,
"loss": 0.6313,
"step": 358
},
{
"epoch": 0.31479147257083356,
"grad_norm": 0.2760563004495057,
"learning_rate": 4.925293533224049e-06,
"loss": 0.6174,
"step": 359
},
{
"epoch": 0.3156683290403902,
"grad_norm": 0.29078508943452525,
"learning_rate": 4.924718454026318e-06,
"loss": 0.6156,
"step": 360
},
{
"epoch": 0.3165451855099468,
"grad_norm": 0.2878769173523044,
"learning_rate": 4.924141203703412e-06,
"loss": 0.6047,
"step": 361
},
{
"epoch": 0.31742204197950347,
"grad_norm": 0.27485843884417593,
"learning_rate": 4.923561782772206e-06,
"loss": 0.6293,
"step": 362
},
{
"epoch": 0.3182988984490601,
"grad_norm": 0.2865164028316351,
"learning_rate": 4.922980191751524e-06,
"loss": 0.6269,
"step": 363
},
{
"epoch": 0.31917575491861677,
"grad_norm": 0.27991173694279825,
"learning_rate": 4.922396431162129e-06,
"loss": 0.6143,
"step": 364
},
{
"epoch": 0.3200526113881734,
"grad_norm": 0.279639353480309,
"learning_rate": 4.921810501526728e-06,
"loss": 0.635,
"step": 365
},
{
"epoch": 0.32092946785773,
"grad_norm": 0.2830142803081013,
"learning_rate": 4.921222403369971e-06,
"loss": 0.6157,
"step": 366
},
{
"epoch": 0.3218063243272867,
"grad_norm": 0.2684155306717856,
"learning_rate": 4.920632137218447e-06,
"loss": 0.6294,
"step": 367
},
{
"epoch": 0.3226831807968433,
"grad_norm": 0.2983455576981931,
"learning_rate": 4.920039703600691e-06,
"loss": 0.624,
"step": 368
},
{
"epoch": 0.3235600372664,
"grad_norm": 0.2948947231333358,
"learning_rate": 4.9194451030471735e-06,
"loss": 0.6102,
"step": 369
},
{
"epoch": 0.3244368937359566,
"grad_norm": 0.2826890911442374,
"learning_rate": 4.918848336090309e-06,
"loss": 0.6236,
"step": 370
},
{
"epoch": 0.32531375020551323,
"grad_norm": 0.32269493597939386,
"learning_rate": 4.91824940326445e-06,
"loss": 0.6139,
"step": 371
},
{
"epoch": 0.3261906066750699,
"grad_norm": 0.2734983777513044,
"learning_rate": 4.91764830510589e-06,
"loss": 0.6166,
"step": 372
},
{
"epoch": 0.32706746314462654,
"grad_norm": 0.36983262498880637,
"learning_rate": 4.917045042152858e-06,
"loss": 0.6186,
"step": 373
},
{
"epoch": 0.32794431961418313,
"grad_norm": 0.2751996219950251,
"learning_rate": 4.916439614945527e-06,
"loss": 0.6412,
"step": 374
},
{
"epoch": 0.3288211760837398,
"grad_norm": 0.319865198714037,
"learning_rate": 4.915832024026002e-06,
"loss": 0.627,
"step": 375
},
{
"epoch": 0.32969803255329644,
"grad_norm": 0.29823421688781576,
"learning_rate": 4.915222269938328e-06,
"loss": 0.6181,
"step": 376
},
{
"epoch": 0.3305748890228531,
"grad_norm": 0.27335542421500575,
"learning_rate": 4.914610353228488e-06,
"loss": 0.6202,
"step": 377
},
{
"epoch": 0.3314517454924097,
"grad_norm": 0.3824213724235341,
"learning_rate": 4.913996274444401e-06,
"loss": 0.608,
"step": 378
},
{
"epoch": 0.33232860196196634,
"grad_norm": 0.3269271239671324,
"learning_rate": 4.913380034135919e-06,
"loss": 0.6229,
"step": 379
},
{
"epoch": 0.333205458431523,
"grad_norm": 0.2832871290462529,
"learning_rate": 4.912761632854834e-06,
"loss": 0.618,
"step": 380
},
{
"epoch": 0.33408231490107965,
"grad_norm": 0.329936751234759,
"learning_rate": 4.912141071154869e-06,
"loss": 0.6231,
"step": 381
},
{
"epoch": 0.33495917137063624,
"grad_norm": 0.2752693680315103,
"learning_rate": 4.911518349591685e-06,
"loss": 0.6234,
"step": 382
},
{
"epoch": 0.3358360278401929,
"grad_norm": 0.3136704903953731,
"learning_rate": 4.9108934687228735e-06,
"loss": 0.6248,
"step": 383
},
{
"epoch": 0.33671288430974955,
"grad_norm": 0.2947450161853734,
"learning_rate": 4.910266429107962e-06,
"loss": 0.6291,
"step": 384
},
{
"epoch": 0.3375897407793062,
"grad_norm": 0.27963622109645897,
"learning_rate": 4.90963723130841e-06,
"loss": 0.6168,
"step": 385
},
{
"epoch": 0.3384665972488628,
"grad_norm": 0.2755048673546131,
"learning_rate": 4.90900587588761e-06,
"loss": 0.6022,
"step": 386
},
{
"epoch": 0.33934345371841945,
"grad_norm": 0.28857281828902753,
"learning_rate": 4.908372363410886e-06,
"loss": 0.6254,
"step": 387
},
{
"epoch": 0.3402203101879761,
"grad_norm": 0.28648556573019374,
"learning_rate": 4.907736694445492e-06,
"loss": 0.6175,
"step": 388
},
{
"epoch": 0.34109716665753276,
"grad_norm": 0.26925532018377424,
"learning_rate": 4.9070988695606156e-06,
"loss": 0.6176,
"step": 389
},
{
"epoch": 0.3419740231270894,
"grad_norm": 0.2832182299890066,
"learning_rate": 4.906458889327375e-06,
"loss": 0.6291,
"step": 390
},
{
"epoch": 0.342850879596646,
"grad_norm": 0.24545023229724808,
"learning_rate": 4.905816754318815e-06,
"loss": 0.621,
"step": 391
},
{
"epoch": 0.34372773606620266,
"grad_norm": 0.27071805276574584,
"learning_rate": 4.905172465109912e-06,
"loss": 0.6235,
"step": 392
},
{
"epoch": 0.3446045925357593,
"grad_norm": 0.2686211222363871,
"learning_rate": 4.904526022277572e-06,
"loss": 0.6259,
"step": 393
},
{
"epoch": 0.34548144900531597,
"grad_norm": 0.2788582786567745,
"learning_rate": 4.903877426400629e-06,
"loss": 0.6113,
"step": 394
},
{
"epoch": 0.34635830547487256,
"grad_norm": 0.2882303517807228,
"learning_rate": 4.903226678059842e-06,
"loss": 0.6325,
"step": 395
},
{
"epoch": 0.3472351619444292,
"grad_norm": 0.26417391198725343,
"learning_rate": 4.902573777837902e-06,
"loss": 0.6171,
"step": 396
},
{
"epoch": 0.34811201841398587,
"grad_norm": 0.27931172516771346,
"learning_rate": 4.901918726319424e-06,
"loss": 0.6041,
"step": 397
},
{
"epoch": 0.3489888748835425,
"grad_norm": 0.24713049818043734,
"learning_rate": 4.901261524090949e-06,
"loss": 0.6099,
"step": 398
},
{
"epoch": 0.3498657313530991,
"grad_norm": 0.29086241382146505,
"learning_rate": 4.900602171740946e-06,
"loss": 0.6258,
"step": 399
},
{
"epoch": 0.35074258782265577,
"grad_norm": 0.26291418203363,
"learning_rate": 4.899940669859807e-06,
"loss": 0.6117,
"step": 400
},
{
"epoch": 0.3516194442922124,
"grad_norm": 0.3216617316096804,
"learning_rate": 4.89927701903985e-06,
"loss": 0.6187,
"step": 401
},
{
"epoch": 0.3524963007617691,
"grad_norm": 0.27295463776878537,
"learning_rate": 4.898611219875316e-06,
"loss": 0.6132,
"step": 402
},
{
"epoch": 0.3533731572313257,
"grad_norm": 0.2853334578601736,
"learning_rate": 4.897943272962372e-06,
"loss": 0.6148,
"step": 403
},
{
"epoch": 0.3542500137008823,
"grad_norm": 0.31932832747253076,
"learning_rate": 4.897273178899105e-06,
"loss": 0.6187,
"step": 404
},
{
"epoch": 0.355126870170439,
"grad_norm": 0.28031643219296354,
"learning_rate": 4.896600938285526e-06,
"loss": 0.6236,
"step": 405
},
{
"epoch": 0.35600372663999563,
"grad_norm": 0.26831626886851945,
"learning_rate": 4.89592655172357e-06,
"loss": 0.6102,
"step": 406
},
{
"epoch": 0.35688058310955223,
"grad_norm": 0.2951228212133584,
"learning_rate": 4.895250019817089e-06,
"loss": 0.6164,
"step": 407
},
{
"epoch": 0.3577574395791089,
"grad_norm": 0.27330142007513136,
"learning_rate": 4.894571343171862e-06,
"loss": 0.6023,
"step": 408
},
{
"epoch": 0.35863429604866554,
"grad_norm": 0.3204620119402923,
"learning_rate": 4.893890522395582e-06,
"loss": 0.62,
"step": 409
},
{
"epoch": 0.3595111525182222,
"grad_norm": 0.261478566125417,
"learning_rate": 4.893207558097867e-06,
"loss": 0.6294,
"step": 410
},
{
"epoch": 0.36038800898777884,
"grad_norm": 0.250895473885103,
"learning_rate": 4.892522450890251e-06,
"loss": 0.6152,
"step": 411
},
{
"epoch": 0.36126486545733544,
"grad_norm": 0.2634865561040139,
"learning_rate": 4.89183520138619e-06,
"loss": 0.6157,
"step": 412
},
{
"epoch": 0.3621417219268921,
"grad_norm": 0.26459491662331874,
"learning_rate": 4.891145810201054e-06,
"loss": 0.609,
"step": 413
},
{
"epoch": 0.36301857839644874,
"grad_norm": 0.24301745655990745,
"learning_rate": 4.8904542779521346e-06,
"loss": 0.6082,
"step": 414
},
{
"epoch": 0.3638954348660054,
"grad_norm": 0.2692643109083729,
"learning_rate": 4.8897606052586384e-06,
"loss": 0.6226,
"step": 415
},
{
"epoch": 0.364772291335562,
"grad_norm": 0.24024671108707563,
"learning_rate": 4.889064792741689e-06,
"loss": 0.6153,
"step": 416
},
{
"epoch": 0.36564914780511865,
"grad_norm": 0.273288282597359,
"learning_rate": 4.888366841024327e-06,
"loss": 0.6334,
"step": 417
},
{
"epoch": 0.3665260042746753,
"grad_norm": 0.2713735341001686,
"learning_rate": 4.887666750731507e-06,
"loss": 0.6204,
"step": 418
},
{
"epoch": 0.36740286074423195,
"grad_norm": 0.2749014394381958,
"learning_rate": 4.8869645224901e-06,
"loss": 0.6017,
"step": 419
},
{
"epoch": 0.36827971721378855,
"grad_norm": 0.27621114898765087,
"learning_rate": 4.8862601569288885e-06,
"loss": 0.6193,
"step": 420
},
{
"epoch": 0.3691565736833452,
"grad_norm": 0.25931507650511326,
"learning_rate": 4.885553654678573e-06,
"loss": 0.6233,
"step": 421
},
{
"epoch": 0.37003343015290185,
"grad_norm": 0.28686169175433923,
"learning_rate": 4.884845016371763e-06,
"loss": 0.6197,
"step": 422
},
{
"epoch": 0.3709102866224585,
"grad_norm": 0.27025382919889446,
"learning_rate": 4.884134242642985e-06,
"loss": 0.6033,
"step": 423
},
{
"epoch": 0.3717871430920151,
"grad_norm": 0.275669477293775,
"learning_rate": 4.883421334128674e-06,
"loss": 0.6172,
"step": 424
},
{
"epoch": 0.37266399956157176,
"grad_norm": 0.26014021950194516,
"learning_rate": 4.8827062914671775e-06,
"loss": 0.6207,
"step": 425
},
{
"epoch": 0.3735408560311284,
"grad_norm": 0.2986829920255015,
"learning_rate": 4.881989115298755e-06,
"loss": 0.6034,
"step": 426
},
{
"epoch": 0.37441771250068506,
"grad_norm": 0.28151692244357057,
"learning_rate": 4.881269806265575e-06,
"loss": 0.6133,
"step": 427
},
{
"epoch": 0.37529456897024166,
"grad_norm": 0.2932206682237993,
"learning_rate": 4.8805483650117154e-06,
"loss": 0.6132,
"step": 428
},
{
"epoch": 0.3761714254397983,
"grad_norm": 0.3164265338412961,
"learning_rate": 4.879824792183166e-06,
"loss": 0.6077,
"step": 429
},
{
"epoch": 0.37704828190935497,
"grad_norm": 0.3636164115457003,
"learning_rate": 4.879099088427824e-06,
"loss": 0.6179,
"step": 430
},
{
"epoch": 0.3779251383789116,
"grad_norm": 0.2891875334309757,
"learning_rate": 4.878371254395492e-06,
"loss": 0.6197,
"step": 431
},
{
"epoch": 0.3788019948484682,
"grad_norm": 0.3816104662619605,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.6197,
"step": 432
},
{
"epoch": 0.37967885131802487,
"grad_norm": 0.29131497715708005,
"learning_rate": 4.876909198108619e-06,
"loss": 0.6159,
"step": 433
},
{
"epoch": 0.3805557077875815,
"grad_norm": 0.3138520265609416,
"learning_rate": 4.876174977163222e-06,
"loss": 0.6139,
"step": 434
},
{
"epoch": 0.3814325642571382,
"grad_norm": 0.28035852092093033,
"learning_rate": 4.875438628559124e-06,
"loss": 0.6183,
"step": 435
},
{
"epoch": 0.3823094207266948,
"grad_norm": 0.3120106817898386,
"learning_rate": 4.874700152955661e-06,
"loss": 0.6052,
"step": 436
},
{
"epoch": 0.3831862771962514,
"grad_norm": 0.29139666929908226,
"learning_rate": 4.873959551014075e-06,
"loss": 0.6058,
"step": 437
},
{
"epoch": 0.3840631336658081,
"grad_norm": 0.31305383154436955,
"learning_rate": 4.873216823397511e-06,
"loss": 0.6094,
"step": 438
},
{
"epoch": 0.38493999013536473,
"grad_norm": 0.3052879988977325,
"learning_rate": 4.872471970771015e-06,
"loss": 0.6063,
"step": 439
},
{
"epoch": 0.3858168466049214,
"grad_norm": 0.2965934350138861,
"learning_rate": 4.871724993801541e-06,
"loss": 0.6054,
"step": 440
},
{
"epoch": 0.386693703074478,
"grad_norm": 0.26339362714008424,
"learning_rate": 4.870975893157941e-06,
"loss": 0.6152,
"step": 441
},
{
"epoch": 0.38757055954403463,
"grad_norm": 0.27556079714679943,
"learning_rate": 4.870224669510968e-06,
"loss": 0.6158,
"step": 442
},
{
"epoch": 0.3884474160135913,
"grad_norm": 0.29125701036171053,
"learning_rate": 4.86947132353328e-06,
"loss": 0.6202,
"step": 443
},
{
"epoch": 0.38932427248314794,
"grad_norm": 0.2966406156980298,
"learning_rate": 4.868715855899432e-06,
"loss": 0.6265,
"step": 444
},
{
"epoch": 0.39020112895270453,
"grad_norm": 0.27733217518457043,
"learning_rate": 4.867958267285879e-06,
"loss": 0.6068,
"step": 445
},
{
"epoch": 0.3910779854222612,
"grad_norm": 0.2919788828093281,
"learning_rate": 4.8671985583709765e-06,
"loss": 0.6208,
"step": 446
},
{
"epoch": 0.39195484189181784,
"grad_norm": 0.29327731039840055,
"learning_rate": 4.866436729834979e-06,
"loss": 0.6175,
"step": 447
},
{
"epoch": 0.3928316983613745,
"grad_norm": 0.2568832744529454,
"learning_rate": 4.865672782360037e-06,
"loss": 0.6177,
"step": 448
},
{
"epoch": 0.3937085548309311,
"grad_norm": 0.283654204460893,
"learning_rate": 4.8649067166301985e-06,
"loss": 0.6203,
"step": 449
},
{
"epoch": 0.39458541130048774,
"grad_norm": 0.26828805221375346,
"learning_rate": 4.864138533331411e-06,
"loss": 0.6118,
"step": 450
},
{
"epoch": 0.3954622677700444,
"grad_norm": 0.2597158618871073,
"learning_rate": 4.863368233151514e-06,
"loss": 0.6169,
"step": 451
},
{
"epoch": 0.39633912423960105,
"grad_norm": 0.28436035142498156,
"learning_rate": 4.862595816780246e-06,
"loss": 0.632,
"step": 452
},
{
"epoch": 0.39721598070915765,
"grad_norm": 0.2652505819829089,
"learning_rate": 4.861821284909238e-06,
"loss": 0.6289,
"step": 453
},
{
"epoch": 0.3980928371787143,
"grad_norm": 0.29252031992594624,
"learning_rate": 4.861044638232016e-06,
"loss": 0.6328,
"step": 454
},
{
"epoch": 0.39896969364827095,
"grad_norm": 0.2994469365008051,
"learning_rate": 4.860265877444001e-06,
"loss": 0.617,
"step": 455
},
{
"epoch": 0.3998465501178276,
"grad_norm": 0.2776900829822044,
"learning_rate": 4.8594850032425036e-06,
"loss": 0.608,
"step": 456
},
{
"epoch": 0.4007234065873842,
"grad_norm": 0.2753322141436327,
"learning_rate": 4.858702016326731e-06,
"loss": 0.607,
"step": 457
},
{
"epoch": 0.40160026305694085,
"grad_norm": 0.2738219915396828,
"learning_rate": 4.857916917397779e-06,
"loss": 0.6043,
"step": 458
},
{
"epoch": 0.4024771195264975,
"grad_norm": 0.27192665887665013,
"learning_rate": 4.857129707158637e-06,
"loss": 0.6376,
"step": 459
},
{
"epoch": 0.40335397599605416,
"grad_norm": 0.27689826150792163,
"learning_rate": 4.8563403863141825e-06,
"loss": 0.6172,
"step": 460
},
{
"epoch": 0.4042308324656108,
"grad_norm": 0.311644665297658,
"learning_rate": 4.855548955571183e-06,
"loss": 0.6106,
"step": 461
},
{
"epoch": 0.4051076889351674,
"grad_norm": 0.2912453467934098,
"learning_rate": 4.854755415638298e-06,
"loss": 0.6129,
"step": 462
},
{
"epoch": 0.40598454540472406,
"grad_norm": 0.302939167109194,
"learning_rate": 4.853959767226072e-06,
"loss": 0.6301,
"step": 463
},
{
"epoch": 0.4068614018742807,
"grad_norm": 0.261297831693092,
"learning_rate": 4.85316201104694e-06,
"loss": 0.6136,
"step": 464
},
{
"epoch": 0.40773825834383737,
"grad_norm": 0.3154856081824323,
"learning_rate": 4.852362147815225e-06,
"loss": 0.6171,
"step": 465
},
{
"epoch": 0.40861511481339396,
"grad_norm": 0.29411022742744497,
"learning_rate": 4.8515601782471325e-06,
"loss": 0.6085,
"step": 466
},
{
"epoch": 0.4094919712829506,
"grad_norm": 0.3027595832299397,
"learning_rate": 4.8507561030607576e-06,
"loss": 0.6151,
"step": 467
},
{
"epoch": 0.41036882775250727,
"grad_norm": 0.3003092813187261,
"learning_rate": 4.84994992297608e-06,
"loss": 0.6071,
"step": 468
},
{
"epoch": 0.4112456842220639,
"grad_norm": 0.27374249219050456,
"learning_rate": 4.849141638714965e-06,
"loss": 0.6166,
"step": 469
},
{
"epoch": 0.4121225406916205,
"grad_norm": 0.3064667255626573,
"learning_rate": 4.84833125100116e-06,
"loss": 0.6024,
"step": 470
},
{
"epoch": 0.4129993971611772,
"grad_norm": 0.28188617697439766,
"learning_rate": 4.847518760560297e-06,
"loss": 0.6134,
"step": 471
},
{
"epoch": 0.4138762536307338,
"grad_norm": 0.27693005272362925,
"learning_rate": 4.846704168119892e-06,
"loss": 0.5984,
"step": 472
},
{
"epoch": 0.4147531101002905,
"grad_norm": 0.3011450154809493,
"learning_rate": 4.84588747440934e-06,
"loss": 0.5932,
"step": 473
},
{
"epoch": 0.4156299665698471,
"grad_norm": 0.25715138595393167,
"learning_rate": 4.845068680159921e-06,
"loss": 0.6101,
"step": 474
},
{
"epoch": 0.41650682303940373,
"grad_norm": 0.2963493163477849,
"learning_rate": 4.844247786104794e-06,
"loss": 0.6081,
"step": 475
},
{
"epoch": 0.4173836795089604,
"grad_norm": 0.29399759702492007,
"learning_rate": 4.8434247929789975e-06,
"loss": 0.6046,
"step": 476
},
{
"epoch": 0.41826053597851703,
"grad_norm": 0.3126535237916745,
"learning_rate": 4.842599701519451e-06,
"loss": 0.6304,
"step": 477
},
{
"epoch": 0.41913739244807363,
"grad_norm": 0.29299694878032745,
"learning_rate": 4.841772512464953e-06,
"loss": 0.6168,
"step": 478
},
{
"epoch": 0.4200142489176303,
"grad_norm": 0.289486342187316,
"learning_rate": 4.840943226556178e-06,
"loss": 0.6031,
"step": 479
},
{
"epoch": 0.42089110538718694,
"grad_norm": 0.30359254383613277,
"learning_rate": 4.840111844535682e-06,
"loss": 0.5994,
"step": 480
},
{
"epoch": 0.4217679618567436,
"grad_norm": 0.2641793447534652,
"learning_rate": 4.839278367147894e-06,
"loss": 0.6036,
"step": 481
},
{
"epoch": 0.4226448183263002,
"grad_norm": 0.29968320834098117,
"learning_rate": 4.838442795139121e-06,
"loss": 0.6193,
"step": 482
},
{
"epoch": 0.42352167479585684,
"grad_norm": 0.30614554761610074,
"learning_rate": 4.837605129257546e-06,
"loss": 0.6115,
"step": 483
},
{
"epoch": 0.4243985312654135,
"grad_norm": 0.29316129861054724,
"learning_rate": 4.836765370253223e-06,
"loss": 0.6039,
"step": 484
},
{
"epoch": 0.42527538773497015,
"grad_norm": 0.35388210389950725,
"learning_rate": 4.835923518878088e-06,
"loss": 0.6089,
"step": 485
},
{
"epoch": 0.4261522442045268,
"grad_norm": 0.27541931694811506,
"learning_rate": 4.835079575885944e-06,
"loss": 0.6129,
"step": 486
},
{
"epoch": 0.4270291006740834,
"grad_norm": 0.3408256598988536,
"learning_rate": 4.834233542032468e-06,
"loss": 0.6165,
"step": 487
},
{
"epoch": 0.42790595714364005,
"grad_norm": 0.30259946435062773,
"learning_rate": 4.83338541807521e-06,
"loss": 0.6111,
"step": 488
},
{
"epoch": 0.4287828136131967,
"grad_norm": 0.2871132966743198,
"learning_rate": 4.832535204773593e-06,
"loss": 0.6273,
"step": 489
},
{
"epoch": 0.42965967008275335,
"grad_norm": 0.3457337315321895,
"learning_rate": 4.8316829028889076e-06,
"loss": 0.6005,
"step": 490
},
{
"epoch": 0.43053652655230995,
"grad_norm": 0.2668696078107318,
"learning_rate": 4.830828513184317e-06,
"loss": 0.6122,
"step": 491
},
{
"epoch": 0.4314133830218666,
"grad_norm": 0.321068645111551,
"learning_rate": 4.829972036424854e-06,
"loss": 0.6058,
"step": 492
},
{
"epoch": 0.43229023949142326,
"grad_norm": 0.26125737492647644,
"learning_rate": 4.829113473377417e-06,
"loss": 0.6143,
"step": 493
},
{
"epoch": 0.4331670959609799,
"grad_norm": 0.32002755047063874,
"learning_rate": 4.828252824810777e-06,
"loss": 0.6061,
"step": 494
},
{
"epoch": 0.4340439524305365,
"grad_norm": 0.2863878470189295,
"learning_rate": 4.82739009149557e-06,
"loss": 0.5977,
"step": 495
},
{
"epoch": 0.43492080890009316,
"grad_norm": 0.31874371835878795,
"learning_rate": 4.826525274204297e-06,
"loss": 0.608,
"step": 496
},
{
"epoch": 0.4357976653696498,
"grad_norm": 0.2956391151217163,
"learning_rate": 4.825658373711328e-06,
"loss": 0.6107,
"step": 497
},
{
"epoch": 0.43667452183920646,
"grad_norm": 0.288406786632812,
"learning_rate": 4.824789390792899e-06,
"loss": 0.6094,
"step": 498
},
{
"epoch": 0.43755137830876306,
"grad_norm": 0.33737182032602686,
"learning_rate": 4.823918326227106e-06,
"loss": 0.5971,
"step": 499
},
{
"epoch": 0.4384282347783197,
"grad_norm": 0.25632117321609454,
"learning_rate": 4.823045180793914e-06,
"loss": 0.6044,
"step": 500
},
{
"epoch": 0.43930509124787637,
"grad_norm": 0.2978956835348055,
"learning_rate": 4.8221699552751465e-06,
"loss": 0.6009,
"step": 501
},
{
"epoch": 0.440181947717433,
"grad_norm": 0.30339339194561,
"learning_rate": 4.821292650454495e-06,
"loss": 0.6113,
"step": 502
},
{
"epoch": 0.4410588041869896,
"grad_norm": 0.3083549716587437,
"learning_rate": 4.8204132671175085e-06,
"loss": 0.6074,
"step": 503
},
{
"epoch": 0.44193566065654627,
"grad_norm": 0.291272682255802,
"learning_rate": 4.819531806051599e-06,
"loss": 0.606,
"step": 504
},
{
"epoch": 0.4428125171261029,
"grad_norm": 0.3183233272727026,
"learning_rate": 4.818648268046038e-06,
"loss": 0.6145,
"step": 505
},
{
"epoch": 0.4436893735956596,
"grad_norm": 0.27989457450916727,
"learning_rate": 4.817762653891957e-06,
"loss": 0.6095,
"step": 506
},
{
"epoch": 0.4445662300652162,
"grad_norm": 0.32106502207942483,
"learning_rate": 4.816874964382346e-06,
"loss": 0.6096,
"step": 507
},
{
"epoch": 0.4454430865347728,
"grad_norm": 0.2690675603747584,
"learning_rate": 4.815985200312057e-06,
"loss": 0.5986,
"step": 508
},
{
"epoch": 0.4463199430043295,
"grad_norm": 0.2818980909126885,
"learning_rate": 4.815093362477793e-06,
"loss": 0.6136,
"step": 509
},
{
"epoch": 0.44719679947388613,
"grad_norm": 0.29748447845455983,
"learning_rate": 4.8141994516781196e-06,
"loss": 0.6162,
"step": 510
},
{
"epoch": 0.4480736559434428,
"grad_norm": 0.3107094817046459,
"learning_rate": 4.813303468713456e-06,
"loss": 0.5939,
"step": 511
},
{
"epoch": 0.4489505124129994,
"grad_norm": 0.27493905192543294,
"learning_rate": 4.812405414386078e-06,
"loss": 0.6054,
"step": 512
},
{
"epoch": 0.44982736888255603,
"grad_norm": 0.28885594119974684,
"learning_rate": 4.811505289500113e-06,
"loss": 0.611,
"step": 513
},
{
"epoch": 0.4507042253521127,
"grad_norm": 0.2724458036095346,
"learning_rate": 4.810603094861548e-06,
"loss": 0.6296,
"step": 514
},
{
"epoch": 0.45158108182166934,
"grad_norm": 0.3171235548951884,
"learning_rate": 4.809698831278217e-06,
"loss": 0.6137,
"step": 515
},
{
"epoch": 0.45245793829122594,
"grad_norm": 0.2975607228468226,
"learning_rate": 4.808792499559812e-06,
"loss": 0.6081,
"step": 516
},
{
"epoch": 0.4533347947607826,
"grad_norm": 0.29553804453973653,
"learning_rate": 4.807884100517873e-06,
"loss": 0.6106,
"step": 517
},
{
"epoch": 0.45421165123033924,
"grad_norm": 0.29283068458115197,
"learning_rate": 4.8069736349657935e-06,
"loss": 0.6144,
"step": 518
},
{
"epoch": 0.4550885076998959,
"grad_norm": 0.3123674697628625,
"learning_rate": 4.806061103718816e-06,
"loss": 0.6024,
"step": 519
},
{
"epoch": 0.4559653641694525,
"grad_norm": 0.3185535504257689,
"learning_rate": 4.805146507594034e-06,
"loss": 0.6031,
"step": 520
},
{
"epoch": 0.45684222063900914,
"grad_norm": 0.32719458735857726,
"learning_rate": 4.804229847410388e-06,
"loss": 0.614,
"step": 521
},
{
"epoch": 0.4577190771085658,
"grad_norm": 0.2756686412179773,
"learning_rate": 4.803311123988668e-06,
"loss": 0.6143,
"step": 522
},
{
"epoch": 0.45859593357812245,
"grad_norm": 0.3193363571929515,
"learning_rate": 4.802390338151512e-06,
"loss": 0.5962,
"step": 523
},
{
"epoch": 0.45947279004767905,
"grad_norm": 0.27470129307670516,
"learning_rate": 4.801467490723402e-06,
"loss": 0.6118,
"step": 524
},
{
"epoch": 0.4603496465172357,
"grad_norm": 0.3268257836594815,
"learning_rate": 4.800542582530668e-06,
"loss": 0.6091,
"step": 525
},
{
"epoch": 0.46122650298679235,
"grad_norm": 0.2636715015821582,
"learning_rate": 4.799615614401488e-06,
"loss": 0.6113,
"step": 526
},
{
"epoch": 0.462103359456349,
"grad_norm": 0.3309929173426789,
"learning_rate": 4.79868658716588e-06,
"loss": 0.6063,
"step": 527
},
{
"epoch": 0.4629802159259056,
"grad_norm": 0.2705433155095911,
"learning_rate": 4.7977555016557054e-06,
"loss": 0.6115,
"step": 528
},
{
"epoch": 0.46385707239546226,
"grad_norm": 0.2986983107432822,
"learning_rate": 4.796822358704673e-06,
"loss": 0.624,
"step": 529
},
{
"epoch": 0.4647339288650189,
"grad_norm": 0.27153673858142124,
"learning_rate": 4.7958871591483305e-06,
"loss": 0.6144,
"step": 530
},
{
"epoch": 0.46561078533457556,
"grad_norm": 0.2774095045069063,
"learning_rate": 4.794949903824069e-06,
"loss": 0.6082,
"step": 531
},
{
"epoch": 0.4664876418041322,
"grad_norm": 0.28167525290961587,
"learning_rate": 4.794010593571118e-06,
"loss": 0.6106,
"step": 532
},
{
"epoch": 0.4673644982736888,
"grad_norm": 0.2626835693504621,
"learning_rate": 4.793069229230548e-06,
"loss": 0.6142,
"step": 533
},
{
"epoch": 0.46824135474324546,
"grad_norm": 0.27619948959341917,
"learning_rate": 4.792125811645271e-06,
"loss": 0.6073,
"step": 534
},
{
"epoch": 0.4691182112128021,
"grad_norm": 0.2913249262978291,
"learning_rate": 4.791180341660035e-06,
"loss": 0.6034,
"step": 535
},
{
"epoch": 0.46999506768235877,
"grad_norm": 0.2792318560656134,
"learning_rate": 4.790232820121426e-06,
"loss": 0.6002,
"step": 536
},
{
"epoch": 0.47087192415191537,
"grad_norm": 0.2690237732263836,
"learning_rate": 4.789283247877867e-06,
"loss": 0.6128,
"step": 537
},
{
"epoch": 0.471748780621472,
"grad_norm": 0.2875784864108413,
"learning_rate": 4.7883316257796195e-06,
"loss": 0.6125,
"step": 538
},
{
"epoch": 0.47262563709102867,
"grad_norm": 0.3494280106540881,
"learning_rate": 4.787377954678776e-06,
"loss": 0.6079,
"step": 539
},
{
"epoch": 0.4735024935605853,
"grad_norm": 0.27811345732659243,
"learning_rate": 4.786422235429269e-06,
"loss": 0.6118,
"step": 540
},
{
"epoch": 0.4743793500301419,
"grad_norm": 0.33921109846320074,
"learning_rate": 4.785464468886859e-06,
"loss": 0.6176,
"step": 541
},
{
"epoch": 0.4752562064996986,
"grad_norm": 0.29592545517880114,
"learning_rate": 4.784504655909146e-06,
"loss": 0.6131,
"step": 542
},
{
"epoch": 0.4761330629692552,
"grad_norm": 0.29373530511374163,
"learning_rate": 4.783542797355558e-06,
"loss": 0.6082,
"step": 543
},
{
"epoch": 0.4770099194388119,
"grad_norm": 0.2999691792256973,
"learning_rate": 4.782578894087357e-06,
"loss": 0.5981,
"step": 544
},
{
"epoch": 0.4778867759083685,
"grad_norm": 0.2694268894908227,
"learning_rate": 4.781612946967632e-06,
"loss": 0.6055,
"step": 545
},
{
"epoch": 0.47876363237792513,
"grad_norm": 0.2970836241532985,
"learning_rate": 4.780644956861307e-06,
"loss": 0.6002,
"step": 546
},
{
"epoch": 0.4796404888474818,
"grad_norm": 0.3413332201519291,
"learning_rate": 4.7796749246351335e-06,
"loss": 0.6103,
"step": 547
},
{
"epoch": 0.48051734531703844,
"grad_norm": 0.27732196553749033,
"learning_rate": 4.77870285115769e-06,
"loss": 0.5972,
"step": 548
},
{
"epoch": 0.48139420178659503,
"grad_norm": 0.32594912225980904,
"learning_rate": 4.777728737299387e-06,
"loss": 0.6275,
"step": 549
},
{
"epoch": 0.4822710582561517,
"grad_norm": 0.28158230943213153,
"learning_rate": 4.776752583932455e-06,
"loss": 0.6215,
"step": 550
},
{
"epoch": 0.48314791472570834,
"grad_norm": 0.3244722564822324,
"learning_rate": 4.775774391930956e-06,
"loss": 0.5947,
"step": 551
},
{
"epoch": 0.484024771195265,
"grad_norm": 0.26397208532030864,
"learning_rate": 4.774794162170777e-06,
"loss": 0.611,
"step": 552
},
{
"epoch": 0.4849016276648216,
"grad_norm": 0.2816890422555255,
"learning_rate": 4.773811895529629e-06,
"loss": 0.5942,
"step": 553
},
{
"epoch": 0.48577848413437824,
"grad_norm": 0.28224512879430635,
"learning_rate": 4.772827592887046e-06,
"loss": 0.5918,
"step": 554
},
{
"epoch": 0.4866553406039349,
"grad_norm": 0.2978578883597439,
"learning_rate": 4.771841255124385e-06,
"loss": 0.6031,
"step": 555
},
{
"epoch": 0.48753219707349155,
"grad_norm": 0.3212067488646109,
"learning_rate": 4.770852883124827e-06,
"loss": 0.6066,
"step": 556
},
{
"epoch": 0.4884090535430482,
"grad_norm": 0.3047898856904216,
"learning_rate": 4.769862477773374e-06,
"loss": 0.6097,
"step": 557
},
{
"epoch": 0.4892859100126048,
"grad_norm": 0.32816575436148626,
"learning_rate": 4.768870039956846e-06,
"loss": 0.6078,
"step": 558
},
{
"epoch": 0.49016276648216145,
"grad_norm": 0.30333447423661625,
"learning_rate": 4.767875570563887e-06,
"loss": 0.6103,
"step": 559
},
{
"epoch": 0.4910396229517181,
"grad_norm": 0.32463487013229164,
"learning_rate": 4.766879070484957e-06,
"loss": 0.5925,
"step": 560
},
{
"epoch": 0.49191647942127475,
"grad_norm": 0.27125555349656966,
"learning_rate": 4.765880540612336e-06,
"loss": 0.6095,
"step": 561
},
{
"epoch": 0.49279333589083135,
"grad_norm": 0.29571340419933284,
"learning_rate": 4.764879981840121e-06,
"loss": 0.6061,
"step": 562
},
{
"epoch": 0.493670192360388,
"grad_norm": 0.28779220439984465,
"learning_rate": 4.763877395064225e-06,
"loss": 0.6164,
"step": 563
},
{
"epoch": 0.49454704882994466,
"grad_norm": 0.3023002461106019,
"learning_rate": 4.762872781182378e-06,
"loss": 0.6099,
"step": 564
},
{
"epoch": 0.4954239052995013,
"grad_norm": 0.2852998688047179,
"learning_rate": 4.761866141094126e-06,
"loss": 0.6151,
"step": 565
},
{
"epoch": 0.4963007617690579,
"grad_norm": 0.27004415072990756,
"learning_rate": 4.7608574757008245e-06,
"loss": 0.6056,
"step": 566
},
{
"epoch": 0.49717761823861456,
"grad_norm": 0.26583697629837466,
"learning_rate": 4.759846785905649e-06,
"loss": 0.6073,
"step": 567
},
{
"epoch": 0.4980544747081712,
"grad_norm": 0.29963137609858226,
"learning_rate": 4.758834072613583e-06,
"loss": 0.6175,
"step": 568
},
{
"epoch": 0.49893133117772787,
"grad_norm": 0.2777428291092147,
"learning_rate": 4.757819336731424e-06,
"loss": 0.6084,
"step": 569
},
{
"epoch": 0.49980818764728446,
"grad_norm": 0.286537576055084,
"learning_rate": 4.756802579167781e-06,
"loss": 0.6122,
"step": 570
},
{
"epoch": 0.5006850441168411,
"grad_norm": 0.2900434750609322,
"learning_rate": 4.755783800833071e-06,
"loss": 0.61,
"step": 571
},
{
"epoch": 0.5015619005863977,
"grad_norm": 0.29602981997833644,
"learning_rate": 4.754763002639522e-06,
"loss": 0.5979,
"step": 572
},
{
"epoch": 0.5024387570559544,
"grad_norm": 0.2850500950921633,
"learning_rate": 4.75374018550117e-06,
"loss": 0.616,
"step": 573
},
{
"epoch": 0.503315613525511,
"grad_norm": 0.2747595431255721,
"learning_rate": 4.752715350333858e-06,
"loss": 0.6082,
"step": 574
},
{
"epoch": 0.5041924699950677,
"grad_norm": 0.30963433949041175,
"learning_rate": 4.75168849805524e-06,
"loss": 0.6062,
"step": 575
},
{
"epoch": 0.5050693264646243,
"grad_norm": 0.28817154630491854,
"learning_rate": 4.750659629584772e-06,
"loss": 0.615,
"step": 576
},
{
"epoch": 0.5059461829341809,
"grad_norm": 0.29777143797501865,
"learning_rate": 4.749628745843715e-06,
"loss": 0.6093,
"step": 577
},
{
"epoch": 0.5068230394037376,
"grad_norm": 0.2761328411528336,
"learning_rate": 4.748595847755137e-06,
"loss": 0.5949,
"step": 578
},
{
"epoch": 0.5076998958732942,
"grad_norm": 0.27941749417554973,
"learning_rate": 4.74756093624391e-06,
"loss": 0.6165,
"step": 579
},
{
"epoch": 0.5085767523428508,
"grad_norm": 0.28883681834919644,
"learning_rate": 4.746524012236706e-06,
"loss": 0.6012,
"step": 580
},
{
"epoch": 0.5094536088124075,
"grad_norm": 0.2712633209555587,
"learning_rate": 4.7454850766620005e-06,
"loss": 0.5898,
"step": 581
},
{
"epoch": 0.5103304652819641,
"grad_norm": 0.29386364789948854,
"learning_rate": 4.7444441304500714e-06,
"loss": 0.6057,
"step": 582
},
{
"epoch": 0.5112073217515208,
"grad_norm": 0.27998562308750735,
"learning_rate": 4.743401174532994e-06,
"loss": 0.597,
"step": 583
},
{
"epoch": 0.5120841782210774,
"grad_norm": 0.2944531079667381,
"learning_rate": 4.742356209844646e-06,
"loss": 0.5915,
"step": 584
},
{
"epoch": 0.512961034690634,
"grad_norm": 0.29506045387008756,
"learning_rate": 4.741309237320703e-06,
"loss": 0.6178,
"step": 585
},
{
"epoch": 0.5138378911601907,
"grad_norm": 0.299236621784075,
"learning_rate": 4.740260257898638e-06,
"loss": 0.6121,
"step": 586
},
{
"epoch": 0.5147147476297473,
"grad_norm": 0.303688650889379,
"learning_rate": 4.739209272517721e-06,
"loss": 0.5982,
"step": 587
},
{
"epoch": 0.515591604099304,
"grad_norm": 0.2925779066404172,
"learning_rate": 4.738156282119018e-06,
"loss": 0.5936,
"step": 588
},
{
"epoch": 0.5164684605688606,
"grad_norm": 0.3374725318718031,
"learning_rate": 4.73710128764539e-06,
"loss": 0.6001,
"step": 589
},
{
"epoch": 0.5173453170384172,
"grad_norm": 0.28811046561615106,
"learning_rate": 4.736044290041496e-06,
"loss": 0.61,
"step": 590
},
{
"epoch": 0.518222173507974,
"grad_norm": 0.32139851009391945,
"learning_rate": 4.7349852902537814e-06,
"loss": 0.5931,
"step": 591
},
{
"epoch": 0.5190990299775305,
"grad_norm": 0.27307295767087736,
"learning_rate": 4.733924289230493e-06,
"loss": 0.6035,
"step": 592
},
{
"epoch": 0.5199758864470871,
"grad_norm": 0.3098223534082736,
"learning_rate": 4.7328612879216615e-06,
"loss": 0.6082,
"step": 593
},
{
"epoch": 0.5208527429166439,
"grad_norm": 0.2808341207944162,
"learning_rate": 4.731796287279115e-06,
"loss": 0.5965,
"step": 594
},
{
"epoch": 0.5217295993862004,
"grad_norm": 0.3093125993326785,
"learning_rate": 4.730729288256468e-06,
"loss": 0.6018,
"step": 595
},
{
"epoch": 0.5226064558557572,
"grad_norm": 0.30147164249765196,
"learning_rate": 4.729660291809126e-06,
"loss": 0.6072,
"step": 596
},
{
"epoch": 0.5234833123253138,
"grad_norm": 0.2893545075475105,
"learning_rate": 4.728589298894284e-06,
"loss": 0.5894,
"step": 597
},
{
"epoch": 0.5243601687948704,
"grad_norm": 0.29778530349250987,
"learning_rate": 4.72751631047092e-06,
"loss": 0.5941,
"step": 598
},
{
"epoch": 0.5252370252644271,
"grad_norm": 0.2822751104373634,
"learning_rate": 4.726441327499805e-06,
"loss": 0.6056,
"step": 599
},
{
"epoch": 0.5261138817339837,
"grad_norm": 0.30381920940202223,
"learning_rate": 4.725364350943492e-06,
"loss": 0.6016,
"step": 600
},
{
"epoch": 0.5269907382035403,
"grad_norm": 0.2728312952142679,
"learning_rate": 4.72428538176632e-06,
"loss": 0.6033,
"step": 601
},
{
"epoch": 0.527867594673097,
"grad_norm": 0.2920360605636878,
"learning_rate": 4.723204420934413e-06,
"loss": 0.614,
"step": 602
},
{
"epoch": 0.5287444511426536,
"grad_norm": 0.282387818364113,
"learning_rate": 4.722121469415677e-06,
"loss": 0.5901,
"step": 603
},
{
"epoch": 0.5296213076122103,
"grad_norm": 0.2954181717364726,
"learning_rate": 4.721036528179802e-06,
"loss": 0.6043,
"step": 604
},
{
"epoch": 0.5304981640817669,
"grad_norm": 0.3084979402180987,
"learning_rate": 4.719949598198258e-06,
"loss": 0.5931,
"step": 605
},
{
"epoch": 0.5313750205513235,
"grad_norm": 0.3252699365181927,
"learning_rate": 4.718860680444297e-06,
"loss": 0.6181,
"step": 606
},
{
"epoch": 0.5322518770208802,
"grad_norm": 0.28357295095306256,
"learning_rate": 4.717769775892951e-06,
"loss": 0.5903,
"step": 607
},
{
"epoch": 0.5331287334904368,
"grad_norm": 0.3569079908279582,
"learning_rate": 4.7166768855210294e-06,
"loss": 0.5939,
"step": 608
},
{
"epoch": 0.5340055899599935,
"grad_norm": 0.31741200071485426,
"learning_rate": 4.715582010307121e-06,
"loss": 0.5897,
"step": 609
},
{
"epoch": 0.5348824464295501,
"grad_norm": 0.3218789245412814,
"learning_rate": 4.714485151231593e-06,
"loss": 0.5926,
"step": 610
},
{
"epoch": 0.5357593028991067,
"grad_norm": 0.2824610260583936,
"learning_rate": 4.713386309276585e-06,
"loss": 0.6039,
"step": 611
},
{
"epoch": 0.5366361593686634,
"grad_norm": 0.3111981063952015,
"learning_rate": 4.712285485426017e-06,
"loss": 0.6012,
"step": 612
},
{
"epoch": 0.53751301583822,
"grad_norm": 0.2719370118974663,
"learning_rate": 4.7111826806655804e-06,
"loss": 0.5912,
"step": 613
},
{
"epoch": 0.5383898723077766,
"grad_norm": 0.3161533458613161,
"learning_rate": 4.710077895982741e-06,
"loss": 0.5962,
"step": 614
},
{
"epoch": 0.5392667287773333,
"grad_norm": 0.26701338476822095,
"learning_rate": 4.708971132366739e-06,
"loss": 0.6025,
"step": 615
},
{
"epoch": 0.5401435852468899,
"grad_norm": 0.28447205168753736,
"learning_rate": 4.707862390808583e-06,
"loss": 0.5959,
"step": 616
},
{
"epoch": 0.5410204417164466,
"grad_norm": 0.26585350433139904,
"learning_rate": 4.706751672301058e-06,
"loss": 0.5946,
"step": 617
},
{
"epoch": 0.5418972981860032,
"grad_norm": 0.28276117956241253,
"learning_rate": 4.705638977838712e-06,
"loss": 0.5986,
"step": 618
},
{
"epoch": 0.5427741546555598,
"grad_norm": 0.2752743049051474,
"learning_rate": 4.704524308417872e-06,
"loss": 0.6044,
"step": 619
},
{
"epoch": 0.5436510111251165,
"grad_norm": 0.2744635750786116,
"learning_rate": 4.703407665036622e-06,
"loss": 0.6,
"step": 620
},
{
"epoch": 0.5445278675946731,
"grad_norm": 0.2942835089324837,
"learning_rate": 4.702289048694824e-06,
"loss": 0.6163,
"step": 621
},
{
"epoch": 0.5454047240642297,
"grad_norm": 0.29074004193212294,
"learning_rate": 4.7011684603940985e-06,
"loss": 0.61,
"step": 622
},
{
"epoch": 0.5462815805337864,
"grad_norm": 0.265548853050648,
"learning_rate": 4.700045901137838e-06,
"loss": 0.6003,
"step": 623
},
{
"epoch": 0.547158437003343,
"grad_norm": 0.28147341099339,
"learning_rate": 4.6989213719311956e-06,
"loss": 0.6057,
"step": 624
},
{
"epoch": 0.5480352934728997,
"grad_norm": 0.25061686481638634,
"learning_rate": 4.697794873781089e-06,
"loss": 0.6103,
"step": 625
},
{
"epoch": 0.5489121499424563,
"grad_norm": 0.28270079603778164,
"learning_rate": 4.696666407696201e-06,
"loss": 0.5999,
"step": 626
},
{
"epoch": 0.5497890064120129,
"grad_norm": 0.25832596909684546,
"learning_rate": 4.695535974686975e-06,
"loss": 0.5989,
"step": 627
},
{
"epoch": 0.5506658628815696,
"grad_norm": 0.28610489660664173,
"learning_rate": 4.694403575765615e-06,
"loss": 0.6039,
"step": 628
},
{
"epoch": 0.5515427193511262,
"grad_norm": 0.26039812165621273,
"learning_rate": 4.693269211946086e-06,
"loss": 0.5999,
"step": 629
},
{
"epoch": 0.5524195758206829,
"grad_norm": 0.2802813802636672,
"learning_rate": 4.692132884244113e-06,
"loss": 0.5957,
"step": 630
},
{
"epoch": 0.5532964322902395,
"grad_norm": 0.28045233973715045,
"learning_rate": 4.69099459367718e-06,
"loss": 0.6057,
"step": 631
},
{
"epoch": 0.5541732887597961,
"grad_norm": 0.2850165288729873,
"learning_rate": 4.689854341264525e-06,
"loss": 0.6062,
"step": 632
},
{
"epoch": 0.5550501452293528,
"grad_norm": 0.318532937146288,
"learning_rate": 4.688712128027147e-06,
"loss": 0.615,
"step": 633
},
{
"epoch": 0.5559270016989094,
"grad_norm": 0.2700297126701359,
"learning_rate": 4.687567954987798e-06,
"loss": 0.6027,
"step": 634
},
{
"epoch": 0.556803858168466,
"grad_norm": 0.2709567537114069,
"learning_rate": 4.686421823170987e-06,
"loss": 0.606,
"step": 635
},
{
"epoch": 0.5576807146380227,
"grad_norm": 0.30943308206128534,
"learning_rate": 4.685273733602975e-06,
"loss": 0.6122,
"step": 636
},
{
"epoch": 0.5585575711075793,
"grad_norm": 0.2866407684585244,
"learning_rate": 4.6841236873117765e-06,
"loss": 0.5983,
"step": 637
},
{
"epoch": 0.559434427577136,
"grad_norm": 0.30074858616349,
"learning_rate": 4.6829716853271576e-06,
"loss": 0.6112,
"step": 638
},
{
"epoch": 0.5603112840466926,
"grad_norm": 0.27481764632891953,
"learning_rate": 4.681817728680638e-06,
"loss": 0.5923,
"step": 639
},
{
"epoch": 0.5611881405162492,
"grad_norm": 0.30985792219487485,
"learning_rate": 4.680661818405485e-06,
"loss": 0.6083,
"step": 640
},
{
"epoch": 0.5620649969858059,
"grad_norm": 0.30548099410676144,
"learning_rate": 4.679503955536715e-06,
"loss": 0.6105,
"step": 641
},
{
"epoch": 0.5629418534553625,
"grad_norm": 0.27736446160459594,
"learning_rate": 4.678344141111096e-06,
"loss": 0.6176,
"step": 642
},
{
"epoch": 0.5638187099249191,
"grad_norm": 0.313370779146898,
"learning_rate": 4.6771823761671386e-06,
"loss": 0.6035,
"step": 643
},
{
"epoch": 0.5646955663944758,
"grad_norm": 0.27389315771120454,
"learning_rate": 4.676018661745104e-06,
"loss": 0.6118,
"step": 644
},
{
"epoch": 0.5655724228640324,
"grad_norm": 0.3272671136560007,
"learning_rate": 4.674852998886998e-06,
"loss": 0.6059,
"step": 645
},
{
"epoch": 0.5664492793335891,
"grad_norm": 0.29110434636858074,
"learning_rate": 4.6736853886365704e-06,
"loss": 0.5957,
"step": 646
},
{
"epoch": 0.5673261358031457,
"grad_norm": 0.27566640053494834,
"learning_rate": 4.672515832039315e-06,
"loss": 0.5847,
"step": 647
},
{
"epoch": 0.5682029922727023,
"grad_norm": 0.3439499837560115,
"learning_rate": 4.671344330142468e-06,
"loss": 0.6066,
"step": 648
},
{
"epoch": 0.569079848742259,
"grad_norm": 0.2831795036732806,
"learning_rate": 4.670170883995007e-06,
"loss": 0.5875,
"step": 649
},
{
"epoch": 0.5699567052118156,
"grad_norm": 0.3084275937304928,
"learning_rate": 4.668995494647653e-06,
"loss": 0.6046,
"step": 650
},
{
"epoch": 0.5708335616813722,
"grad_norm": 0.2876312566066635,
"learning_rate": 4.667818163152864e-06,
"loss": 0.609,
"step": 651
},
{
"epoch": 0.5717104181509289,
"grad_norm": 0.27641311480374825,
"learning_rate": 4.6666388905648394e-06,
"loss": 0.6084,
"step": 652
},
{
"epoch": 0.5725872746204855,
"grad_norm": 0.2760161681243495,
"learning_rate": 4.665457677939515e-06,
"loss": 0.6036,
"step": 653
},
{
"epoch": 0.5734641310900422,
"grad_norm": 0.2664014070652965,
"learning_rate": 4.664274526334563e-06,
"loss": 0.6047,
"step": 654
},
{
"epoch": 0.5743409875595988,
"grad_norm": 0.27367722811571643,
"learning_rate": 4.663089436809395e-06,
"loss": 0.607,
"step": 655
},
{
"epoch": 0.5752178440291554,
"grad_norm": 0.2971494077897638,
"learning_rate": 4.661902410425156e-06,
"loss": 0.5851,
"step": 656
},
{
"epoch": 0.5760947004987121,
"grad_norm": 0.28359506675344376,
"learning_rate": 4.660713448244723e-06,
"loss": 0.5911,
"step": 657
},
{
"epoch": 0.5769715569682687,
"grad_norm": 0.27646693971859265,
"learning_rate": 4.6595225513327105e-06,
"loss": 0.601,
"step": 658
},
{
"epoch": 0.5778484134378254,
"grad_norm": 0.2707379861432875,
"learning_rate": 4.658329720755464e-06,
"loss": 0.5905,
"step": 659
},
{
"epoch": 0.578725269907382,
"grad_norm": 0.301271851117793,
"learning_rate": 4.657134957581057e-06,
"loss": 0.6023,
"step": 660
},
{
"epoch": 0.5796021263769386,
"grad_norm": 0.30214846729641187,
"learning_rate": 4.6559382628793e-06,
"loss": 0.6095,
"step": 661
},
{
"epoch": 0.5804789828464954,
"grad_norm": 0.2880769859831512,
"learning_rate": 4.6547396377217265e-06,
"loss": 0.6012,
"step": 662
},
{
"epoch": 0.581355839316052,
"grad_norm": 0.3363251460755209,
"learning_rate": 4.653539083181603e-06,
"loss": 0.5963,
"step": 663
},
{
"epoch": 0.5822326957856085,
"grad_norm": 0.3446871487238731,
"learning_rate": 4.652336600333921e-06,
"loss": 0.5992,
"step": 664
},
{
"epoch": 0.5831095522551653,
"grad_norm": 0.3016824402176579,
"learning_rate": 4.651132190255401e-06,
"loss": 0.6016,
"step": 665
},
{
"epoch": 0.5839864087247219,
"grad_norm": 0.31791554379394255,
"learning_rate": 4.649925854024486e-06,
"loss": 0.5943,
"step": 666
},
{
"epoch": 0.5848632651942786,
"grad_norm": 0.3603510668723624,
"learning_rate": 4.648717592721347e-06,
"loss": 0.6086,
"step": 667
},
{
"epoch": 0.5857401216638352,
"grad_norm": 0.25073578292290827,
"learning_rate": 4.647507407427877e-06,
"loss": 0.5965,
"step": 668
},
{
"epoch": 0.5866169781333918,
"grad_norm": 0.3401292596267892,
"learning_rate": 4.646295299227691e-06,
"loss": 0.5896,
"step": 669
},
{
"epoch": 0.5874938346029485,
"grad_norm": 0.26798950974238206,
"learning_rate": 4.645081269206128e-06,
"loss": 0.5913,
"step": 670
},
{
"epoch": 0.5883706910725051,
"grad_norm": 0.2712753517614824,
"learning_rate": 4.643865318450247e-06,
"loss": 0.5948,
"step": 671
},
{
"epoch": 0.5892475475420617,
"grad_norm": 0.31478669896326056,
"learning_rate": 4.642647448048824e-06,
"loss": 0.6036,
"step": 672
},
{
"epoch": 0.5901244040116184,
"grad_norm": 0.2853149586152437,
"learning_rate": 4.641427659092359e-06,
"loss": 0.5852,
"step": 673
},
{
"epoch": 0.591001260481175,
"grad_norm": 0.31928733056145026,
"learning_rate": 4.6402059526730656e-06,
"loss": 0.596,
"step": 674
},
{
"epoch": 0.5918781169507317,
"grad_norm": 0.28886504451895006,
"learning_rate": 4.638982329884878e-06,
"loss": 0.5867,
"step": 675
},
{
"epoch": 0.5927549734202883,
"grad_norm": 0.34332786639440344,
"learning_rate": 4.637756791823443e-06,
"loss": 0.5951,
"step": 676
},
{
"epoch": 0.5936318298898449,
"grad_norm": 0.31536294202913445,
"learning_rate": 4.6365293395861225e-06,
"loss": 0.6005,
"step": 677
},
{
"epoch": 0.5945086863594016,
"grad_norm": 0.36612645695214535,
"learning_rate": 4.6352999742719954e-06,
"loss": 0.6125,
"step": 678
},
{
"epoch": 0.5953855428289582,
"grad_norm": 0.2865910172606529,
"learning_rate": 4.634068696981852e-06,
"loss": 0.6096,
"step": 679
},
{
"epoch": 0.5962623992985149,
"grad_norm": 0.3077121674916666,
"learning_rate": 4.632835508818192e-06,
"loss": 0.5891,
"step": 680
},
{
"epoch": 0.5971392557680715,
"grad_norm": 0.2930520316480949,
"learning_rate": 4.631600410885231e-06,
"loss": 0.5918,
"step": 681
},
{
"epoch": 0.5980161122376281,
"grad_norm": 0.3412197822800723,
"learning_rate": 4.630363404288891e-06,
"loss": 0.5998,
"step": 682
},
{
"epoch": 0.5988929687071848,
"grad_norm": 0.2869686807201651,
"learning_rate": 4.629124490136804e-06,
"loss": 0.5952,
"step": 683
},
{
"epoch": 0.5997698251767414,
"grad_norm": 0.3044523168792968,
"learning_rate": 4.627883669538311e-06,
"loss": 0.6058,
"step": 684
},
{
"epoch": 0.600646681646298,
"grad_norm": 0.298754941767322,
"learning_rate": 4.626640943604459e-06,
"loss": 0.6099,
"step": 685
},
{
"epoch": 0.6015235381158547,
"grad_norm": 0.30823608651620477,
"learning_rate": 4.625396313448e-06,
"loss": 0.5913,
"step": 686
},
{
"epoch": 0.6024003945854113,
"grad_norm": 0.2745802532714142,
"learning_rate": 4.624149780183395e-06,
"loss": 0.5904,
"step": 687
},
{
"epoch": 0.603277251054968,
"grad_norm": 0.2894557068485525,
"learning_rate": 4.622901344926805e-06,
"loss": 0.6006,
"step": 688
},
{
"epoch": 0.6041541075245246,
"grad_norm": 0.2844643276622375,
"learning_rate": 4.621651008796095e-06,
"loss": 0.5972,
"step": 689
},
{
"epoch": 0.6050309639940812,
"grad_norm": 0.3111750841694179,
"learning_rate": 4.620398772910833e-06,
"loss": 0.5911,
"step": 690
},
{
"epoch": 0.6059078204636379,
"grad_norm": 0.30229136138256857,
"learning_rate": 4.619144638392289e-06,
"loss": 0.6063,
"step": 691
},
{
"epoch": 0.6067846769331945,
"grad_norm": 0.2903177693650587,
"learning_rate": 4.6178886063634295e-06,
"loss": 0.6022,
"step": 692
},
{
"epoch": 0.6076615334027511,
"grad_norm": 0.29466063932438424,
"learning_rate": 4.616630677948924e-06,
"loss": 0.609,
"step": 693
},
{
"epoch": 0.6085383898723078,
"grad_norm": 0.29795014881552045,
"learning_rate": 4.615370854275138e-06,
"loss": 0.5923,
"step": 694
},
{
"epoch": 0.6094152463418644,
"grad_norm": 0.2835342651327551,
"learning_rate": 4.614109136470133e-06,
"loss": 0.5941,
"step": 695
},
{
"epoch": 0.6102921028114211,
"grad_norm": 0.2914927284695803,
"learning_rate": 4.612845525663671e-06,
"loss": 0.5915,
"step": 696
},
{
"epoch": 0.6111689592809777,
"grad_norm": 0.27150994490869584,
"learning_rate": 4.611580022987202e-06,
"loss": 0.5903,
"step": 697
},
{
"epoch": 0.6120458157505343,
"grad_norm": 0.27427922033901636,
"learning_rate": 4.610312629573877e-06,
"loss": 0.5826,
"step": 698
},
{
"epoch": 0.612922672220091,
"grad_norm": 0.3257835351903302,
"learning_rate": 4.609043346558536e-06,
"loss": 0.608,
"step": 699
},
{
"epoch": 0.6137995286896476,
"grad_norm": 0.27542786817313375,
"learning_rate": 4.607772175077712e-06,
"loss": 0.5914,
"step": 700
},
{
"epoch": 0.6146763851592043,
"grad_norm": 0.32541464673918596,
"learning_rate": 4.606499116269628e-06,
"loss": 0.6004,
"step": 701
},
{
"epoch": 0.6155532416287609,
"grad_norm": 0.2775394483279354,
"learning_rate": 4.605224171274198e-06,
"loss": 0.6042,
"step": 702
},
{
"epoch": 0.6164300980983175,
"grad_norm": 0.3010566442707075,
"learning_rate": 4.603947341233026e-06,
"loss": 0.5893,
"step": 703
},
{
"epoch": 0.6173069545678742,
"grad_norm": 0.28841806172316603,
"learning_rate": 4.602668627289401e-06,
"loss": 0.5932,
"step": 704
},
{
"epoch": 0.6181838110374308,
"grad_norm": 0.32720143492110876,
"learning_rate": 4.601388030588303e-06,
"loss": 0.594,
"step": 705
},
{
"epoch": 0.6190606675069874,
"grad_norm": 0.2629157828769276,
"learning_rate": 4.600105552276393e-06,
"loss": 0.5962,
"step": 706
},
{
"epoch": 0.6199375239765441,
"grad_norm": 0.2976311641314985,
"learning_rate": 4.598821193502019e-06,
"loss": 0.5993,
"step": 707
},
{
"epoch": 0.6208143804461007,
"grad_norm": 0.3223849407278096,
"learning_rate": 4.597534955415214e-06,
"loss": 0.6023,
"step": 708
},
{
"epoch": 0.6216912369156574,
"grad_norm": 0.3228934470983084,
"learning_rate": 4.596246839167692e-06,
"loss": 0.6058,
"step": 709
},
{
"epoch": 0.622568093385214,
"grad_norm": 0.2842350311614894,
"learning_rate": 4.59495684591285e-06,
"loss": 0.5965,
"step": 710
},
{
"epoch": 0.6234449498547706,
"grad_norm": 0.30037127301855626,
"learning_rate": 4.593664976805765e-06,
"loss": 0.5912,
"step": 711
},
{
"epoch": 0.6243218063243273,
"grad_norm": 0.29537031301186273,
"learning_rate": 4.592371233003195e-06,
"loss": 0.5847,
"step": 712
},
{
"epoch": 0.6251986627938839,
"grad_norm": 0.3099776656835445,
"learning_rate": 4.5910756156635725e-06,
"loss": 0.6061,
"step": 713
},
{
"epoch": 0.6260755192634405,
"grad_norm": 0.3343474177937486,
"learning_rate": 4.589778125947012e-06,
"loss": 0.5775,
"step": 714
},
{
"epoch": 0.6269523757329972,
"grad_norm": 0.26492597760028275,
"learning_rate": 4.588478765015304e-06,
"loss": 0.6008,
"step": 715
},
{
"epoch": 0.6278292322025538,
"grad_norm": 0.2996728173414987,
"learning_rate": 4.587177534031914e-06,
"loss": 0.5868,
"step": 716
},
{
"epoch": 0.6287060886721105,
"grad_norm": 0.269698012084879,
"learning_rate": 4.585874434161979e-06,
"loss": 0.5908,
"step": 717
},
{
"epoch": 0.6295829451416671,
"grad_norm": 0.3120812259438331,
"learning_rate": 4.584569466572313e-06,
"loss": 0.5964,
"step": 718
},
{
"epoch": 0.6304598016112237,
"grad_norm": 0.306605213663903,
"learning_rate": 4.583262632431402e-06,
"loss": 0.587,
"step": 719
},
{
"epoch": 0.6313366580807804,
"grad_norm": 0.31045769873517814,
"learning_rate": 4.581953932909403e-06,
"loss": 0.5924,
"step": 720
},
{
"epoch": 0.632213514550337,
"grad_norm": 0.30956000847409926,
"learning_rate": 4.580643369178142e-06,
"loss": 0.5905,
"step": 721
},
{
"epoch": 0.6330903710198936,
"grad_norm": 0.2980650280091205,
"learning_rate": 4.579330942411115e-06,
"loss": 0.5961,
"step": 722
},
{
"epoch": 0.6339672274894503,
"grad_norm": 0.2784986194522932,
"learning_rate": 4.578016653783488e-06,
"loss": 0.5962,
"step": 723
},
{
"epoch": 0.6348440839590069,
"grad_norm": 0.32816601752120567,
"learning_rate": 4.57670050447209e-06,
"loss": 0.6149,
"step": 724
},
{
"epoch": 0.6357209404285636,
"grad_norm": 0.2822290286934802,
"learning_rate": 4.575382495655421e-06,
"loss": 0.5915,
"step": 725
},
{
"epoch": 0.6365977968981202,
"grad_norm": 0.2993973936416954,
"learning_rate": 4.574062628513643e-06,
"loss": 0.59,
"step": 726
},
{
"epoch": 0.6374746533676768,
"grad_norm": 0.27875804168057794,
"learning_rate": 4.572740904228582e-06,
"loss": 0.6018,
"step": 727
},
{
"epoch": 0.6383515098372335,
"grad_norm": 0.3144256132274513,
"learning_rate": 4.571417323983727e-06,
"loss": 0.6056,
"step": 728
},
{
"epoch": 0.6392283663067901,
"grad_norm": 0.2763723528672814,
"learning_rate": 4.570091888964231e-06,
"loss": 0.5943,
"step": 729
},
{
"epoch": 0.6401052227763468,
"grad_norm": 0.3001278571328794,
"learning_rate": 4.5687646003569055e-06,
"loss": 0.588,
"step": 730
},
{
"epoch": 0.6409820792459034,
"grad_norm": 0.2847820308061442,
"learning_rate": 4.567435459350222e-06,
"loss": 0.5971,
"step": 731
},
{
"epoch": 0.64185893571546,
"grad_norm": 0.292512543142512,
"learning_rate": 4.566104467134311e-06,
"loss": 0.5864,
"step": 732
},
{
"epoch": 0.6427357921850168,
"grad_norm": 0.28968651062565176,
"learning_rate": 4.564771624900961e-06,
"loss": 0.62,
"step": 733
},
{
"epoch": 0.6436126486545733,
"grad_norm": 0.3004795852693458,
"learning_rate": 4.563436933843617e-06,
"loss": 0.5964,
"step": 734
},
{
"epoch": 0.64448950512413,
"grad_norm": 0.2865806085716862,
"learning_rate": 4.562100395157379e-06,
"loss": 0.6026,
"step": 735
},
{
"epoch": 0.6453663615936867,
"grad_norm": 0.2842649974188147,
"learning_rate": 4.560762010039001e-06,
"loss": 0.5913,
"step": 736
},
{
"epoch": 0.6462432180632433,
"grad_norm": 0.28683866497814775,
"learning_rate": 4.5594217796868915e-06,
"loss": 0.5951,
"step": 737
},
{
"epoch": 0.6471200745328,
"grad_norm": 0.2764873070461295,
"learning_rate": 4.558079705301109e-06,
"loss": 0.6053,
"step": 738
},
{
"epoch": 0.6479969310023566,
"grad_norm": 0.27004479414645,
"learning_rate": 4.556735788083366e-06,
"loss": 0.6039,
"step": 739
},
{
"epoch": 0.6488737874719132,
"grad_norm": 0.29052397029213667,
"learning_rate": 4.555390029237026e-06,
"loss": 0.601,
"step": 740
},
{
"epoch": 0.6497506439414699,
"grad_norm": 0.2947691340138793,
"learning_rate": 4.554042429967095e-06,
"loss": 0.6025,
"step": 741
},
{
"epoch": 0.6506275004110265,
"grad_norm": 0.2792458027197797,
"learning_rate": 4.552692991480234e-06,
"loss": 0.6014,
"step": 742
},
{
"epoch": 0.6515043568805831,
"grad_norm": 0.3382217380230472,
"learning_rate": 4.551341714984748e-06,
"loss": 0.5955,
"step": 743
},
{
"epoch": 0.6523812133501398,
"grad_norm": 0.2966197192699023,
"learning_rate": 4.549988601690588e-06,
"loss": 0.5935,
"step": 744
},
{
"epoch": 0.6532580698196964,
"grad_norm": 0.31516646846151397,
"learning_rate": 4.54863365280935e-06,
"loss": 0.597,
"step": 745
},
{
"epoch": 0.6541349262892531,
"grad_norm": 0.28496714910224397,
"learning_rate": 4.547276869554272e-06,
"loss": 0.5814,
"step": 746
},
{
"epoch": 0.6550117827588097,
"grad_norm": 0.30669749001026353,
"learning_rate": 4.545918253140236e-06,
"loss": 0.5952,
"step": 747
},
{
"epoch": 0.6558886392283663,
"grad_norm": 0.2812261666412913,
"learning_rate": 4.544557804783765e-06,
"loss": 0.6162,
"step": 748
},
{
"epoch": 0.656765495697923,
"grad_norm": 0.27761745178740765,
"learning_rate": 4.543195525703024e-06,
"loss": 0.5807,
"step": 749
},
{
"epoch": 0.6576423521674796,
"grad_norm": 0.31002121863979637,
"learning_rate": 4.541831417117815e-06,
"loss": 0.5851,
"step": 750
},
{
"epoch": 0.6585192086370363,
"grad_norm": 0.29034303454873894,
"learning_rate": 4.540465480249579e-06,
"loss": 0.6019,
"step": 751
},
{
"epoch": 0.6593960651065929,
"grad_norm": 0.30559901683462565,
"learning_rate": 4.539097716321394e-06,
"loss": 0.5866,
"step": 752
},
{
"epoch": 0.6602729215761495,
"grad_norm": 0.2641221990159659,
"learning_rate": 4.537728126557974e-06,
"loss": 0.5972,
"step": 753
},
{
"epoch": 0.6611497780457062,
"grad_norm": 0.3227708789669896,
"learning_rate": 4.536356712185668e-06,
"loss": 0.5796,
"step": 754
},
{
"epoch": 0.6620266345152628,
"grad_norm": 0.294701481555053,
"learning_rate": 4.534983474432458e-06,
"loss": 0.6149,
"step": 755
},
{
"epoch": 0.6629034909848194,
"grad_norm": 0.32377533070879033,
"learning_rate": 4.533608414527961e-06,
"loss": 0.5891,
"step": 756
},
{
"epoch": 0.6637803474543761,
"grad_norm": 0.3042889879699245,
"learning_rate": 4.532231533703423e-06,
"loss": 0.5913,
"step": 757
},
{
"epoch": 0.6646572039239327,
"grad_norm": 0.31760559251266973,
"learning_rate": 4.53085283319172e-06,
"loss": 0.6096,
"step": 758
},
{
"epoch": 0.6655340603934894,
"grad_norm": 0.3078941609749165,
"learning_rate": 4.529472314227362e-06,
"loss": 0.5905,
"step": 759
},
{
"epoch": 0.666410916863046,
"grad_norm": 0.30990175786815527,
"learning_rate": 4.528089978046481e-06,
"loss": 0.5991,
"step": 760
},
{
"epoch": 0.6672877733326026,
"grad_norm": 0.32903820758007046,
"learning_rate": 4.5267058258868414e-06,
"loss": 0.5882,
"step": 761
},
{
"epoch": 0.6681646298021593,
"grad_norm": 0.29452587669480845,
"learning_rate": 4.52531985898783e-06,
"loss": 0.5803,
"step": 762
},
{
"epoch": 0.6690414862717159,
"grad_norm": 0.30776706716693625,
"learning_rate": 4.52393207859046e-06,
"loss": 0.577,
"step": 763
},
{
"epoch": 0.6699183427412725,
"grad_norm": 0.31422641761257675,
"learning_rate": 4.522542485937369e-06,
"loss": 0.6018,
"step": 764
},
{
"epoch": 0.6707951992108292,
"grad_norm": 0.3173718550935184,
"learning_rate": 4.521151082272817e-06,
"loss": 0.5882,
"step": 765
},
{
"epoch": 0.6716720556803858,
"grad_norm": 0.2986562015643124,
"learning_rate": 4.519757868842685e-06,
"loss": 0.579,
"step": 766
},
{
"epoch": 0.6725489121499425,
"grad_norm": 0.3090764441547647,
"learning_rate": 4.518362846894475e-06,
"loss": 0.5985,
"step": 767
},
{
"epoch": 0.6734257686194991,
"grad_norm": 0.30790241933986734,
"learning_rate": 4.516966017677308e-06,
"loss": 0.5863,
"step": 768
},
{
"epoch": 0.6743026250890557,
"grad_norm": 0.2994056106304016,
"learning_rate": 4.515567382441923e-06,
"loss": 0.5991,
"step": 769
},
{
"epoch": 0.6751794815586124,
"grad_norm": 0.2958764046270931,
"learning_rate": 4.514166942440679e-06,
"loss": 0.5963,
"step": 770
},
{
"epoch": 0.676056338028169,
"grad_norm": 0.28788185549499157,
"learning_rate": 4.512764698927545e-06,
"loss": 0.6064,
"step": 771
},
{
"epoch": 0.6769331944977256,
"grad_norm": 0.29708423016925406,
"learning_rate": 4.511360653158111e-06,
"loss": 0.5947,
"step": 772
},
{
"epoch": 0.6778100509672823,
"grad_norm": 0.30991902940049315,
"learning_rate": 4.509954806389577e-06,
"loss": 0.5987,
"step": 773
},
{
"epoch": 0.6786869074368389,
"grad_norm": 0.2873916475278516,
"learning_rate": 4.508547159880758e-06,
"loss": 0.5924,
"step": 774
},
{
"epoch": 0.6795637639063956,
"grad_norm": 0.3007245570293541,
"learning_rate": 4.50713771489208e-06,
"loss": 0.6015,
"step": 775
},
{
"epoch": 0.6804406203759522,
"grad_norm": 0.30867041078073276,
"learning_rate": 4.505726472685577e-06,
"loss": 0.5957,
"step": 776
},
{
"epoch": 0.6813174768455088,
"grad_norm": 0.31345922212682475,
"learning_rate": 4.504313434524894e-06,
"loss": 0.6006,
"step": 777
},
{
"epoch": 0.6821943333150655,
"grad_norm": 0.29707717549610757,
"learning_rate": 4.502898601675285e-06,
"loss": 0.5778,
"step": 778
},
{
"epoch": 0.6830711897846221,
"grad_norm": 0.3796068136152165,
"learning_rate": 4.501481975403611e-06,
"loss": 0.5991,
"step": 779
},
{
"epoch": 0.6839480462541788,
"grad_norm": 0.28337342976468866,
"learning_rate": 4.5000635569783365e-06,
"loss": 0.5948,
"step": 780
},
{
"epoch": 0.6848249027237354,
"grad_norm": 0.31230108669893153,
"learning_rate": 4.498643347669533e-06,
"loss": 0.5925,
"step": 781
},
{
"epoch": 0.685701759193292,
"grad_norm": 0.27904331433791485,
"learning_rate": 4.497221348748874e-06,
"loss": 0.5916,
"step": 782
},
{
"epoch": 0.6865786156628487,
"grad_norm": 0.2942542969448629,
"learning_rate": 4.4957975614896386e-06,
"loss": 0.5992,
"step": 783
},
{
"epoch": 0.6874554721324053,
"grad_norm": 0.2908765617548673,
"learning_rate": 4.494371987166703e-06,
"loss": 0.6065,
"step": 784
},
{
"epoch": 0.6883323286019619,
"grad_norm": 0.2840490179126863,
"learning_rate": 4.492944627056544e-06,
"loss": 0.5902,
"step": 785
},
{
"epoch": 0.6892091850715186,
"grad_norm": 0.2727369127304506,
"learning_rate": 4.491515482437242e-06,
"loss": 0.5867,
"step": 786
},
{
"epoch": 0.6900860415410752,
"grad_norm": 0.28769481832954025,
"learning_rate": 4.4900845545884695e-06,
"loss": 0.5922,
"step": 787
},
{
"epoch": 0.6909628980106319,
"grad_norm": 0.2906309237155975,
"learning_rate": 4.4886518447915e-06,
"loss": 0.5887,
"step": 788
},
{
"epoch": 0.6918397544801885,
"grad_norm": 0.2948842293422461,
"learning_rate": 4.487217354329201e-06,
"loss": 0.6006,
"step": 789
},
{
"epoch": 0.6927166109497451,
"grad_norm": 0.302074977476922,
"learning_rate": 4.4857810844860325e-06,
"loss": 0.5866,
"step": 790
},
{
"epoch": 0.6935934674193018,
"grad_norm": 0.32893770275300094,
"learning_rate": 4.484343036548051e-06,
"loss": 0.5976,
"step": 791
},
{
"epoch": 0.6944703238888584,
"grad_norm": 0.2778002794834819,
"learning_rate": 4.482903211802904e-06,
"loss": 0.584,
"step": 792
},
{
"epoch": 0.695347180358415,
"grad_norm": 0.294631010190205,
"learning_rate": 4.481461611539829e-06,
"loss": 0.5796,
"step": 793
},
{
"epoch": 0.6962240368279717,
"grad_norm": 0.26497721691156156,
"learning_rate": 4.480018237049655e-06,
"loss": 0.5921,
"step": 794
},
{
"epoch": 0.6971008932975283,
"grad_norm": 0.2571147884128945,
"learning_rate": 4.4785730896247985e-06,
"loss": 0.5967,
"step": 795
},
{
"epoch": 0.697977749767085,
"grad_norm": 0.27928133327664356,
"learning_rate": 4.477126170559262e-06,
"loss": 0.5933,
"step": 796
},
{
"epoch": 0.6988546062366416,
"grad_norm": 0.2678842819485542,
"learning_rate": 4.475677481148638e-06,
"loss": 0.6041,
"step": 797
},
{
"epoch": 0.6997314627061982,
"grad_norm": 0.2891606093702898,
"learning_rate": 4.474227022690102e-06,
"loss": 0.5957,
"step": 798
},
{
"epoch": 0.700608319175755,
"grad_norm": 0.288045727848727,
"learning_rate": 4.4727747964824135e-06,
"loss": 0.5904,
"step": 799
},
{
"epoch": 0.7014851756453115,
"grad_norm": 0.31585634496103415,
"learning_rate": 4.471320803825915e-06,
"loss": 0.5976,
"step": 800
},
{
"epoch": 0.7023620321148683,
"grad_norm": 0.2748185200755283,
"learning_rate": 4.469865046022531e-06,
"loss": 0.5752,
"step": 801
},
{
"epoch": 0.7032388885844248,
"grad_norm": 0.3355774877957403,
"learning_rate": 4.468407524375767e-06,
"loss": 0.5983,
"step": 802
},
{
"epoch": 0.7041157450539814,
"grad_norm": 0.29100988533473726,
"learning_rate": 4.466948240190707e-06,
"loss": 0.5942,
"step": 803
},
{
"epoch": 0.7049926015235382,
"grad_norm": 0.32395113661904446,
"learning_rate": 4.465487194774012e-06,
"loss": 0.5934,
"step": 804
},
{
"epoch": 0.7058694579930948,
"grad_norm": 0.27010926989878575,
"learning_rate": 4.464024389433924e-06,
"loss": 0.5965,
"step": 805
},
{
"epoch": 0.7067463144626513,
"grad_norm": 0.31589368881558894,
"learning_rate": 4.462559825480257e-06,
"loss": 0.5892,
"step": 806
},
{
"epoch": 0.7076231709322081,
"grad_norm": 0.2696414843727876,
"learning_rate": 4.461093504224401e-06,
"loss": 0.5995,
"step": 807
},
{
"epoch": 0.7085000274017647,
"grad_norm": 0.2953330107498836,
"learning_rate": 4.459625426979319e-06,
"loss": 0.5918,
"step": 808
},
{
"epoch": 0.7093768838713214,
"grad_norm": 0.281894292123873,
"learning_rate": 4.458155595059549e-06,
"loss": 0.5955,
"step": 809
},
{
"epoch": 0.710253740340878,
"grad_norm": 0.27376761478776995,
"learning_rate": 4.4566840097811956e-06,
"loss": 0.5871,
"step": 810
},
{
"epoch": 0.7111305968104346,
"grad_norm": 0.27713167306531405,
"learning_rate": 4.455210672461938e-06,
"loss": 0.595,
"step": 811
},
{
"epoch": 0.7120074532799913,
"grad_norm": 0.27385713088626723,
"learning_rate": 4.453735584421021e-06,
"loss": 0.5899,
"step": 812
},
{
"epoch": 0.7128843097495479,
"grad_norm": 0.29840396727897567,
"learning_rate": 4.452258746979258e-06,
"loss": 0.5844,
"step": 813
},
{
"epoch": 0.7137611662191045,
"grad_norm": 0.28333795883109736,
"learning_rate": 4.4507801614590285e-06,
"loss": 0.5939,
"step": 814
},
{
"epoch": 0.7146380226886612,
"grad_norm": 0.3089268512848077,
"learning_rate": 4.449299829184278e-06,
"loss": 0.5859,
"step": 815
},
{
"epoch": 0.7155148791582178,
"grad_norm": 0.2808961599877815,
"learning_rate": 4.447817751480516e-06,
"loss": 0.5871,
"step": 816
},
{
"epoch": 0.7163917356277745,
"grad_norm": 0.30287533725577037,
"learning_rate": 4.446333929674816e-06,
"loss": 0.593,
"step": 817
},
{
"epoch": 0.7172685920973311,
"grad_norm": 0.30584446638710266,
"learning_rate": 4.444848365095809e-06,
"loss": 0.5917,
"step": 818
},
{
"epoch": 0.7181454485668877,
"grad_norm": 0.27241453105670504,
"learning_rate": 4.44336105907369e-06,
"loss": 0.5896,
"step": 819
},
{
"epoch": 0.7190223050364444,
"grad_norm": 0.36474064413319707,
"learning_rate": 4.4418720129402145e-06,
"loss": 0.5861,
"step": 820
},
{
"epoch": 0.719899161506001,
"grad_norm": 0.2832577542195539,
"learning_rate": 4.4403812280286915e-06,
"loss": 0.5905,
"step": 821
},
{
"epoch": 0.7207760179755577,
"grad_norm": 0.32117553322486775,
"learning_rate": 4.4388887056739926e-06,
"loss": 0.5801,
"step": 822
},
{
"epoch": 0.7216528744451143,
"grad_norm": 0.27537463782509236,
"learning_rate": 4.43739444721254e-06,
"loss": 0.587,
"step": 823
},
{
"epoch": 0.7225297309146709,
"grad_norm": 0.3274304411602489,
"learning_rate": 4.435898453982313e-06,
"loss": 0.6024,
"step": 824
},
{
"epoch": 0.7234065873842276,
"grad_norm": 0.3232032167824163,
"learning_rate": 4.434400727322844e-06,
"loss": 0.6145,
"step": 825
},
{
"epoch": 0.7242834438537842,
"grad_norm": 0.3431783037261662,
"learning_rate": 4.432901268575218e-06,
"loss": 0.5937,
"step": 826
},
{
"epoch": 0.7251603003233408,
"grad_norm": 0.30897032551229503,
"learning_rate": 4.43140007908207e-06,
"loss": 0.598,
"step": 827
},
{
"epoch": 0.7260371567928975,
"grad_norm": 0.2934772547759602,
"learning_rate": 4.429897160187584e-06,
"loss": 0.5918,
"step": 828
},
{
"epoch": 0.7269140132624541,
"grad_norm": 0.31389790755569874,
"learning_rate": 4.4283925132374946e-06,
"loss": 0.5832,
"step": 829
},
{
"epoch": 0.7277908697320108,
"grad_norm": 0.29548260652561004,
"learning_rate": 4.426886139579083e-06,
"loss": 0.5937,
"step": 830
},
{
"epoch": 0.7286677262015674,
"grad_norm": 0.3162599265610075,
"learning_rate": 4.425378040561175e-06,
"loss": 0.5889,
"step": 831
},
{
"epoch": 0.729544582671124,
"grad_norm": 0.3057143041654656,
"learning_rate": 4.423868217534144e-06,
"loss": 0.5848,
"step": 832
},
{
"epoch": 0.7304214391406807,
"grad_norm": 0.29540394945672244,
"learning_rate": 4.4223566718499055e-06,
"loss": 0.5926,
"step": 833
},
{
"epoch": 0.7312982956102373,
"grad_norm": 0.30681513325771914,
"learning_rate": 4.420843404861917e-06,
"loss": 0.5838,
"step": 834
},
{
"epoch": 0.7321751520797939,
"grad_norm": 0.29780757398255076,
"learning_rate": 4.419328417925177e-06,
"loss": 0.5922,
"step": 835
},
{
"epoch": 0.7330520085493506,
"grad_norm": 0.28283439818927025,
"learning_rate": 4.417811712396226e-06,
"loss": 0.5875,
"step": 836
},
{
"epoch": 0.7339288650189072,
"grad_norm": 0.30029201304931724,
"learning_rate": 4.416293289633144e-06,
"loss": 0.5989,
"step": 837
},
{
"epoch": 0.7348057214884639,
"grad_norm": 0.29188774973524867,
"learning_rate": 4.414773150995543e-06,
"loss": 0.5878,
"step": 838
},
{
"epoch": 0.7356825779580205,
"grad_norm": 0.3037257039566602,
"learning_rate": 4.413251297844579e-06,
"loss": 0.5849,
"step": 839
},
{
"epoch": 0.7365594344275771,
"grad_norm": 0.31802355671271254,
"learning_rate": 4.411727731542937e-06,
"loss": 0.5873,
"step": 840
},
{
"epoch": 0.7374362908971338,
"grad_norm": 0.31892860544931334,
"learning_rate": 4.410202453454841e-06,
"loss": 0.5784,
"step": 841
},
{
"epoch": 0.7383131473666904,
"grad_norm": 0.31731371407494563,
"learning_rate": 4.408675464946043e-06,
"loss": 0.5973,
"step": 842
},
{
"epoch": 0.739190003836247,
"grad_norm": 0.2807004884396655,
"learning_rate": 4.40714676738383e-06,
"loss": 0.5842,
"step": 843
},
{
"epoch": 0.7400668603058037,
"grad_norm": 0.3102700515568577,
"learning_rate": 4.405616362137017e-06,
"loss": 0.584,
"step": 844
},
{
"epoch": 0.7409437167753603,
"grad_norm": 0.28221217756766914,
"learning_rate": 4.404084250575952e-06,
"loss": 0.599,
"step": 845
},
{
"epoch": 0.741820573244917,
"grad_norm": 0.284085524365953,
"learning_rate": 4.4025504340725056e-06,
"loss": 0.5799,
"step": 846
},
{
"epoch": 0.7426974297144736,
"grad_norm": 0.35367792241463614,
"learning_rate": 4.401014914000078e-06,
"loss": 0.5724,
"step": 847
},
{
"epoch": 0.7435742861840302,
"grad_norm": 0.26695572041406385,
"learning_rate": 4.3994776917335945e-06,
"loss": 0.5864,
"step": 848
},
{
"epoch": 0.7444511426535869,
"grad_norm": 0.3230503614090004,
"learning_rate": 4.397938768649505e-06,
"loss": 0.5781,
"step": 849
},
{
"epoch": 0.7453279991231435,
"grad_norm": 0.32670313161244324,
"learning_rate": 4.39639814612578e-06,
"loss": 0.5921,
"step": 850
},
{
"epoch": 0.7462048555927002,
"grad_norm": 0.2965265275169285,
"learning_rate": 4.394855825541915e-06,
"loss": 0.5847,
"step": 851
},
{
"epoch": 0.7470817120622568,
"grad_norm": 0.3364787473225747,
"learning_rate": 4.393311808278924e-06,
"loss": 0.6032,
"step": 852
},
{
"epoch": 0.7479585685318134,
"grad_norm": 0.2925797984612242,
"learning_rate": 4.391766095719341e-06,
"loss": 0.5966,
"step": 853
},
{
"epoch": 0.7488354250013701,
"grad_norm": 0.36558987387215064,
"learning_rate": 4.390218689247216e-06,
"loss": 0.5965,
"step": 854
},
{
"epoch": 0.7497122814709267,
"grad_norm": 0.31214927998435166,
"learning_rate": 4.388669590248119e-06,
"loss": 0.5799,
"step": 855
},
{
"epoch": 0.7505891379404833,
"grad_norm": 0.36912682982458045,
"learning_rate": 4.387118800109133e-06,
"loss": 0.5994,
"step": 856
},
{
"epoch": 0.75146599441004,
"grad_norm": 0.33858825867324854,
"learning_rate": 4.385566320218857e-06,
"loss": 0.5894,
"step": 857
},
{
"epoch": 0.7523428508795966,
"grad_norm": 0.3095865037795698,
"learning_rate": 4.384012151967401e-06,
"loss": 0.5808,
"step": 858
},
{
"epoch": 0.7532197073491533,
"grad_norm": 0.3163720033341599,
"learning_rate": 4.382456296746389e-06,
"loss": 0.61,
"step": 859
},
{
"epoch": 0.7540965638187099,
"grad_norm": 0.30746322298068,
"learning_rate": 4.3808987559489536e-06,
"loss": 0.5901,
"step": 860
},
{
"epoch": 0.7549734202882665,
"grad_norm": 0.3216332568956709,
"learning_rate": 4.379339530969738e-06,
"loss": 0.5824,
"step": 861
},
{
"epoch": 0.7558502767578232,
"grad_norm": 0.2924396456503393,
"learning_rate": 4.377778623204894e-06,
"loss": 0.587,
"step": 862
},
{
"epoch": 0.7567271332273798,
"grad_norm": 0.3102518126275497,
"learning_rate": 4.3762160340520765e-06,
"loss": 0.5722,
"step": 863
},
{
"epoch": 0.7576039896969364,
"grad_norm": 0.29990520801248277,
"learning_rate": 4.374651764910452e-06,
"loss": 0.5867,
"step": 864
},
{
"epoch": 0.7584808461664931,
"grad_norm": 0.2742400854190758,
"learning_rate": 4.373085817180684e-06,
"loss": 0.5897,
"step": 865
},
{
"epoch": 0.7593577026360497,
"grad_norm": 0.2966143324054175,
"learning_rate": 4.371518192264946e-06,
"loss": 0.593,
"step": 866
},
{
"epoch": 0.7602345591056064,
"grad_norm": 0.2659050257990803,
"learning_rate": 4.3699488915669106e-06,
"loss": 0.5933,
"step": 867
},
{
"epoch": 0.761111415575163,
"grad_norm": 0.28333909213084835,
"learning_rate": 4.368377916491749e-06,
"loss": 0.5937,
"step": 868
},
{
"epoch": 0.7619882720447196,
"grad_norm": 0.294367790561846,
"learning_rate": 4.366805268446132e-06,
"loss": 0.5908,
"step": 869
},
{
"epoch": 0.7628651285142763,
"grad_norm": 0.2892104769841804,
"learning_rate": 4.365230948838232e-06,
"loss": 0.5749,
"step": 870
},
{
"epoch": 0.7637419849838329,
"grad_norm": 0.2992157610185369,
"learning_rate": 4.3636549590777144e-06,
"loss": 0.6038,
"step": 871
},
{
"epoch": 0.7646188414533897,
"grad_norm": 0.2849149162166013,
"learning_rate": 4.362077300575742e-06,
"loss": 0.5838,
"step": 872
},
{
"epoch": 0.7654956979229462,
"grad_norm": 0.27419838720395556,
"learning_rate": 4.360497974744971e-06,
"loss": 0.5792,
"step": 873
},
{
"epoch": 0.7663725543925028,
"grad_norm": 0.2719357502719954,
"learning_rate": 4.35891698299955e-06,
"loss": 0.5879,
"step": 874
},
{
"epoch": 0.7672494108620596,
"grad_norm": 0.29276621658420166,
"learning_rate": 4.357334326755123e-06,
"loss": 0.5903,
"step": 875
},
{
"epoch": 0.7681262673316162,
"grad_norm": 0.29234711934765684,
"learning_rate": 4.3557500074288175e-06,
"loss": 0.58,
"step": 876
},
{
"epoch": 0.7690031238011727,
"grad_norm": 0.2900743371372321,
"learning_rate": 4.354164026439256e-06,
"loss": 0.5798,
"step": 877
},
{
"epoch": 0.7698799802707295,
"grad_norm": 0.26606697197934875,
"learning_rate": 4.352576385206547e-06,
"loss": 0.6049,
"step": 878
},
{
"epoch": 0.770756836740286,
"grad_norm": 0.30681607920100556,
"learning_rate": 4.350987085152286e-06,
"loss": 0.5963,
"step": 879
},
{
"epoch": 0.7716336932098428,
"grad_norm": 0.28024451945836265,
"learning_rate": 4.349396127699552e-06,
"loss": 0.6063,
"step": 880
},
{
"epoch": 0.7725105496793994,
"grad_norm": 0.284435176139814,
"learning_rate": 4.347803514272911e-06,
"loss": 0.5847,
"step": 881
},
{
"epoch": 0.773387406148956,
"grad_norm": 0.2787875052171573,
"learning_rate": 4.34620924629841e-06,
"loss": 0.5909,
"step": 882
},
{
"epoch": 0.7742642626185127,
"grad_norm": 0.28222554386796406,
"learning_rate": 4.344613325203577e-06,
"loss": 0.5815,
"step": 883
},
{
"epoch": 0.7751411190880693,
"grad_norm": 0.30850175508825417,
"learning_rate": 4.343015752417421e-06,
"loss": 0.5761,
"step": 884
},
{
"epoch": 0.7760179755576259,
"grad_norm": 0.27711497578948074,
"learning_rate": 4.341416529370431e-06,
"loss": 0.5851,
"step": 885
},
{
"epoch": 0.7768948320271826,
"grad_norm": 0.2945928621135004,
"learning_rate": 4.339815657494571e-06,
"loss": 0.5922,
"step": 886
},
{
"epoch": 0.7777716884967392,
"grad_norm": 0.2843169638684151,
"learning_rate": 4.338213138223285e-06,
"loss": 0.5835,
"step": 887
},
{
"epoch": 0.7786485449662959,
"grad_norm": 0.2840612846899258,
"learning_rate": 4.336608972991489e-06,
"loss": 0.596,
"step": 888
},
{
"epoch": 0.7795254014358525,
"grad_norm": 0.2677194609487142,
"learning_rate": 4.335003163235574e-06,
"loss": 0.5794,
"step": 889
},
{
"epoch": 0.7804022579054091,
"grad_norm": 0.31211329913480695,
"learning_rate": 4.3333957103934025e-06,
"loss": 0.5765,
"step": 890
},
{
"epoch": 0.7812791143749658,
"grad_norm": 0.28583623636409483,
"learning_rate": 4.33178661590431e-06,
"loss": 0.6016,
"step": 891
},
{
"epoch": 0.7821559708445224,
"grad_norm": 0.31500304190137224,
"learning_rate": 4.330175881209102e-06,
"loss": 0.5877,
"step": 892
},
{
"epoch": 0.783032827314079,
"grad_norm": 0.2811796495740926,
"learning_rate": 4.32856350775005e-06,
"loss": 0.5881,
"step": 893
},
{
"epoch": 0.7839096837836357,
"grad_norm": 0.29273259848443445,
"learning_rate": 4.3269494969708954e-06,
"loss": 0.5921,
"step": 894
},
{
"epoch": 0.7847865402531923,
"grad_norm": 0.27373150864211443,
"learning_rate": 4.325333850316846e-06,
"loss": 0.6,
"step": 895
},
{
"epoch": 0.785663396722749,
"grad_norm": 0.3128309122282222,
"learning_rate": 4.323716569234572e-06,
"loss": 0.5904,
"step": 896
},
{
"epoch": 0.7865402531923056,
"grad_norm": 0.2825745062634813,
"learning_rate": 4.32209765517221e-06,
"loss": 0.5816,
"step": 897
},
{
"epoch": 0.7874171096618622,
"grad_norm": 0.3282727674741808,
"learning_rate": 4.320477109579354e-06,
"loss": 0.5882,
"step": 898
},
{
"epoch": 0.7882939661314189,
"grad_norm": 0.2940095641373108,
"learning_rate": 4.318854933907065e-06,
"loss": 0.5985,
"step": 899
},
{
"epoch": 0.7891708226009755,
"grad_norm": 0.31182474508449737,
"learning_rate": 4.317231129607859e-06,
"loss": 0.5843,
"step": 900
},
{
"epoch": 0.7900476790705322,
"grad_norm": 0.26489892008261595,
"learning_rate": 4.315605698135714e-06,
"loss": 0.591,
"step": 901
},
{
"epoch": 0.7909245355400888,
"grad_norm": 0.32933790566988397,
"learning_rate": 4.313978640946061e-06,
"loss": 0.5826,
"step": 902
},
{
"epoch": 0.7918013920096454,
"grad_norm": 0.2790564068544957,
"learning_rate": 4.312349959495791e-06,
"loss": 0.5897,
"step": 903
},
{
"epoch": 0.7926782484792021,
"grad_norm": 0.29278849432785253,
"learning_rate": 4.310719655243243e-06,
"loss": 0.5929,
"step": 904
},
{
"epoch": 0.7935551049487587,
"grad_norm": 0.2898094197798441,
"learning_rate": 4.309087729648217e-06,
"loss": 0.575,
"step": 905
},
{
"epoch": 0.7944319614183153,
"grad_norm": 0.2962974584908221,
"learning_rate": 4.30745418417196e-06,
"loss": 0.5874,
"step": 906
},
{
"epoch": 0.795308817887872,
"grad_norm": 0.2894965323690623,
"learning_rate": 4.305819020277169e-06,
"loss": 0.5769,
"step": 907
},
{
"epoch": 0.7961856743574286,
"grad_norm": 0.2744231484838131,
"learning_rate": 4.304182239427992e-06,
"loss": 0.5943,
"step": 908
},
{
"epoch": 0.7970625308269853,
"grad_norm": 0.2766245048172803,
"learning_rate": 4.302543843090026e-06,
"loss": 0.5814,
"step": 909
},
{
"epoch": 0.7979393872965419,
"grad_norm": 0.2842673020480384,
"learning_rate": 4.30090383273031e-06,
"loss": 0.5912,
"step": 910
},
{
"epoch": 0.7988162437660985,
"grad_norm": 0.28199584242917014,
"learning_rate": 4.2992622098173335e-06,
"loss": 0.5809,
"step": 911
},
{
"epoch": 0.7996931002356552,
"grad_norm": 0.2820675876804688,
"learning_rate": 4.297618975821027e-06,
"loss": 0.5917,
"step": 912
},
{
"epoch": 0.8005699567052118,
"grad_norm": 0.2728605500328137,
"learning_rate": 4.2959741322127635e-06,
"loss": 0.5764,
"step": 913
},
{
"epoch": 0.8014468131747684,
"grad_norm": 0.27169399222059704,
"learning_rate": 4.294327680465358e-06,
"loss": 0.5849,
"step": 914
},
{
"epoch": 0.8023236696443251,
"grad_norm": 0.28063665744680427,
"learning_rate": 4.292679622053066e-06,
"loss": 0.58,
"step": 915
},
{
"epoch": 0.8032005261138817,
"grad_norm": 0.25926421536726935,
"learning_rate": 4.29102995845158e-06,
"loss": 0.5787,
"step": 916
},
{
"epoch": 0.8040773825834384,
"grad_norm": 0.29001417666592577,
"learning_rate": 4.289378691138032e-06,
"loss": 0.5868,
"step": 917
},
{
"epoch": 0.804954239052995,
"grad_norm": 0.27215185007216747,
"learning_rate": 4.287725821590987e-06,
"loss": 0.5894,
"step": 918
},
{
"epoch": 0.8058310955225516,
"grad_norm": 0.3050881231274449,
"learning_rate": 4.286071351290447e-06,
"loss": 0.5911,
"step": 919
},
{
"epoch": 0.8067079519921083,
"grad_norm": 0.2873456207891206,
"learning_rate": 4.2844152817178476e-06,
"loss": 0.5835,
"step": 920
},
{
"epoch": 0.8075848084616649,
"grad_norm": 0.2626365139918821,
"learning_rate": 4.282757614356055e-06,
"loss": 0.5794,
"step": 921
},
{
"epoch": 0.8084616649312216,
"grad_norm": 0.28122583577721894,
"learning_rate": 4.281098350689367e-06,
"loss": 0.581,
"step": 922
},
{
"epoch": 0.8093385214007782,
"grad_norm": 0.2955727164056087,
"learning_rate": 4.279437492203509e-06,
"loss": 0.6024,
"step": 923
},
{
"epoch": 0.8102153778703348,
"grad_norm": 0.2928465088558078,
"learning_rate": 4.277775040385636e-06,
"loss": 0.5777,
"step": 924
},
{
"epoch": 0.8110922343398915,
"grad_norm": 0.279748286657514,
"learning_rate": 4.276110996724332e-06,
"loss": 0.5983,
"step": 925
},
{
"epoch": 0.8119690908094481,
"grad_norm": 0.3064104243975942,
"learning_rate": 4.274445362709602e-06,
"loss": 0.5959,
"step": 926
},
{
"epoch": 0.8128459472790047,
"grad_norm": 0.2705400124701495,
"learning_rate": 4.272778139832876e-06,
"loss": 0.5964,
"step": 927
},
{
"epoch": 0.8137228037485614,
"grad_norm": 0.3030828027995252,
"learning_rate": 4.271109329587009e-06,
"loss": 0.5784,
"step": 928
},
{
"epoch": 0.814599660218118,
"grad_norm": 0.2629159770264448,
"learning_rate": 4.2694389334662745e-06,
"loss": 0.5845,
"step": 929
},
{
"epoch": 0.8154765166876747,
"grad_norm": 0.3351422353981342,
"learning_rate": 4.267766952966369e-06,
"loss": 0.5949,
"step": 930
},
{
"epoch": 0.8163533731572313,
"grad_norm": 0.2760441532769009,
"learning_rate": 4.2660933895844055e-06,
"loss": 0.5904,
"step": 931
},
{
"epoch": 0.8172302296267879,
"grad_norm": 0.30558832310943446,
"learning_rate": 4.264418244818914e-06,
"loss": 0.5839,
"step": 932
},
{
"epoch": 0.8181070860963446,
"grad_norm": 0.28070458613560756,
"learning_rate": 4.262741520169844e-06,
"loss": 0.5791,
"step": 933
},
{
"epoch": 0.8189839425659012,
"grad_norm": 0.2735766456330096,
"learning_rate": 4.261063217138554e-06,
"loss": 0.5836,
"step": 934
},
{
"epoch": 0.8198607990354578,
"grad_norm": 0.3038178849716158,
"learning_rate": 4.259383337227821e-06,
"loss": 0.5885,
"step": 935
},
{
"epoch": 0.8207376555050145,
"grad_norm": 0.26590487432268695,
"learning_rate": 4.25770188194183e-06,
"loss": 0.6035,
"step": 936
},
{
"epoch": 0.8216145119745711,
"grad_norm": 0.31271672720672494,
"learning_rate": 4.25601885278618e-06,
"loss": 0.5926,
"step": 937
},
{
"epoch": 0.8224913684441278,
"grad_norm": 0.26261561071530615,
"learning_rate": 4.254334251267877e-06,
"loss": 0.5996,
"step": 938
},
{
"epoch": 0.8233682249136844,
"grad_norm": 0.2891665251939073,
"learning_rate": 4.252648078895336e-06,
"loss": 0.5876,
"step": 939
},
{
"epoch": 0.824245081383241,
"grad_norm": 0.2897735311167941,
"learning_rate": 4.2509603371783776e-06,
"loss": 0.5892,
"step": 940
},
{
"epoch": 0.8251219378527977,
"grad_norm": 0.28026024666883764,
"learning_rate": 4.249271027628228e-06,
"loss": 0.587,
"step": 941
},
{
"epoch": 0.8259987943223543,
"grad_norm": 0.2765283292737123,
"learning_rate": 4.24758015175752e-06,
"loss": 0.5769,
"step": 942
},
{
"epoch": 0.826875650791911,
"grad_norm": 0.2921232680301083,
"learning_rate": 4.245887711080283e-06,
"loss": 0.5854,
"step": 943
},
{
"epoch": 0.8277525072614677,
"grad_norm": 0.3005072830624817,
"learning_rate": 4.2441937071119524e-06,
"loss": 0.5802,
"step": 944
},
{
"epoch": 0.8286293637310242,
"grad_norm": 0.27059131939602343,
"learning_rate": 4.242498141369361e-06,
"loss": 0.5837,
"step": 945
},
{
"epoch": 0.829506220200581,
"grad_norm": 0.3038588097565146,
"learning_rate": 4.240801015370743e-06,
"loss": 0.5869,
"step": 946
},
{
"epoch": 0.8303830766701376,
"grad_norm": 0.31875741653821127,
"learning_rate": 4.239102330635726e-06,
"loss": 0.5836,
"step": 947
},
{
"epoch": 0.8312599331396942,
"grad_norm": 0.26475770270890336,
"learning_rate": 4.2374020886853354e-06,
"loss": 0.5796,
"step": 948
},
{
"epoch": 0.8321367896092509,
"grad_norm": 0.31635648581412845,
"learning_rate": 4.235700291041989e-06,
"loss": 0.5732,
"step": 949
},
{
"epoch": 0.8330136460788075,
"grad_norm": 0.27123635854757305,
"learning_rate": 4.233996939229502e-06,
"loss": 0.5977,
"step": 950
},
{
"epoch": 0.8338905025483642,
"grad_norm": 0.3356358824197267,
"learning_rate": 4.232292034773076e-06,
"loss": 0.5871,
"step": 951
},
{
"epoch": 0.8347673590179208,
"grad_norm": 0.2723531290949244,
"learning_rate": 4.230585579199306e-06,
"loss": 0.5916,
"step": 952
},
{
"epoch": 0.8356442154874774,
"grad_norm": 0.2975424730057694,
"learning_rate": 4.228877574036175e-06,
"loss": 0.592,
"step": 953
},
{
"epoch": 0.8365210719570341,
"grad_norm": 0.28108527975014536,
"learning_rate": 4.227168020813053e-06,
"loss": 0.5788,
"step": 954
},
{
"epoch": 0.8373979284265907,
"grad_norm": 0.26358656072328285,
"learning_rate": 4.225456921060698e-06,
"loss": 0.5728,
"step": 955
},
{
"epoch": 0.8382747848961473,
"grad_norm": 0.2793044648839571,
"learning_rate": 4.223744276311249e-06,
"loss": 0.5714,
"step": 956
},
{
"epoch": 0.839151641365704,
"grad_norm": 0.30214577120239683,
"learning_rate": 4.222030088098233e-06,
"loss": 0.5993,
"step": 957
},
{
"epoch": 0.8400284978352606,
"grad_norm": 0.2639515397393347,
"learning_rate": 4.220314357956557e-06,
"loss": 0.5994,
"step": 958
},
{
"epoch": 0.8409053543048173,
"grad_norm": 0.3298154347341819,
"learning_rate": 4.218597087422508e-06,
"loss": 0.5877,
"step": 959
},
{
"epoch": 0.8417822107743739,
"grad_norm": 0.28203599665081885,
"learning_rate": 4.216878278033753e-06,
"loss": 0.5865,
"step": 960
},
{
"epoch": 0.8426590672439305,
"grad_norm": 0.2746406409148874,
"learning_rate": 4.2151579313293364e-06,
"loss": 0.5881,
"step": 961
},
{
"epoch": 0.8435359237134872,
"grad_norm": 0.33875497622714734,
"learning_rate": 4.2134360488496804e-06,
"loss": 0.6029,
"step": 962
},
{
"epoch": 0.8444127801830438,
"grad_norm": 0.2875141188036911,
"learning_rate": 4.211712632136581e-06,
"loss": 0.5845,
"step": 963
},
{
"epoch": 0.8452896366526004,
"grad_norm": 0.32374197566257723,
"learning_rate": 4.209987682733207e-06,
"loss": 0.589,
"step": 964
},
{
"epoch": 0.8461664931221571,
"grad_norm": 0.26718900480287466,
"learning_rate": 4.208261202184104e-06,
"loss": 0.5844,
"step": 965
},
{
"epoch": 0.8470433495917137,
"grad_norm": 0.29759515513279916,
"learning_rate": 4.206533192035184e-06,
"loss": 0.5817,
"step": 966
},
{
"epoch": 0.8479202060612704,
"grad_norm": 0.28330165664862006,
"learning_rate": 4.20480365383373e-06,
"loss": 0.5853,
"step": 967
},
{
"epoch": 0.848797062530827,
"grad_norm": 0.26991723910735316,
"learning_rate": 4.203072589128394e-06,
"loss": 0.5847,
"step": 968
},
{
"epoch": 0.8496739190003836,
"grad_norm": 0.28120405866784015,
"learning_rate": 4.201339999469194e-06,
"loss": 0.5771,
"step": 969
},
{
"epoch": 0.8505507754699403,
"grad_norm": 0.29731566030764794,
"learning_rate": 4.199605886407515e-06,
"loss": 0.5872,
"step": 970
},
{
"epoch": 0.8514276319394969,
"grad_norm": 0.29823098898704575,
"learning_rate": 4.197870251496104e-06,
"loss": 0.585,
"step": 971
},
{
"epoch": 0.8523044884090536,
"grad_norm": 0.29246400163730035,
"learning_rate": 4.196133096289071e-06,
"loss": 0.5728,
"step": 972
},
{
"epoch": 0.8531813448786102,
"grad_norm": 0.31038345035918974,
"learning_rate": 4.194394422341888e-06,
"loss": 0.588,
"step": 973
},
{
"epoch": 0.8540582013481668,
"grad_norm": 0.29419655403066824,
"learning_rate": 4.192654231211389e-06,
"loss": 0.5802,
"step": 974
},
{
"epoch": 0.8549350578177235,
"grad_norm": 0.28924212129082133,
"learning_rate": 4.190912524455762e-06,
"loss": 0.5957,
"step": 975
},
{
"epoch": 0.8558119142872801,
"grad_norm": 0.3433724407789192,
"learning_rate": 4.189169303634555e-06,
"loss": 0.5943,
"step": 976
},
{
"epoch": 0.8566887707568367,
"grad_norm": 0.3447246872111939,
"learning_rate": 4.187424570308671e-06,
"loss": 0.5679,
"step": 977
},
{
"epoch": 0.8575656272263934,
"grad_norm": 0.2717297839127488,
"learning_rate": 4.185678326040369e-06,
"loss": 0.5839,
"step": 978
},
{
"epoch": 0.85844248369595,
"grad_norm": 0.3149777108439808,
"learning_rate": 4.1839305723932565e-06,
"loss": 0.5684,
"step": 979
},
{
"epoch": 0.8593193401655067,
"grad_norm": 0.3196280126814673,
"learning_rate": 4.1821813109322975e-06,
"loss": 0.5845,
"step": 980
},
{
"epoch": 0.8601961966350633,
"grad_norm": 0.3166850113740036,
"learning_rate": 4.180430543223803e-06,
"loss": 0.5722,
"step": 981
},
{
"epoch": 0.8610730531046199,
"grad_norm": 0.30727325041845543,
"learning_rate": 4.178678270835435e-06,
"loss": 0.582,
"step": 982
},
{
"epoch": 0.8619499095741766,
"grad_norm": 0.34738075452538025,
"learning_rate": 4.1769244953361995e-06,
"loss": 0.5789,
"step": 983
},
{
"epoch": 0.8628267660437332,
"grad_norm": 0.3029018585056203,
"learning_rate": 4.1751692182964524e-06,
"loss": 0.5906,
"step": 984
},
{
"epoch": 0.8637036225132898,
"grad_norm": 0.27172806950560857,
"learning_rate": 4.1734124412878915e-06,
"loss": 0.5864,
"step": 985
},
{
"epoch": 0.8645804789828465,
"grad_norm": 0.3078626255245488,
"learning_rate": 4.171654165883558e-06,
"loss": 0.5961,
"step": 986
},
{
"epoch": 0.8654573354524031,
"grad_norm": 0.28755523271585887,
"learning_rate": 4.169894393657834e-06,
"loss": 0.5881,
"step": 987
},
{
"epoch": 0.8663341919219598,
"grad_norm": 0.3081436303822685,
"learning_rate": 4.168133126186445e-06,
"loss": 0.5818,
"step": 988
},
{
"epoch": 0.8672110483915164,
"grad_norm": 0.2785218381541765,
"learning_rate": 4.166370365046452e-06,
"loss": 0.5828,
"step": 989
},
{
"epoch": 0.868087904861073,
"grad_norm": 0.3391784184001714,
"learning_rate": 4.164606111816256e-06,
"loss": 0.5867,
"step": 990
},
{
"epoch": 0.8689647613306297,
"grad_norm": 0.27636992919331915,
"learning_rate": 4.162840368075591e-06,
"loss": 0.599,
"step": 991
},
{
"epoch": 0.8698416178001863,
"grad_norm": 0.28517927301055196,
"learning_rate": 4.161073135405529e-06,
"loss": 0.5831,
"step": 992
},
{
"epoch": 0.870718474269743,
"grad_norm": 0.29490820494014364,
"learning_rate": 4.1593044153884745e-06,
"loss": 0.5757,
"step": 993
},
{
"epoch": 0.8715953307392996,
"grad_norm": 0.2780476402469785,
"learning_rate": 4.157534209608161e-06,
"loss": 0.5964,
"step": 994
},
{
"epoch": 0.8724721872088562,
"grad_norm": 0.29068689725516644,
"learning_rate": 4.155762519649654e-06,
"loss": 0.5805,
"step": 995
},
{
"epoch": 0.8733490436784129,
"grad_norm": 0.26095614944942314,
"learning_rate": 4.15398934709935e-06,
"loss": 0.5841,
"step": 996
},
{
"epoch": 0.8742259001479695,
"grad_norm": 0.31389428529448765,
"learning_rate": 4.1522146935449705e-06,
"loss": 0.5846,
"step": 997
},
{
"epoch": 0.8751027566175261,
"grad_norm": 0.26816106638671405,
"learning_rate": 4.150438560575563e-06,
"loss": 0.5833,
"step": 998
},
{
"epoch": 0.8759796130870828,
"grad_norm": 0.31604277041792156,
"learning_rate": 4.1486609497815025e-06,
"loss": 0.5888,
"step": 999
},
{
"epoch": 0.8768564695566394,
"grad_norm": 0.3606037237047822,
"learning_rate": 4.146881862754485e-06,
"loss": 0.5942,
"step": 1000
},
{
"epoch": 0.8777333260261961,
"grad_norm": 0.28543513756367406,
"learning_rate": 4.145101301087527e-06,
"loss": 0.5915,
"step": 1001
},
{
"epoch": 0.8786101824957527,
"grad_norm": 0.3462271962536017,
"learning_rate": 4.143319266374969e-06,
"loss": 0.5942,
"step": 1002
},
{
"epoch": 0.8794870389653093,
"grad_norm": 0.2833352289445499,
"learning_rate": 4.141535760212467e-06,
"loss": 0.5863,
"step": 1003
},
{
"epoch": 0.880363895434866,
"grad_norm": 0.35489814354695126,
"learning_rate": 4.139750784196998e-06,
"loss": 0.5924,
"step": 1004
},
{
"epoch": 0.8812407519044226,
"grad_norm": 0.2942335535458572,
"learning_rate": 4.137964339926852e-06,
"loss": 0.5892,
"step": 1005
},
{
"epoch": 0.8821176083739792,
"grad_norm": 0.32828822885224784,
"learning_rate": 4.136176429001634e-06,
"loss": 0.5909,
"step": 1006
},
{
"epoch": 0.8829944648435359,
"grad_norm": 0.3123727759868493,
"learning_rate": 4.134387053022266e-06,
"loss": 0.5845,
"step": 1007
},
{
"epoch": 0.8838713213130925,
"grad_norm": 0.2862421766790686,
"learning_rate": 4.132596213590977e-06,
"loss": 0.5848,
"step": 1008
},
{
"epoch": 0.8847481777826492,
"grad_norm": 0.32232750817039807,
"learning_rate": 4.1308039123113084e-06,
"loss": 0.5869,
"step": 1009
},
{
"epoch": 0.8856250342522058,
"grad_norm": 0.28776404090006724,
"learning_rate": 4.129010150788112e-06,
"loss": 0.5992,
"step": 1010
},
{
"epoch": 0.8865018907217624,
"grad_norm": 0.3257967217812331,
"learning_rate": 4.127214930627545e-06,
"loss": 0.5828,
"step": 1011
},
{
"epoch": 0.8873787471913192,
"grad_norm": 0.3065300730664574,
"learning_rate": 4.125418253437071e-06,
"loss": 0.578,
"step": 1012
},
{
"epoch": 0.8882556036608757,
"grad_norm": 0.29218143100925903,
"learning_rate": 4.123620120825459e-06,
"loss": 0.5939,
"step": 1013
},
{
"epoch": 0.8891324601304323,
"grad_norm": 0.28565794045128473,
"learning_rate": 4.121820534402781e-06,
"loss": 0.5868,
"step": 1014
},
{
"epoch": 0.890009316599989,
"grad_norm": 0.30898296228273797,
"learning_rate": 4.120019495780412e-06,
"loss": 0.582,
"step": 1015
},
{
"epoch": 0.8908861730695457,
"grad_norm": 0.2911662733325922,
"learning_rate": 4.118217006571023e-06,
"loss": 0.5923,
"step": 1016
},
{
"epoch": 0.8917630295391024,
"grad_norm": 0.2843342810887561,
"learning_rate": 4.116413068388589e-06,
"loss": 0.5754,
"step": 1017
},
{
"epoch": 0.892639886008659,
"grad_norm": 0.334401955522752,
"learning_rate": 4.11460768284838e-06,
"loss": 0.5895,
"step": 1018
},
{
"epoch": 0.8935167424782156,
"grad_norm": 0.2600873368987441,
"learning_rate": 4.11280085156696e-06,
"loss": 0.5858,
"step": 1019
},
{
"epoch": 0.8943935989477723,
"grad_norm": 0.3051388251322737,
"learning_rate": 4.110992576162193e-06,
"loss": 0.5861,
"step": 1020
},
{
"epoch": 0.8952704554173289,
"grad_norm": 0.30230682759222505,
"learning_rate": 4.109182858253231e-06,
"loss": 0.5857,
"step": 1021
},
{
"epoch": 0.8961473118868856,
"grad_norm": 0.27145584987414345,
"learning_rate": 4.107371699460521e-06,
"loss": 0.5827,
"step": 1022
},
{
"epoch": 0.8970241683564422,
"grad_norm": 0.2886096599363367,
"learning_rate": 4.1055591014057964e-06,
"loss": 0.5732,
"step": 1023
},
{
"epoch": 0.8979010248259988,
"grad_norm": 0.2643618798342576,
"learning_rate": 4.103745065712083e-06,
"loss": 0.581,
"step": 1024
},
{
"epoch": 0.8987778812955555,
"grad_norm": 0.27612674007258925,
"learning_rate": 4.101929594003694e-06,
"loss": 0.5774,
"step": 1025
},
{
"epoch": 0.8996547377651121,
"grad_norm": 0.2694404941538916,
"learning_rate": 4.100112687906224e-06,
"loss": 0.5792,
"step": 1026
},
{
"epoch": 0.9005315942346687,
"grad_norm": 0.26812897420311116,
"learning_rate": 4.098294349046556e-06,
"loss": 0.5945,
"step": 1027
},
{
"epoch": 0.9014084507042254,
"grad_norm": 0.2744007605554886,
"learning_rate": 4.0964745790528564e-06,
"loss": 0.5712,
"step": 1028
},
{
"epoch": 0.902285307173782,
"grad_norm": 0.2614641549143825,
"learning_rate": 4.09465337955457e-06,
"loss": 0.5756,
"step": 1029
},
{
"epoch": 0.9031621636433387,
"grad_norm": 0.25643605179903173,
"learning_rate": 4.092830752182423e-06,
"loss": 0.593,
"step": 1030
},
{
"epoch": 0.9040390201128953,
"grad_norm": 0.26698048225450505,
"learning_rate": 4.091006698568419e-06,
"loss": 0.5877,
"step": 1031
},
{
"epoch": 0.9049158765824519,
"grad_norm": 0.2655671129093472,
"learning_rate": 4.0891812203458425e-06,
"loss": 0.5701,
"step": 1032
},
{
"epoch": 0.9057927330520086,
"grad_norm": 0.2706223562384906,
"learning_rate": 4.08735431914925e-06,
"loss": 0.5818,
"step": 1033
},
{
"epoch": 0.9066695895215652,
"grad_norm": 0.26684323937974636,
"learning_rate": 4.085525996614472e-06,
"loss": 0.5878,
"step": 1034
},
{
"epoch": 0.9075464459911218,
"grad_norm": 0.24564951471442678,
"learning_rate": 4.083696254378615e-06,
"loss": 0.5967,
"step": 1035
},
{
"epoch": 0.9084233024606785,
"grad_norm": 0.2761933648093443,
"learning_rate": 4.081865094080053e-06,
"loss": 0.576,
"step": 1036
},
{
"epoch": 0.9093001589302351,
"grad_norm": 0.2722027493749199,
"learning_rate": 4.080032517358431e-06,
"loss": 0.579,
"step": 1037
},
{
"epoch": 0.9101770153997918,
"grad_norm": 0.5039307385586534,
"learning_rate": 4.078198525854664e-06,
"loss": 0.5943,
"step": 1038
},
{
"epoch": 0.9110538718693484,
"grad_norm": 0.26519176650439175,
"learning_rate": 4.0763631212109315e-06,
"loss": 0.5893,
"step": 1039
},
{
"epoch": 0.911930728338905,
"grad_norm": 0.2644411261920598,
"learning_rate": 4.074526305070679e-06,
"loss": 0.5791,
"step": 1040
},
{
"epoch": 0.9128075848084617,
"grad_norm": 0.27917354228958563,
"learning_rate": 4.072688079078616e-06,
"loss": 0.5847,
"step": 1041
},
{
"epoch": 0.9136844412780183,
"grad_norm": 0.27274252297201695,
"learning_rate": 4.070848444880716e-06,
"loss": 0.5695,
"step": 1042
},
{
"epoch": 0.914561297747575,
"grad_norm": 0.26541238057197397,
"learning_rate": 4.06900740412421e-06,
"loss": 0.5858,
"step": 1043
},
{
"epoch": 0.9154381542171316,
"grad_norm": 0.2687466193673103,
"learning_rate": 4.0671649584575925e-06,
"loss": 0.5832,
"step": 1044
},
{
"epoch": 0.9163150106866882,
"grad_norm": 0.27584447196087264,
"learning_rate": 4.065321109530612e-06,
"loss": 0.5828,
"step": 1045
},
{
"epoch": 0.9171918671562449,
"grad_norm": 0.27618254494046185,
"learning_rate": 4.063475858994276e-06,
"loss": 0.5829,
"step": 1046
},
{
"epoch": 0.9180687236258015,
"grad_norm": 0.2800627797716068,
"learning_rate": 4.061629208500847e-06,
"loss": 0.5813,
"step": 1047
},
{
"epoch": 0.9189455800953581,
"grad_norm": 0.2731973027581407,
"learning_rate": 4.059781159703839e-06,
"loss": 0.5907,
"step": 1048
},
{
"epoch": 0.9198224365649148,
"grad_norm": 0.2817329916742434,
"learning_rate": 4.057931714258022e-06,
"loss": 0.5845,
"step": 1049
},
{
"epoch": 0.9206992930344714,
"grad_norm": 0.2624010665247189,
"learning_rate": 4.056080873819412e-06,
"loss": 0.579,
"step": 1050
},
{
"epoch": 0.9215761495040281,
"grad_norm": 0.26121937584936983,
"learning_rate": 4.054228640045275e-06,
"loss": 0.5857,
"step": 1051
},
{
"epoch": 0.9224530059735847,
"grad_norm": 0.2832895486337394,
"learning_rate": 4.052375014594129e-06,
"loss": 0.5957,
"step": 1052
},
{
"epoch": 0.9233298624431413,
"grad_norm": 0.27671228904328893,
"learning_rate": 4.0505199991257325e-06,
"loss": 0.5791,
"step": 1053
},
{
"epoch": 0.924206718912698,
"grad_norm": 0.266998502123574,
"learning_rate": 4.048663595301093e-06,
"loss": 0.5896,
"step": 1054
},
{
"epoch": 0.9250835753822546,
"grad_norm": 0.3094016546060802,
"learning_rate": 4.046805804782456e-06,
"loss": 0.5788,
"step": 1055
},
{
"epoch": 0.9259604318518112,
"grad_norm": 0.2782662002801493,
"learning_rate": 4.0449466292333166e-06,
"loss": 0.5888,
"step": 1056
},
{
"epoch": 0.9268372883213679,
"grad_norm": 0.27821869081922773,
"learning_rate": 4.043086070318401e-06,
"loss": 0.5879,
"step": 1057
},
{
"epoch": 0.9277141447909245,
"grad_norm": 0.32143887759720546,
"learning_rate": 4.04122412970368e-06,
"loss": 0.5884,
"step": 1058
},
{
"epoch": 0.9285910012604812,
"grad_norm": 0.2598221780539352,
"learning_rate": 4.039360809056361e-06,
"loss": 0.58,
"step": 1059
},
{
"epoch": 0.9294678577300378,
"grad_norm": 0.3300275262996093,
"learning_rate": 4.037496110044885e-06,
"loss": 0.5963,
"step": 1060
},
{
"epoch": 0.9303447141995944,
"grad_norm": 0.2723517740568475,
"learning_rate": 4.035630034338928e-06,
"loss": 0.5684,
"step": 1061
},
{
"epoch": 0.9312215706691511,
"grad_norm": 0.26174388908838997,
"learning_rate": 4.033762583609398e-06,
"loss": 0.5741,
"step": 1062
},
{
"epoch": 0.9320984271387077,
"grad_norm": 0.2879705808043353,
"learning_rate": 4.031893759528439e-06,
"loss": 0.5651,
"step": 1063
},
{
"epoch": 0.9329752836082644,
"grad_norm": 0.27573911638107307,
"learning_rate": 4.030023563769418e-06,
"loss": 0.5738,
"step": 1064
},
{
"epoch": 0.933852140077821,
"grad_norm": 0.270890009890323,
"learning_rate": 4.028151998006934e-06,
"loss": 0.5748,
"step": 1065
},
{
"epoch": 0.9347289965473776,
"grad_norm": 0.2651359065699047,
"learning_rate": 4.026279063916811e-06,
"loss": 0.5815,
"step": 1066
},
{
"epoch": 0.9356058530169343,
"grad_norm": 0.285792627094006,
"learning_rate": 4.024404763176101e-06,
"loss": 0.5714,
"step": 1067
},
{
"epoch": 0.9364827094864909,
"grad_norm": 0.25220096965602506,
"learning_rate": 4.022529097463076e-06,
"loss": 0.5761,
"step": 1068
},
{
"epoch": 0.9373595659560475,
"grad_norm": 0.2572736434059626,
"learning_rate": 4.020652068457234e-06,
"loss": 0.5813,
"step": 1069
},
{
"epoch": 0.9382364224256042,
"grad_norm": 0.2769717174034421,
"learning_rate": 4.018773677839289e-06,
"loss": 0.5902,
"step": 1070
},
{
"epoch": 0.9391132788951608,
"grad_norm": 0.2638965107730823,
"learning_rate": 4.016893927291179e-06,
"loss": 0.5774,
"step": 1071
},
{
"epoch": 0.9399901353647175,
"grad_norm": 0.26364544697361064,
"learning_rate": 4.015012818496057e-06,
"loss": 0.5885,
"step": 1072
},
{
"epoch": 0.9408669918342741,
"grad_norm": 0.2782490552191973,
"learning_rate": 4.013130353138293e-06,
"loss": 0.5734,
"step": 1073
},
{
"epoch": 0.9417438483038307,
"grad_norm": 0.2939309170345373,
"learning_rate": 4.011246532903472e-06,
"loss": 0.5863,
"step": 1074
},
{
"epoch": 0.9426207047733874,
"grad_norm": 0.27682818038097917,
"learning_rate": 4.00936135947839e-06,
"loss": 0.5878,
"step": 1075
},
{
"epoch": 0.943497561242944,
"grad_norm": 0.27100650217384786,
"learning_rate": 4.007474834551059e-06,
"loss": 0.5788,
"step": 1076
},
{
"epoch": 0.9443744177125006,
"grad_norm": 0.3179264915740243,
"learning_rate": 4.005586959810697e-06,
"loss": 0.5697,
"step": 1077
},
{
"epoch": 0.9452512741820573,
"grad_norm": 0.26927348365153236,
"learning_rate": 4.003697736947731e-06,
"loss": 0.5683,
"step": 1078
},
{
"epoch": 0.9461281306516139,
"grad_norm": 0.2755764124341007,
"learning_rate": 4.001807167653798e-06,
"loss": 0.5794,
"step": 1079
},
{
"epoch": 0.9470049871211706,
"grad_norm": 0.2908090312996085,
"learning_rate": 3.999915253621739e-06,
"loss": 0.586,
"step": 1080
},
{
"epoch": 0.9478818435907272,
"grad_norm": 0.2545666408606057,
"learning_rate": 3.998021996545599e-06,
"loss": 0.5831,
"step": 1081
},
{
"epoch": 0.9487587000602838,
"grad_norm": 0.29377943743323887,
"learning_rate": 3.9961273981206245e-06,
"loss": 0.585,
"step": 1082
},
{
"epoch": 0.9496355565298406,
"grad_norm": 0.26968750170325856,
"learning_rate": 3.994231460043265e-06,
"loss": 0.5782,
"step": 1083
},
{
"epoch": 0.9505124129993971,
"grad_norm": 0.2911018694543167,
"learning_rate": 3.9923341840111675e-06,
"loss": 0.5813,
"step": 1084
},
{
"epoch": 0.9513892694689537,
"grad_norm": 0.32080813736390973,
"learning_rate": 3.99043557172318e-06,
"loss": 0.5836,
"step": 1085
},
{
"epoch": 0.9522661259385105,
"grad_norm": 0.2894185491332872,
"learning_rate": 3.988535624879344e-06,
"loss": 0.583,
"step": 1086
},
{
"epoch": 0.953142982408067,
"grad_norm": 0.3036439907360394,
"learning_rate": 3.986634345180899e-06,
"loss": 0.5753,
"step": 1087
},
{
"epoch": 0.9540198388776238,
"grad_norm": 0.30256015219807453,
"learning_rate": 3.984731734330273e-06,
"loss": 0.5787,
"step": 1088
},
{
"epoch": 0.9548966953471804,
"grad_norm": 0.2684694121785645,
"learning_rate": 3.982827794031091e-06,
"loss": 0.5811,
"step": 1089
},
{
"epoch": 0.955773551816737,
"grad_norm": 0.3047268297869491,
"learning_rate": 3.980922525988167e-06,
"loss": 0.5757,
"step": 1090
},
{
"epoch": 0.9566504082862937,
"grad_norm": 0.2680829692432763,
"learning_rate": 3.979015931907502e-06,
"loss": 0.5938,
"step": 1091
},
{
"epoch": 0.9575272647558503,
"grad_norm": 0.28352806229638294,
"learning_rate": 3.977108013496286e-06,
"loss": 0.5648,
"step": 1092
},
{
"epoch": 0.958404121225407,
"grad_norm": 0.27134893274934896,
"learning_rate": 3.975198772462896e-06,
"loss": 0.5959,
"step": 1093
},
{
"epoch": 0.9592809776949636,
"grad_norm": 0.27670636726963027,
"learning_rate": 3.973288210516889e-06,
"loss": 0.5825,
"step": 1094
},
{
"epoch": 0.9601578341645202,
"grad_norm": 0.27577855913411087,
"learning_rate": 3.971376329369011e-06,
"loss": 0.5763,
"step": 1095
},
{
"epoch": 0.9610346906340769,
"grad_norm": 0.2613562238768912,
"learning_rate": 3.969463130731183e-06,
"loss": 0.587,
"step": 1096
},
{
"epoch": 0.9619115471036335,
"grad_norm": 0.30682832359084977,
"learning_rate": 3.96754861631651e-06,
"loss": 0.6012,
"step": 1097
},
{
"epoch": 0.9627884035731901,
"grad_norm": 0.2753727317824162,
"learning_rate": 3.965632787839274e-06,
"loss": 0.593,
"step": 1098
},
{
"epoch": 0.9636652600427468,
"grad_norm": 0.2896526629743159,
"learning_rate": 3.963715647014932e-06,
"loss": 0.5823,
"step": 1099
},
{
"epoch": 0.9645421165123034,
"grad_norm": 0.28810606366408137,
"learning_rate": 3.961797195560118e-06,
"loss": 0.5844,
"step": 1100
},
{
"epoch": 0.9654189729818601,
"grad_norm": 0.2603559754869869,
"learning_rate": 3.959877435192639e-06,
"loss": 0.5803,
"step": 1101
},
{
"epoch": 0.9662958294514167,
"grad_norm": 0.28655269690518276,
"learning_rate": 3.957956367631475e-06,
"loss": 0.5707,
"step": 1102
},
{
"epoch": 0.9671726859209733,
"grad_norm": 0.3009451530592475,
"learning_rate": 3.956033994596773e-06,
"loss": 0.5771,
"step": 1103
},
{
"epoch": 0.96804954239053,
"grad_norm": 0.2577540703327921,
"learning_rate": 3.954110317809854e-06,
"loss": 0.576,
"step": 1104
},
{
"epoch": 0.9689263988600866,
"grad_norm": 0.29870257898995317,
"learning_rate": 3.952185338993202e-06,
"loss": 0.5872,
"step": 1105
},
{
"epoch": 0.9698032553296432,
"grad_norm": 0.2768702174324288,
"learning_rate": 3.95025905987047e-06,
"loss": 0.5831,
"step": 1106
},
{
"epoch": 0.9706801117991999,
"grad_norm": 0.288774627238478,
"learning_rate": 3.948331482166473e-06,
"loss": 0.5951,
"step": 1107
},
{
"epoch": 0.9715569682687565,
"grad_norm": 0.324678524263679,
"learning_rate": 3.94640260760719e-06,
"loss": 0.5734,
"step": 1108
},
{
"epoch": 0.9724338247383132,
"grad_norm": 0.2777093036856744,
"learning_rate": 3.944472437919761e-06,
"loss": 0.5846,
"step": 1109
},
{
"epoch": 0.9733106812078698,
"grad_norm": 0.337073965677139,
"learning_rate": 3.942540974832486e-06,
"loss": 0.5904,
"step": 1110
},
{
"epoch": 0.9741875376774264,
"grad_norm": 0.2919504390486104,
"learning_rate": 3.9406082200748216e-06,
"loss": 0.5901,
"step": 1111
},
{
"epoch": 0.9750643941469831,
"grad_norm": 0.26917415244282195,
"learning_rate": 3.938674175377383e-06,
"loss": 0.5727,
"step": 1112
},
{
"epoch": 0.9759412506165397,
"grad_norm": 0.2968354712585106,
"learning_rate": 3.93673884247194e-06,
"loss": 0.5684,
"step": 1113
},
{
"epoch": 0.9768181070860964,
"grad_norm": 0.26666333819741744,
"learning_rate": 3.934802223091415e-06,
"loss": 0.582,
"step": 1114
},
{
"epoch": 0.977694963555653,
"grad_norm": 0.2648009228041306,
"learning_rate": 3.932864318969882e-06,
"loss": 0.5732,
"step": 1115
},
{
"epoch": 0.9785718200252096,
"grad_norm": 0.26447715765911384,
"learning_rate": 3.930925131842567e-06,
"loss": 0.581,
"step": 1116
},
{
"epoch": 0.9794486764947663,
"grad_norm": 0.26650421292261106,
"learning_rate": 3.928984663445844e-06,
"loss": 0.578,
"step": 1117
},
{
"epoch": 0.9803255329643229,
"grad_norm": 0.27399427740484344,
"learning_rate": 3.927042915517234e-06,
"loss": 0.5841,
"step": 1118
},
{
"epoch": 0.9812023894338795,
"grad_norm": 0.29486187077568676,
"learning_rate": 3.925099889795404e-06,
"loss": 0.5791,
"step": 1119
},
{
"epoch": 0.9820792459034362,
"grad_norm": 0.27626862187200796,
"learning_rate": 3.9231555880201655e-06,
"loss": 0.5758,
"step": 1120
},
{
"epoch": 0.9829561023729928,
"grad_norm": 0.2709394700881976,
"learning_rate": 3.9212100119324704e-06,
"loss": 0.5725,
"step": 1121
},
{
"epoch": 0.9838329588425495,
"grad_norm": 0.257787971984586,
"learning_rate": 3.919263163274416e-06,
"loss": 0.5733,
"step": 1122
},
{
"epoch": 0.9847098153121061,
"grad_norm": 0.2854496376494655,
"learning_rate": 3.917315043789235e-06,
"loss": 0.5696,
"step": 1123
},
{
"epoch": 0.9855866717816627,
"grad_norm": 0.2566199610678738,
"learning_rate": 3.9153656552212995e-06,
"loss": 0.5813,
"step": 1124
},
{
"epoch": 0.9864635282512194,
"grad_norm": 0.2555880030988225,
"learning_rate": 3.913414999316118e-06,
"loss": 0.5945,
"step": 1125
},
{
"epoch": 0.987340384720776,
"grad_norm": 0.2577195559469773,
"learning_rate": 3.911463077820336e-06,
"loss": 0.5675,
"step": 1126
},
{
"epoch": 0.9882172411903326,
"grad_norm": 0.26851748898394834,
"learning_rate": 3.909509892481726e-06,
"loss": 0.5807,
"step": 1127
},
{
"epoch": 0.9890940976598893,
"grad_norm": 0.2617539578196299,
"learning_rate": 3.907555445049198e-06,
"loss": 0.5684,
"step": 1128
},
{
"epoch": 0.9899709541294459,
"grad_norm": 0.2586839170532308,
"learning_rate": 3.905599737272791e-06,
"loss": 0.5801,
"step": 1129
},
{
"epoch": 0.9908478105990026,
"grad_norm": 0.25049955800874396,
"learning_rate": 3.903642770903671e-06,
"loss": 0.5762,
"step": 1130
},
{
"epoch": 0.9917246670685592,
"grad_norm": 0.27270516361418773,
"learning_rate": 3.901684547694133e-06,
"loss": 0.5878,
"step": 1131
},
{
"epoch": 0.9926015235381158,
"grad_norm": 0.2816673997379789,
"learning_rate": 3.899725069397593e-06,
"loss": 0.5927,
"step": 1132
},
{
"epoch": 0.9934783800076725,
"grad_norm": 0.2679288547921494,
"learning_rate": 3.897764337768597e-06,
"loss": 0.5772,
"step": 1133
},
{
"epoch": 0.9943552364772291,
"grad_norm": 0.27040765991438753,
"learning_rate": 3.895802354562808e-06,
"loss": 0.5623,
"step": 1134
},
{
"epoch": 0.9952320929467857,
"grad_norm": 0.29605913619532825,
"learning_rate": 3.893839121537015e-06,
"loss": 0.5868,
"step": 1135
},
{
"epoch": 0.9961089494163424,
"grad_norm": 0.27461413478738583,
"learning_rate": 3.89187464044912e-06,
"loss": 0.5871,
"step": 1136
},
{
"epoch": 0.996985805885899,
"grad_norm": 0.28648748056684925,
"learning_rate": 3.8899089130581465e-06,
"loss": 0.5753,
"step": 1137
},
{
"epoch": 0.9978626623554557,
"grad_norm": 0.2925165297373746,
"learning_rate": 3.8879419411242335e-06,
"loss": 0.5828,
"step": 1138
},
{
"epoch": 0.9987395188250123,
"grad_norm": 0.29352029461564516,
"learning_rate": 3.885973726408634e-06,
"loss": 0.5842,
"step": 1139
},
{
"epoch": 0.9996163752945689,
"grad_norm": 0.28650442615475913,
"learning_rate": 3.884004270673711e-06,
"loss": 0.5803,
"step": 1140
}
],
"logging_steps": 1,
"max_steps": 3420,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1140,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3818875539947520.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}