{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9968051118210862,
"eval_steps": 59,
"global_step": 234,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004259850905218318,
"grad_norm": 0.42660918831825256,
"learning_rate": 2e-05,
"loss": 1.4194,
"step": 1
},
{
"epoch": 0.004259850905218318,
"eval_loss": 1.3981385231018066,
"eval_runtime": 17.5749,
"eval_samples_per_second": 22.475,
"eval_steps_per_second": 2.845,
"step": 1
},
{
"epoch": 0.008519701810436636,
"grad_norm": 0.38132771849632263,
"learning_rate": 4e-05,
"loss": 1.4291,
"step": 2
},
{
"epoch": 0.012779552715654952,
"grad_norm": 0.4677501916885376,
"learning_rate": 6e-05,
"loss": 1.606,
"step": 3
},
{
"epoch": 0.01703940362087327,
"grad_norm": 0.4839603900909424,
"learning_rate": 8e-05,
"loss": 1.5193,
"step": 4
},
{
"epoch": 0.021299254526091587,
"grad_norm": 0.52900630235672,
"learning_rate": 0.0001,
"loss": 1.7253,
"step": 5
},
{
"epoch": 0.025559105431309903,
"grad_norm": 0.4611320495605469,
"learning_rate": 0.00012,
"loss": 1.4042,
"step": 6
},
{
"epoch": 0.029818956336528223,
"grad_norm": 0.5078997611999512,
"learning_rate": 0.00014,
"loss": 1.8641,
"step": 7
},
{
"epoch": 0.03407880724174654,
"grad_norm": 0.5692968368530273,
"learning_rate": 0.00016,
"loss": 1.0603,
"step": 8
},
{
"epoch": 0.038338658146964855,
"grad_norm": 0.5424911379814148,
"learning_rate": 0.00018,
"loss": 0.9217,
"step": 9
},
{
"epoch": 0.042598509052183174,
"grad_norm": 0.6595712304115295,
"learning_rate": 0.0002,
"loss": 1.0443,
"step": 10
},
{
"epoch": 0.046858359957401494,
"grad_norm": 0.552948534488678,
"learning_rate": 0.00019999016517595753,
"loss": 0.9727,
"step": 11
},
{
"epoch": 0.051118210862619806,
"grad_norm": 0.523713231086731,
"learning_rate": 0.00019996066263830531,
"loss": 1.0042,
"step": 12
},
{
"epoch": 0.055378061767838126,
"grad_norm": 0.3326718807220459,
"learning_rate": 0.0001999114981900887,
"loss": 0.6851,
"step": 13
},
{
"epoch": 0.059637912673056445,
"grad_norm": 0.40246546268463135,
"learning_rate": 0.00019984268150178167,
"loss": 0.6865,
"step": 14
},
{
"epoch": 0.06389776357827476,
"grad_norm": 0.3299888074398041,
"learning_rate": 0.00019975422610938462,
"loss": 0.6413,
"step": 15
},
{
"epoch": 0.06815761448349308,
"grad_norm": 0.321532666683197,
"learning_rate": 0.00019964614941176195,
"loss": 0.6425,
"step": 16
},
{
"epoch": 0.0724174653887114,
"grad_norm": 0.30551549792289734,
"learning_rate": 0.0001995184726672197,
"loss": 0.6573,
"step": 17
},
{
"epoch": 0.07667731629392971,
"grad_norm": 0.3162730932235718,
"learning_rate": 0.00019937122098932428,
"loss": 0.7957,
"step": 18
},
{
"epoch": 0.08093716719914804,
"grad_norm": 0.2646523714065552,
"learning_rate": 0.00019920442334196248,
"loss": 0.6842,
"step": 19
},
{
"epoch": 0.08519701810436635,
"grad_norm": 0.35260164737701416,
"learning_rate": 0.00019901811253364456,
"loss": 0.7045,
"step": 20
},
{
"epoch": 0.08945686900958466,
"grad_norm": 0.36994901299476624,
"learning_rate": 0.00019881232521105089,
"loss": 0.7506,
"step": 21
},
{
"epoch": 0.09371671991480299,
"grad_norm": 0.3145638108253479,
"learning_rate": 0.0001985871018518236,
"loss": 0.6522,
"step": 22
},
{
"epoch": 0.0979765708200213,
"grad_norm": 0.28740495443344116,
"learning_rate": 0.00019834248675660486,
"loss": 0.5763,
"step": 23
},
{
"epoch": 0.10223642172523961,
"grad_norm": 0.29527685046195984,
"learning_rate": 0.00019807852804032305,
"loss": 0.8533,
"step": 24
},
{
"epoch": 0.10649627263045794,
"grad_norm": 0.3023378849029541,
"learning_rate": 0.00019779527762272877,
"loss": 0.738,
"step": 25
},
{
"epoch": 0.11075612353567625,
"grad_norm": 0.2749658524990082,
"learning_rate": 0.00019749279121818235,
"loss": 0.6354,
"step": 26
},
{
"epoch": 0.11501597444089456,
"grad_norm": 0.3914307951927185,
"learning_rate": 0.0001971711283246951,
"loss": 0.8604,
"step": 27
},
{
"epoch": 0.11927582534611289,
"grad_norm": 0.47873714566230774,
"learning_rate": 0.00019683035221222618,
"loss": 0.7972,
"step": 28
},
{
"epoch": 0.1235356762513312,
"grad_norm": 0.22174575924873352,
"learning_rate": 0.0001964705299102376,
"loss": 0.4385,
"step": 29
},
{
"epoch": 0.12779552715654952,
"grad_norm": 0.244963139295578,
"learning_rate": 0.00019609173219450998,
"loss": 0.7168,
"step": 30
},
{
"epoch": 0.13205537806176784,
"grad_norm": 0.32758575677871704,
"learning_rate": 0.0001956940335732209,
"loss": 0.7231,
"step": 31
},
{
"epoch": 0.13631522896698617,
"grad_norm": 0.21992172300815582,
"learning_rate": 0.00019527751227228963,
"loss": 0.662,
"step": 32
},
{
"epoch": 0.14057507987220447,
"grad_norm": 0.2899262309074402,
"learning_rate": 0.0001948422502199903,
"loss": 0.4651,
"step": 33
},
{
"epoch": 0.1448349307774228,
"grad_norm": 0.23878340423107147,
"learning_rate": 0.00019438833303083678,
"loss": 0.5367,
"step": 34
},
{
"epoch": 0.14909478168264112,
"grad_norm": 0.20475314557552338,
"learning_rate": 0.0001939158499887428,
"loss": 0.4024,
"step": 35
},
{
"epoch": 0.15335463258785942,
"grad_norm": 0.25068745017051697,
"learning_rate": 0.00019342489402945998,
"loss": 0.6575,
"step": 36
},
{
"epoch": 0.15761448349307774,
"grad_norm": 0.3811924159526825,
"learning_rate": 0.00019291556172229785,
"loss": 0.6405,
"step": 37
},
{
"epoch": 0.16187433439829607,
"grad_norm": 0.2627577483654022,
"learning_rate": 0.0001923879532511287,
"loss": 0.6961,
"step": 38
},
{
"epoch": 0.16613418530351437,
"grad_norm": 0.32665154337882996,
"learning_rate": 0.00019184217239468212,
"loss": 0.6983,
"step": 39
},
{
"epoch": 0.1703940362087327,
"grad_norm": 0.24597743153572083,
"learning_rate": 0.00019127832650613189,
"loss": 0.54,
"step": 40
},
{
"epoch": 0.17465388711395102,
"grad_norm": 0.2611660361289978,
"learning_rate": 0.00019069652649198005,
"loss": 0.6281,
"step": 41
},
{
"epoch": 0.17891373801916932,
"grad_norm": 0.2969326078891754,
"learning_rate": 0.0001900968867902419,
"loss": 0.6817,
"step": 42
},
{
"epoch": 0.18317358892438765,
"grad_norm": 0.27561935782432556,
"learning_rate": 0.00018947952534793661,
"loss": 0.626,
"step": 43
},
{
"epoch": 0.18743343982960597,
"grad_norm": 0.33468887209892273,
"learning_rate": 0.00018884456359788724,
"loss": 0.7383,
"step": 44
},
{
"epoch": 0.19169329073482427,
"grad_norm": 0.2937297224998474,
"learning_rate": 0.0001881921264348355,
"loss": 0.6972,
"step": 45
},
{
"epoch": 0.1959531416400426,
"grad_norm": 0.33218011260032654,
"learning_rate": 0.00018752234219087538,
"loss": 0.6749,
"step": 46
},
{
"epoch": 0.20021299254526093,
"grad_norm": 0.2661404311656952,
"learning_rate": 0.00018683534261021057,
"loss": 0.4882,
"step": 47
},
{
"epoch": 0.20447284345047922,
"grad_norm": 0.2451002150774002,
"learning_rate": 0.00018613126282324092,
"loss": 0.637,
"step": 48
},
{
"epoch": 0.20873269435569755,
"grad_norm": 0.27517661452293396,
"learning_rate": 0.00018541024131998274,
"loss": 0.5483,
"step": 49
},
{
"epoch": 0.21299254526091588,
"grad_norm": 0.24373459815979004,
"learning_rate": 0.00018467241992282843,
"loss": 0.5112,
"step": 50
},
{
"epoch": 0.21725239616613418,
"grad_norm": 0.3239864408969879,
"learning_rate": 0.00018391794375865024,
"loss": 0.8005,
"step": 51
},
{
"epoch": 0.2215122470713525,
"grad_norm": 0.29262682795524597,
"learning_rate": 0.00018314696123025454,
"loss": 0.6769,
"step": 52
},
{
"epoch": 0.22577209797657083,
"grad_norm": 0.28277888894081116,
"learning_rate": 0.00018235962398719147,
"loss": 0.6892,
"step": 53
},
{
"epoch": 0.23003194888178913,
"grad_norm": 0.41741546988487244,
"learning_rate": 0.00018155608689592604,
"loss": 0.6763,
"step": 54
},
{
"epoch": 0.23429179978700745,
"grad_norm": 0.2734082043170929,
"learning_rate": 0.00018073650800937624,
"loss": 0.697,
"step": 55
},
{
"epoch": 0.23855165069222578,
"grad_norm": 0.2646290957927704,
"learning_rate": 0.00017990104853582493,
"loss": 0.5936,
"step": 56
},
{
"epoch": 0.24281150159744408,
"grad_norm": 0.27723610401153564,
"learning_rate": 0.00017904987280721035,
"loss": 0.5875,
"step": 57
},
{
"epoch": 0.2470713525026624,
"grad_norm": 0.2668153643608093,
"learning_rate": 0.000178183148246803,
"loss": 0.5219,
"step": 58
},
{
"epoch": 0.25133120340788073,
"grad_norm": 0.29033368825912476,
"learning_rate": 0.0001773010453362737,
"loss": 0.5997,
"step": 59
},
{
"epoch": 0.25133120340788073,
"eval_loss": 0.5784963965415955,
"eval_runtime": 17.4317,
"eval_samples_per_second": 22.66,
"eval_steps_per_second": 2.868,
"step": 59
},
{
"epoch": 0.25559105431309903,
"grad_norm": 0.2783537209033966,
"learning_rate": 0.00017640373758216077,
"loss": 0.483,
"step": 60
},
{
"epoch": 0.2598509052183174,
"grad_norm": 0.31082215905189514,
"learning_rate": 0.0001754914014817416,
"loss": 0.6473,
"step": 61
},
{
"epoch": 0.2641107561235357,
"grad_norm": 0.3206618130207062,
"learning_rate": 0.00017456421648831655,
"loss": 0.6289,
"step": 62
},
{
"epoch": 0.268370607028754,
"grad_norm": 0.2875254154205322,
"learning_rate": 0.00017362236497591094,
"loss": 0.594,
"step": 63
},
{
"epoch": 0.27263045793397234,
"grad_norm": 0.22950579226016998,
"learning_rate": 0.0001726660322034027,
"loss": 0.3886,
"step": 64
},
{
"epoch": 0.27689030883919064,
"grad_norm": 0.24293649196624756,
"learning_rate": 0.00017169540627808274,
"loss": 0.6129,
"step": 65
},
{
"epoch": 0.28115015974440893,
"grad_norm": 0.2611636519432068,
"learning_rate": 0.00017071067811865476,
"loss": 0.6891,
"step": 66
},
{
"epoch": 0.2854100106496273,
"grad_norm": 0.284407377243042,
"learning_rate": 0.00016971204141768233,
"loss": 0.516,
"step": 67
},
{
"epoch": 0.2896698615548456,
"grad_norm": 0.21485944092273712,
"learning_rate": 0.00016869969260349018,
"loss": 0.3826,
"step": 68
},
{
"epoch": 0.2939297124600639,
"grad_norm": 0.29337963461875916,
"learning_rate": 0.00016767383080152742,
"loss": 0.5696,
"step": 69
},
{
"epoch": 0.29818956336528224,
"grad_norm": 0.27099764347076416,
"learning_rate": 0.0001666346577952004,
"loss": 0.4708,
"step": 70
},
{
"epoch": 0.30244941427050054,
"grad_norm": 0.29055824875831604,
"learning_rate": 0.00016558237798618245,
"loss": 0.5844,
"step": 71
},
{
"epoch": 0.30670926517571884,
"grad_norm": 0.22874757647514343,
"learning_rate": 0.00016451719835420877,
"loss": 0.4412,
"step": 72
},
{
"epoch": 0.3109691160809372,
"grad_norm": 0.2926221489906311,
"learning_rate": 0.00016343932841636456,
"loss": 0.5757,
"step": 73
},
{
"epoch": 0.3152289669861555,
"grad_norm": 0.30070438981056213,
"learning_rate": 0.00016234898018587337,
"loss": 0.6063,
"step": 74
},
{
"epoch": 0.3194888178913738,
"grad_norm": 0.2475481927394867,
"learning_rate": 0.00016124636813039502,
"loss": 0.5056,
"step": 75
},
{
"epoch": 0.32374866879659214,
"grad_norm": 0.2851349711418152,
"learning_rate": 0.00016013170912984058,
"loss": 0.7547,
"step": 76
},
{
"epoch": 0.32800851970181044,
"grad_norm": 0.25569260120391846,
"learning_rate": 0.00015900522243371282,
"loss": 0.5168,
"step": 77
},
{
"epoch": 0.33226837060702874,
"grad_norm": 0.3774610757827759,
"learning_rate": 0.0001578671296179806,
"loss": 0.6691,
"step": 78
},
{
"epoch": 0.3365282215122471,
"grad_norm": 0.2339468151330948,
"learning_rate": 0.00015671765454149559,
"loss": 0.5021,
"step": 79
},
{
"epoch": 0.3407880724174654,
"grad_norm": 0.3066350519657135,
"learning_rate": 0.00015555702330196023,
"loss": 0.6838,
"step": 80
},
{
"epoch": 0.3450479233226837,
"grad_norm": 0.271908164024353,
"learning_rate": 0.00015438546419145488,
"loss": 0.4837,
"step": 81
},
{
"epoch": 0.34930777422790205,
"grad_norm": 0.304290771484375,
"learning_rate": 0.00015320320765153367,
"loss": 0.6768,
"step": 82
},
{
"epoch": 0.35356762513312034,
"grad_norm": 0.25685280561447144,
"learning_rate": 0.00015201048622789747,
"loss": 0.4335,
"step": 83
},
{
"epoch": 0.35782747603833864,
"grad_norm": 0.3003567159175873,
"learning_rate": 0.00015080753452465296,
"loss": 0.5836,
"step": 84
},
{
"epoch": 0.362087326943557,
"grad_norm": 0.2585873007774353,
"learning_rate": 0.0001495945891581668,
"loss": 0.5391,
"step": 85
},
{
"epoch": 0.3663471778487753,
"grad_norm": 0.30791282653808594,
"learning_rate": 0.000148371888710524,
"loss": 0.5103,
"step": 86
},
{
"epoch": 0.3706070287539936,
"grad_norm": 0.23016773164272308,
"learning_rate": 0.0001471396736825998,
"loss": 0.4269,
"step": 87
},
{
"epoch": 0.37486687965921195,
"grad_norm": 0.3137454390525818,
"learning_rate": 0.00014589818644675378,
"loss": 0.5116,
"step": 88
},
{
"epoch": 0.37912673056443025,
"grad_norm": 0.28078484535217285,
"learning_rate": 0.00014464767119915629,
"loss": 0.4388,
"step": 89
},
{
"epoch": 0.38338658146964855,
"grad_norm": 0.3163893222808838,
"learning_rate": 0.00014338837391175582,
"loss": 0.6122,
"step": 90
},
{
"epoch": 0.3876464323748669,
"grad_norm": 0.34674668312072754,
"learning_rate": 0.0001421205422838971,
"loss": 0.7114,
"step": 91
},
{
"epoch": 0.3919062832800852,
"grad_norm": 0.2210942953824997,
"learning_rate": 0.00014084442569359964,
"loss": 0.3351,
"step": 92
},
{
"epoch": 0.3961661341853035,
"grad_norm": 0.30586308240890503,
"learning_rate": 0.0001395602751485059,
"loss": 0.4845,
"step": 93
},
{
"epoch": 0.40042598509052185,
"grad_norm": 0.2695784568786621,
"learning_rate": 0.000138268343236509,
"loss": 0.4992,
"step": 94
},
{
"epoch": 0.40468583599574015,
"grad_norm": 0.2989813983440399,
"learning_rate": 0.00013696888407606952,
"loss": 0.585,
"step": 95
},
{
"epoch": 0.40894568690095845,
"grad_norm": 0.2759920656681061,
"learning_rate": 0.0001356621532662313,
"loss": 0.4492,
"step": 96
},
{
"epoch": 0.4132055378061768,
"grad_norm": 0.33117353916168213,
"learning_rate": 0.0001343484078363461,
"loss": 0.5606,
"step": 97
},
{
"epoch": 0.4174653887113951,
"grad_norm": 0.24572253227233887,
"learning_rate": 0.00013302790619551674,
"loss": 0.3261,
"step": 98
},
{
"epoch": 0.4217252396166134,
"grad_norm": 0.322480171918869,
"learning_rate": 0.00013170090808176883,
"loss": 0.5527,
"step": 99
},
{
"epoch": 0.42598509052183176,
"grad_norm": 0.3101179301738739,
"learning_rate": 0.00013036767451096148,
"loss": 0.5419,
"step": 100
},
{
"epoch": 0.43024494142705005,
"grad_norm": 0.3218703269958496,
"learning_rate": 0.00012902846772544624,
"loss": 0.5441,
"step": 101
},
{
"epoch": 0.43450479233226835,
"grad_norm": 0.26214686036109924,
"learning_rate": 0.00012768355114248494,
"loss": 0.5388,
"step": 102
},
{
"epoch": 0.4387646432374867,
"grad_norm": 0.421612411737442,
"learning_rate": 0.00012633318930243648,
"loss": 0.7557,
"step": 103
},
{
"epoch": 0.443024494142705,
"grad_norm": 0.5120344758033752,
"learning_rate": 0.0001249776478167227,
"loss": 0.7028,
"step": 104
},
{
"epoch": 0.4472843450479233,
"grad_norm": 0.27614736557006836,
"learning_rate": 0.00012361719331558345,
"loss": 0.3954,
"step": 105
},
{
"epoch": 0.45154419595314166,
"grad_norm": 0.269520103931427,
"learning_rate": 0.00012225209339563145,
"loss": 0.4851,
"step": 106
},
{
"epoch": 0.45580404685835996,
"grad_norm": 0.2739225924015045,
"learning_rate": 0.000120882616567217,
"loss": 0.4907,
"step": 107
},
{
"epoch": 0.46006389776357826,
"grad_norm": 0.33920663595199585,
"learning_rate": 0.00011950903220161285,
"loss": 0.6288,
"step": 108
},
{
"epoch": 0.4643237486687966,
"grad_norm": 0.279832124710083,
"learning_rate": 0.00011813161047802985,
"loss": 0.447,
"step": 109
},
{
"epoch": 0.4685835995740149,
"grad_norm": 0.31790605187416077,
"learning_rate": 0.00011675062233047364,
"loss": 0.5933,
"step": 110
},
{
"epoch": 0.4728434504792332,
"grad_norm": 0.24926939606666565,
"learning_rate": 0.000115366339394453,
"loss": 0.4061,
"step": 111
},
{
"epoch": 0.47710330138445156,
"grad_norm": 0.3327280282974243,
"learning_rate": 0.00011397903395354996,
"loss": 0.484,
"step": 112
},
{
"epoch": 0.48136315228966986,
"grad_norm": 0.37822094559669495,
"learning_rate": 0.00011258897888586255,
"loss": 0.6416,
"step": 113
},
{
"epoch": 0.48562300319488816,
"grad_norm": 0.35605669021606445,
"learning_rate": 0.00011119644761033078,
"loss": 0.6136,
"step": 114
},
{
"epoch": 0.4898828541001065,
"grad_norm": 0.3513132929801941,
"learning_rate": 0.0001098017140329561,
"loss": 0.6299,
"step": 115
},
{
"epoch": 0.4941427050053248,
"grad_norm": 0.3040708899497986,
"learning_rate": 0.00010840505249292476,
"loss": 0.4658,
"step": 116
},
{
"epoch": 0.4984025559105431,
"grad_norm": 0.19006308913230896,
"learning_rate": 0.00010700673770864673,
"loss": 0.2694,
"step": 117
},
{
"epoch": 0.5026624068157615,
"grad_norm": 0.30643633008003235,
"learning_rate": 0.00010560704472371919,
"loss": 0.4492,
"step": 118
},
{
"epoch": 0.5026624068157615,
"eval_loss": 0.5326976180076599,
"eval_runtime": 17.5872,
"eval_samples_per_second": 22.46,
"eval_steps_per_second": 2.843,
"step": 118
},
{
"epoch": 0.5069222577209798,
"grad_norm": 0.3698013722896576,
"learning_rate": 0.00010420624885282653,
"loss": 0.6993,
"step": 119
},
{
"epoch": 0.5111821086261981,
"grad_norm": 0.2801634967327118,
"learning_rate": 0.0001028046256275869,
"loss": 0.4059,
"step": 120
},
{
"epoch": 0.5154419595314164,
"grad_norm": 0.2864643931388855,
"learning_rate": 0.00010140245074235624,
"loss": 0.5024,
"step": 121
},
{
"epoch": 0.5197018104366348,
"grad_norm": 0.30105265974998474,
"learning_rate": 0.0001,
"loss": 0.6774,
"step": 122
},
{
"epoch": 0.5239616613418531,
"grad_norm": 0.39152050018310547,
"learning_rate": 9.859754925764378e-05,
"loss": 0.625,
"step": 123
},
{
"epoch": 0.5282215122470714,
"grad_norm": 0.3618883192539215,
"learning_rate": 9.719537437241312e-05,
"loss": 0.6978,
"step": 124
},
{
"epoch": 0.5324813631522897,
"grad_norm": 0.23670899868011475,
"learning_rate": 9.579375114717351e-05,
"loss": 0.3379,
"step": 125
},
{
"epoch": 0.536741214057508,
"grad_norm": 0.3124864101409912,
"learning_rate": 9.439295527628081e-05,
"loss": 0.525,
"step": 126
},
{
"epoch": 0.5410010649627263,
"grad_norm": 0.3667398989200592,
"learning_rate": 9.299326229135326e-05,
"loss": 0.6164,
"step": 127
},
{
"epoch": 0.5452609158679447,
"grad_norm": 0.2894105613231659,
"learning_rate": 9.159494750707526e-05,
"loss": 0.4335,
"step": 128
},
{
"epoch": 0.549520766773163,
"grad_norm": 0.30680200457572937,
"learning_rate": 9.019828596704394e-05,
"loss": 0.4507,
"step": 129
},
{
"epoch": 0.5537806176783813,
"grad_norm": 0.3676758110523224,
"learning_rate": 8.880355238966923e-05,
"loss": 0.5955,
"step": 130
},
{
"epoch": 0.5580404685835996,
"grad_norm": 0.3194178342819214,
"learning_rate": 8.741102111413748e-05,
"loss": 0.5675,
"step": 131
},
{
"epoch": 0.5623003194888179,
"grad_norm": 0.29750558733940125,
"learning_rate": 8.602096604645009e-05,
"loss": 0.5785,
"step": 132
},
{
"epoch": 0.5665601703940362,
"grad_norm": 0.37204545736312866,
"learning_rate": 8.463366060554698e-05,
"loss": 0.612,
"step": 133
},
{
"epoch": 0.5708200212992546,
"grad_norm": 0.36891940236091614,
"learning_rate": 8.324937766952638e-05,
"loss": 0.5463,
"step": 134
},
{
"epoch": 0.5750798722044729,
"grad_norm": 0.2863575518131256,
"learning_rate": 8.186838952197018e-05,
"loss": 0.4884,
"step": 135
},
{
"epoch": 0.5793397231096912,
"grad_norm": 0.354523241519928,
"learning_rate": 8.049096779838719e-05,
"loss": 0.7727,
"step": 136
},
{
"epoch": 0.5835995740149095,
"grad_norm": 0.30339759588241577,
"learning_rate": 7.911738343278304e-05,
"loss": 0.5543,
"step": 137
},
{
"epoch": 0.5878594249201278,
"grad_norm": 0.27778202295303345,
"learning_rate": 7.774790660436858e-05,
"loss": 0.4716,
"step": 138
},
{
"epoch": 0.5921192758253461,
"grad_norm": 0.38618960976600647,
"learning_rate": 7.63828066844166e-05,
"loss": 0.6519,
"step": 139
},
{
"epoch": 0.5963791267305645,
"grad_norm": 0.3573627769947052,
"learning_rate": 7.502235218327731e-05,
"loss": 0.5128,
"step": 140
},
{
"epoch": 0.6006389776357828,
"grad_norm": 0.30529165267944336,
"learning_rate": 7.366681069756352e-05,
"loss": 0.5184,
"step": 141
},
{
"epoch": 0.6048988285410011,
"grad_norm": 0.2819828987121582,
"learning_rate": 7.231644885751507e-05,
"loss": 0.4259,
"step": 142
},
{
"epoch": 0.6091586794462194,
"grad_norm": 0.32307252287864685,
"learning_rate": 7.097153227455379e-05,
"loss": 0.6048,
"step": 143
},
{
"epoch": 0.6134185303514377,
"grad_norm": 0.31262722611427307,
"learning_rate": 6.963232548903853e-05,
"loss": 0.4834,
"step": 144
},
{
"epoch": 0.617678381256656,
"grad_norm": 0.318851500749588,
"learning_rate": 6.829909191823121e-05,
"loss": 0.5011,
"step": 145
},
{
"epoch": 0.6219382321618744,
"grad_norm": 0.44246405363082886,
"learning_rate": 6.697209380448333e-05,
"loss": 0.4384,
"step": 146
},
{
"epoch": 0.6261980830670927,
"grad_norm": 0.3459945023059845,
"learning_rate": 6.565159216365389e-05,
"loss": 0.5657,
"step": 147
},
{
"epoch": 0.630457933972311,
"grad_norm": 0.33843329548835754,
"learning_rate": 6.43378467337687e-05,
"loss": 0.5711,
"step": 148
},
{
"epoch": 0.6347177848775293,
"grad_norm": 0.3812694549560547,
"learning_rate": 6.30311159239305e-05,
"loss": 0.6142,
"step": 149
},
{
"epoch": 0.6389776357827476,
"grad_norm": 0.29333916306495667,
"learning_rate": 6.173165676349103e-05,
"loss": 0.585,
"step": 150
},
{
"epoch": 0.6432374866879659,
"grad_norm": 0.2884041666984558,
"learning_rate": 6.043972485149414e-05,
"loss": 0.4866,
"step": 151
},
{
"epoch": 0.6474973375931843,
"grad_norm": 0.33954814076423645,
"learning_rate": 5.9155574306400395e-05,
"loss": 0.571,
"step": 152
},
{
"epoch": 0.6517571884984026,
"grad_norm": 0.33935782313346863,
"learning_rate": 5.787945771610296e-05,
"loss": 0.5037,
"step": 153
},
{
"epoch": 0.6560170394036209,
"grad_norm": 0.27371054887771606,
"learning_rate": 5.6611626088244194e-05,
"loss": 0.3322,
"step": 154
},
{
"epoch": 0.6602768903088392,
"grad_norm": 0.30788496136665344,
"learning_rate": 5.5352328800843724e-05,
"loss": 0.4454,
"step": 155
},
{
"epoch": 0.6645367412140575,
"grad_norm": 0.34366151690483093,
"learning_rate": 5.410181355324622e-05,
"loss": 0.5788,
"step": 156
},
{
"epoch": 0.6687965921192758,
"grad_norm": 0.33698371052742004,
"learning_rate": 5.286032631740023e-05,
"loss": 0.4378,
"step": 157
},
{
"epoch": 0.6730564430244942,
"grad_norm": 0.4181162416934967,
"learning_rate": 5.162811128947602e-05,
"loss": 0.5367,
"step": 158
},
{
"epoch": 0.6773162939297125,
"grad_norm": 0.4480881690979004,
"learning_rate": 5.0405410841833253e-05,
"loss": 0.6633,
"step": 159
},
{
"epoch": 0.6815761448349308,
"grad_norm": 0.37488028407096863,
"learning_rate": 4.919246547534708e-05,
"loss": 0.5402,
"step": 160
},
{
"epoch": 0.6858359957401491,
"grad_norm": 0.2964366376399994,
"learning_rate": 4.7989513772102537e-05,
"loss": 0.4109,
"step": 161
},
{
"epoch": 0.6900958466453674,
"grad_norm": 0.35376259684562683,
"learning_rate": 4.6796792348466356e-05,
"loss": 0.636,
"step": 162
},
{
"epoch": 0.6943556975505857,
"grad_norm": 0.3158915638923645,
"learning_rate": 4.561453580854516e-05,
"loss": 0.4893,
"step": 163
},
{
"epoch": 0.6986155484558041,
"grad_norm": 0.420785516500473,
"learning_rate": 4.444297669803981e-05,
"loss": 0.7147,
"step": 164
},
{
"epoch": 0.7028753993610224,
"grad_norm": 0.3272782564163208,
"learning_rate": 4.328234545850442e-05,
"loss": 0.3444,
"step": 165
},
{
"epoch": 0.7071352502662407,
"grad_norm": 0.30052492022514343,
"learning_rate": 4.213287038201943e-05,
"loss": 0.5209,
"step": 166
},
{
"epoch": 0.711395101171459,
"grad_norm": 0.37648481130599976,
"learning_rate": 4.0994777566287204e-05,
"loss": 0.684,
"step": 167
},
{
"epoch": 0.7156549520766773,
"grad_norm": 0.3135606646537781,
"learning_rate": 3.9868290870159405e-05,
"loss": 0.4871,
"step": 168
},
{
"epoch": 0.7199148029818956,
"grad_norm": 0.33847576379776,
"learning_rate": 3.875363186960499e-05,
"loss": 0.5294,
"step": 169
},
{
"epoch": 0.724174653887114,
"grad_norm": 0.3337070047855377,
"learning_rate": 3.7651019814126654e-05,
"loss": 0.4425,
"step": 170
},
{
"epoch": 0.7284345047923323,
"grad_norm": 0.4173165261745453,
"learning_rate": 3.6560671583635467e-05,
"loss": 0.637,
"step": 171
},
{
"epoch": 0.7326943556975506,
"grad_norm": 0.41098451614379883,
"learning_rate": 3.548280164579126e-05,
"loss": 0.52,
"step": 172
},
{
"epoch": 0.7369542066027689,
"grad_norm": 0.3789665699005127,
"learning_rate": 3.4417622013817595e-05,
"loss": 0.5995,
"step": 173
},
{
"epoch": 0.7412140575079872,
"grad_norm": 0.3996846675872803,
"learning_rate": 3.336534220479961e-05,
"loss": 0.6237,
"step": 174
},
{
"epoch": 0.7454739084132055,
"grad_norm": 0.3990687131881714,
"learning_rate": 3.2326169198472556e-05,
"loss": 0.555,
"step": 175
},
{
"epoch": 0.7497337593184239,
"grad_norm": 0.32280924916267395,
"learning_rate": 3.130030739650983e-05,
"loss": 0.4742,
"step": 176
},
{
"epoch": 0.7539936102236422,
"grad_norm": 0.4192362129688263,
"learning_rate": 3.0287958582317676e-05,
"loss": 0.6569,
"step": 177
},
{
"epoch": 0.7539936102236422,
"eval_loss": 0.5110090970993042,
"eval_runtime": 20.8066,
"eval_samples_per_second": 18.984,
"eval_steps_per_second": 2.403,
"step": 177
},
{
"epoch": 0.7582534611288605,
"grad_norm": 0.35410746932029724,
"learning_rate": 2.9289321881345254e-05,
"loss": 0.4828,
"step": 178
},
{
"epoch": 0.7625133120340788,
"grad_norm": 0.4463326036930084,
"learning_rate": 2.8304593721917285e-05,
"loss": 0.6976,
"step": 179
},
{
"epoch": 0.7667731629392971,
"grad_norm": 0.29797378182411194,
"learning_rate": 2.7333967796597315e-05,
"loss": 0.564,
"step": 180
},
{
"epoch": 0.7710330138445154,
"grad_norm": 0.31337812542915344,
"learning_rate": 2.6377635024089087e-05,
"loss": 0.5607,
"step": 181
},
{
"epoch": 0.7752928647497338,
"grad_norm": 0.40470513701438904,
"learning_rate": 2.5435783511683443e-05,
"loss": 0.6428,
"step": 182
},
{
"epoch": 0.7795527156549521,
"grad_norm": 0.413817822933197,
"learning_rate": 2.450859851825842e-05,
"loss": 0.7303,
"step": 183
},
{
"epoch": 0.7838125665601704,
"grad_norm": 0.2931414842605591,
"learning_rate": 2.3596262417839255e-05,
"loss": 0.4051,
"step": 184
},
{
"epoch": 0.7880724174653887,
"grad_norm": 0.34086865186691284,
"learning_rate": 2.26989546637263e-05,
"loss": 0.4329,
"step": 185
},
{
"epoch": 0.792332268370607,
"grad_norm": 0.40336307883262634,
"learning_rate": 2.181685175319702e-05,
"loss": 0.5791,
"step": 186
},
{
"epoch": 0.7965921192758253,
"grad_norm": 0.30092838406562805,
"learning_rate": 2.095012719278966e-05,
"loss": 0.4491,
"step": 187
},
{
"epoch": 0.8008519701810437,
"grad_norm": 0.31043168902397156,
"learning_rate": 2.009895146417512e-05,
"loss": 0.4681,
"step": 188
},
{
"epoch": 0.805111821086262,
"grad_norm": 0.3712119162082672,
"learning_rate": 1.926349199062376e-05,
"loss": 0.549,
"step": 189
},
{
"epoch": 0.8093716719914803,
"grad_norm": 0.3679051995277405,
"learning_rate": 1.8443913104073983e-05,
"loss": 0.4971,
"step": 190
},
{
"epoch": 0.8136315228966986,
"grad_norm": 0.3244669735431671,
"learning_rate": 1.7640376012808536e-05,
"loss": 0.4732,
"step": 191
},
{
"epoch": 0.8178913738019169,
"grad_norm": 0.28653696179389954,
"learning_rate": 1.6853038769745467e-05,
"loss": 0.3469,
"step": 192
},
{
"epoch": 0.8221512247071352,
"grad_norm": 0.3144218325614929,
"learning_rate": 1.6082056241349786e-05,
"loss": 0.5127,
"step": 193
},
{
"epoch": 0.8264110756123536,
"grad_norm": 0.3801470994949341,
"learning_rate": 1.5327580077171587e-05,
"loss": 0.5178,
"step": 194
},
{
"epoch": 0.8306709265175719,
"grad_norm": 0.37223386764526367,
"learning_rate": 1.4589758680017263e-05,
"loss": 0.5114,
"step": 195
},
{
"epoch": 0.8349307774227902,
"grad_norm": 0.4167802333831787,
"learning_rate": 1.3868737176759106e-05,
"loss": 0.6949,
"step": 196
},
{
"epoch": 0.8391906283280085,
"grad_norm": 0.620794951915741,
"learning_rate": 1.3164657389789458e-05,
"loss": 0.7015,
"step": 197
},
{
"epoch": 0.8434504792332268,
"grad_norm": 0.32053133845329285,
"learning_rate": 1.2477657809124631e-05,
"loss": 0.5328,
"step": 198
},
{
"epoch": 0.8477103301384451,
"grad_norm": 0.41892528533935547,
"learning_rate": 1.1807873565164506e-05,
"loss": 0.5929,
"step": 199
},
{
"epoch": 0.8519701810436635,
"grad_norm": 0.2980664372444153,
"learning_rate": 1.1155436402112785e-05,
"loss": 0.4209,
"step": 200
},
{
"epoch": 0.8562300319488818,
"grad_norm": 0.3290930986404419,
"learning_rate": 1.0520474652063394e-05,
"loss": 0.4423,
"step": 201
},
{
"epoch": 0.8604898828541001,
"grad_norm": 0.3246372640132904,
"learning_rate": 9.903113209758096e-06,
"loss": 0.4631,
"step": 202
},
{
"epoch": 0.8647497337593184,
"grad_norm": 0.3644905388355255,
"learning_rate": 9.303473508019944e-06,
"loss": 0.552,
"step": 203
},
{
"epoch": 0.8690095846645367,
"grad_norm": 0.49974295496940613,
"learning_rate": 8.72167349386811e-06,
"loss": 0.7516,
"step": 204
},
{
"epoch": 0.873269435569755,
"grad_norm": 0.3242340087890625,
"learning_rate": 8.157827605317892e-06,
"loss": 0.412,
"step": 205
},
{
"epoch": 0.8775292864749734,
"grad_norm": 0.33690881729125977,
"learning_rate": 7.612046748871327e-06,
"loss": 0.4841,
"step": 206
},
{
"epoch": 0.8817891373801917,
"grad_norm": 0.3246766924858093,
"learning_rate": 7.084438277702188e-06,
"loss": 0.4341,
"step": 207
},
{
"epoch": 0.88604898828541,
"grad_norm": 0.4262131452560425,
"learning_rate": 6.5751059705400295e-06,
"loss": 0.6306,
"step": 208
},
{
"epoch": 0.8903088391906283,
"grad_norm": 0.32158687710762024,
"learning_rate": 6.084150011257239e-06,
"loss": 0.4687,
"step": 209
},
{
"epoch": 0.8945686900958466,
"grad_norm": 0.377208948135376,
"learning_rate": 5.611666969163243e-06,
"loss": 0.5849,
"step": 210
},
{
"epoch": 0.898828541001065,
"grad_norm": 0.30956804752349854,
"learning_rate": 5.157749780009735e-06,
"loss": 0.4355,
"step": 211
},
{
"epoch": 0.9030883919062833,
"grad_norm": 0.4885202944278717,
"learning_rate": 4.722487727710368e-06,
"loss": 0.6129,
"step": 212
},
{
"epoch": 0.9073482428115016,
"grad_norm": 0.3384571075439453,
"learning_rate": 4.305966426779118e-06,
"loss": 0.4345,
"step": 213
},
{
"epoch": 0.9116080937167199,
"grad_norm": 0.4629303514957428,
"learning_rate": 3.908267805490051e-06,
"loss": 0.6158,
"step": 214
},
{
"epoch": 0.9158679446219382,
"grad_norm": 0.3206894099712372,
"learning_rate": 3.529470089762421e-06,
"loss": 0.4689,
"step": 215
},
{
"epoch": 0.9201277955271565,
"grad_norm": 0.41424816846847534,
"learning_rate": 3.169647787773866e-06,
"loss": 0.5235,
"step": 216
},
{
"epoch": 0.9243876464323749,
"grad_norm": 0.3189912736415863,
"learning_rate": 2.8288716753049005e-06,
"loss": 0.4262,
"step": 217
},
{
"epoch": 0.9286474973375932,
"grad_norm": 0.3202993869781494,
"learning_rate": 2.5072087818176382e-06,
"loss": 0.493,
"step": 218
},
{
"epoch": 0.9329073482428115,
"grad_norm": 0.32974228262901306,
"learning_rate": 2.20472237727124e-06,
"loss": 0.5512,
"step": 219
},
{
"epoch": 0.9371671991480298,
"grad_norm": 0.27346375584602356,
"learning_rate": 1.921471959676957e-06,
"loss": 0.4154,
"step": 220
},
{
"epoch": 0.9414270500532481,
"grad_norm": 0.29039615392684937,
"learning_rate": 1.657513243395159e-06,
"loss": 0.4367,
"step": 221
},
{
"epoch": 0.9456869009584664,
"grad_norm": 0.3864074647426605,
"learning_rate": 1.4128981481764115e-06,
"loss": 0.5531,
"step": 222
},
{
"epoch": 0.9499467518636848,
"grad_norm": 0.4020756185054779,
"learning_rate": 1.1876747889491223e-06,
"loss": 0.6631,
"step": 223
},
{
"epoch": 0.9542066027689031,
"grad_norm": 0.37475478649139404,
"learning_rate": 9.818874663554357e-07,
"loss": 0.4979,
"step": 224
},
{
"epoch": 0.9584664536741214,
"grad_norm": 0.3484041690826416,
"learning_rate": 7.955766580375335e-07,
"loss": 0.5207,
"step": 225
},
{
"epoch": 0.9627263045793397,
"grad_norm": 0.3385999798774719,
"learning_rate": 6.287790106757396e-07,
"loss": 0.4832,
"step": 226
},
{
"epoch": 0.966986155484558,
"grad_norm": 0.27909693121910095,
"learning_rate": 4.815273327803182e-07,
"loss": 0.3506,
"step": 227
},
{
"epoch": 0.9712460063897763,
"grad_norm": 0.34196606278419495,
"learning_rate": 3.5385058823809156e-07,
"loss": 0.5283,
"step": 228
},
{
"epoch": 0.9755058572949947,
"grad_norm": 0.39571547508239746,
"learning_rate": 2.457738906153972e-07,
"loss": 0.5099,
"step": 229
},
{
"epoch": 0.979765708200213,
"grad_norm": 0.4107287526130676,
"learning_rate": 1.5731849821833954e-07,
"loss": 0.5042,
"step": 230
},
{
"epoch": 0.9840255591054313,
"grad_norm": 0.3254135251045227,
"learning_rate": 8.850180991131219e-08,
"loss": 0.4929,
"step": 231
},
{
"epoch": 0.9882854100106496,
"grad_norm": 0.2778495252132416,
"learning_rate": 3.933736169471347e-08,
"loss": 0.3571,
"step": 232
},
{
"epoch": 0.9925452609158679,
"grad_norm": 0.4703886806964874,
"learning_rate": 9.834824042498358e-09,
"loss": 0.7013,
"step": 233
},
{
"epoch": 0.9968051118210862,
"grad_norm": 0.3349379599094391,
"learning_rate": 0.0,
"loss": 0.4725,
"step": 234
}
],
"logging_steps": 1,
"max_steps": 234,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1191352754700288e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}