{
"best_metric": 1.6708096265792847,
"best_model_checkpoint": "miner_id_24/checkpoint-250",
"epoch": 1.9065776930409915,
"eval_steps": 25,
"global_step": 250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0076263107721639654,
"grad_norm": 0.9196067452430725,
"learning_rate": 4.285714285714285e-05,
"loss": 3.2672,
"step": 1
},
{
"epoch": 0.0076263107721639654,
"eval_loss": 3.787529230117798,
"eval_runtime": 0.2862,
"eval_samples_per_second": 174.68,
"eval_steps_per_second": 45.417,
"step": 1
},
{
"epoch": 0.015252621544327931,
"grad_norm": 0.8998702168464661,
"learning_rate": 8.57142857142857e-05,
"loss": 3.4083,
"step": 2
},
{
"epoch": 0.022878932316491896,
"grad_norm": 0.9599621295928955,
"learning_rate": 0.00012857142857142855,
"loss": 3.5199,
"step": 3
},
{
"epoch": 0.030505243088655862,
"grad_norm": 0.9405797123908997,
"learning_rate": 0.0001714285714285714,
"loss": 3.5021,
"step": 4
},
{
"epoch": 0.03813155386081983,
"grad_norm": 0.9620326161384583,
"learning_rate": 0.00021428571428571427,
"loss": 3.5165,
"step": 5
},
{
"epoch": 0.04575786463298379,
"grad_norm": 0.9596439003944397,
"learning_rate": 0.0002571428571428571,
"loss": 3.4724,
"step": 6
},
{
"epoch": 0.05338417540514776,
"grad_norm": 0.917304277420044,
"learning_rate": 0.0003,
"loss": 3.4023,
"step": 7
},
{
"epoch": 0.061010486177311724,
"grad_norm": 0.8579983711242676,
"learning_rate": 0.0002999898347482845,
"loss": 3.3249,
"step": 8
},
{
"epoch": 0.06863679694947569,
"grad_norm": 0.8907164335250854,
"learning_rate": 0.00029995934052398757,
"loss": 3.2235,
"step": 9
},
{
"epoch": 0.07626310772163966,
"grad_norm": 0.9348911046981812,
"learning_rate": 0.00029990852191942715,
"loss": 3.1403,
"step": 10
},
{
"epoch": 0.08388941849380362,
"grad_norm": 0.9178524613380432,
"learning_rate": 0.0002998373865876983,
"loss": 3.1472,
"step": 11
},
{
"epoch": 0.09151572926596759,
"grad_norm": 1.1779893636703491,
"learning_rate": 0.0002997459452415201,
"loss": 3.1228,
"step": 12
},
{
"epoch": 0.09914204003813155,
"grad_norm": 1.2700427770614624,
"learning_rate": 0.00029963421165162316,
"loss": 2.8118,
"step": 13
},
{
"epoch": 0.10676835081029552,
"grad_norm": 1.2745684385299683,
"learning_rate": 0.00029950220264467496,
"loss": 2.5948,
"step": 14
},
{
"epoch": 0.11439466158245949,
"grad_norm": 1.0857181549072266,
"learning_rate": 0.0002993499381007466,
"loss": 2.5336,
"step": 15
},
{
"epoch": 0.12202097235462345,
"grad_norm": 0.926810622215271,
"learning_rate": 0.00029917744095031806,
"loss": 2.5163,
"step": 16
},
{
"epoch": 0.12964728312678742,
"grad_norm": 0.7827774882316589,
"learning_rate": 0.0002989847371708258,
"loss": 2.4039,
"step": 17
},
{
"epoch": 0.13727359389895138,
"grad_norm": 0.6036872863769531,
"learning_rate": 0.00029877185578275025,
"loss": 2.3962,
"step": 18
},
{
"epoch": 0.14489990467111535,
"grad_norm": 0.8695657253265381,
"learning_rate": 0.0002985388288452454,
"loss": 2.3643,
"step": 19
},
{
"epoch": 0.15252621544327932,
"grad_norm": 0.9263712763786316,
"learning_rate": 0.0002982856914513109,
"loss": 2.3404,
"step": 20
},
{
"epoch": 0.1601525262154433,
"grad_norm": 0.41602811217308044,
"learning_rate": 0.00029801248172250705,
"loss": 2.3645,
"step": 21
},
{
"epoch": 0.16777883698760723,
"grad_norm": 0.8675795197486877,
"learning_rate": 0.0002977192408032142,
"loss": 2.3502,
"step": 22
},
{
"epoch": 0.1754051477597712,
"grad_norm": 0.9460420608520508,
"learning_rate": 0.0002974060128544361,
"loss": 2.374,
"step": 23
},
{
"epoch": 0.18303145853193517,
"grad_norm": 0.6238827705383301,
"learning_rate": 0.0002970728450471497,
"loss": 2.4411,
"step": 24
},
{
"epoch": 0.19065776930409914,
"grad_norm": 0.6706355214118958,
"learning_rate": 0.0002967197875552013,
"loss": 2.5664,
"step": 25
},
{
"epoch": 0.19065776930409914,
"eval_loss": 2.3339245319366455,
"eval_runtime": 0.2802,
"eval_samples_per_second": 178.467,
"eval_steps_per_second": 46.402,
"step": 25
},
{
"epoch": 0.1982840800762631,
"grad_norm": 1.384001612663269,
"learning_rate": 0.0002963468935477506,
"loss": 2.1595,
"step": 26
},
{
"epoch": 0.20591039084842708,
"grad_norm": 0.8498388528823853,
"learning_rate": 0.00029595421918126344,
"loss": 2.109,
"step": 27
},
{
"epoch": 0.21353670162059105,
"grad_norm": 0.3540984094142914,
"learning_rate": 0.00029554182359105497,
"loss": 2.1542,
"step": 28
},
{
"epoch": 0.22116301239275502,
"grad_norm": 0.6427699327468872,
"learning_rate": 0.00029510976888238435,
"loss": 2.1396,
"step": 29
},
{
"epoch": 0.22878932316491898,
"grad_norm": 0.9017099738121033,
"learning_rate": 0.0002946581201211013,
"loss": 2.1525,
"step": 30
},
{
"epoch": 0.23641563393708293,
"grad_norm": 0.5602626204490662,
"learning_rate": 0.00029418694532384816,
"loss": 2.1099,
"step": 31
},
{
"epoch": 0.2440419447092469,
"grad_norm": 0.35870033502578735,
"learning_rate": 0.0002936963154478161,
"loss": 2.073,
"step": 32
},
{
"epoch": 0.25166825548141086,
"grad_norm": 0.4111023545265198,
"learning_rate": 0.0002931863043800599,
"loss": 2.1075,
"step": 33
},
{
"epoch": 0.25929456625357483,
"grad_norm": 0.5750699043273926,
"learning_rate": 0.00029265698892637034,
"loss": 2.0506,
"step": 34
},
{
"epoch": 0.2669208770257388,
"grad_norm": 0.687319278717041,
"learning_rate": 0.00029210844879970775,
"loss": 2.154,
"step": 35
},
{
"epoch": 0.27454718779790277,
"grad_norm": 0.49932411313056946,
"learning_rate": 0.0002915407666081976,
"loss": 2.165,
"step": 36
},
{
"epoch": 0.28217349857006674,
"grad_norm": 0.48046040534973145,
"learning_rate": 0.0002909540278426897,
"loss": 2.1741,
"step": 37
},
{
"epoch": 0.2897998093422307,
"grad_norm": 0.797716498374939,
"learning_rate": 0.0002903483208638841,
"loss": 2.1329,
"step": 38
},
{
"epoch": 0.2974261201143947,
"grad_norm": 0.425375372171402,
"learning_rate": 0.0002897237368890237,
"loss": 1.916,
"step": 39
},
{
"epoch": 0.30505243088655865,
"grad_norm": 0.40925440192222595,
"learning_rate": 0.0002890803699781578,
"loss": 1.904,
"step": 40
},
{
"epoch": 0.3126787416587226,
"grad_norm": 0.575661838054657,
"learning_rate": 0.0002884183170199766,
"loss": 1.9828,
"step": 41
},
{
"epoch": 0.3203050524308866,
"grad_norm": 0.7085393667221069,
"learning_rate": 0.0002877376777172205,
"loss": 1.9275,
"step": 42
},
{
"epoch": 0.3279313632030505,
"grad_norm": 0.4340769648551941,
"learning_rate": 0.00028703855457166483,
"loss": 1.9285,
"step": 43
},
{
"epoch": 0.33555767397521447,
"grad_norm": 0.32672634720802307,
"learning_rate": 0.00028632105286868374,
"loss": 1.9382,
"step": 44
},
{
"epoch": 0.34318398474737843,
"grad_norm": 0.3794914782047272,
"learning_rate": 0.0002855852806613945,
"loss": 1.8947,
"step": 45
},
{
"epoch": 0.3508102955195424,
"grad_norm": 0.37154510617256165,
"learning_rate": 0.00028483134875438527,
"loss": 1.9039,
"step": 46
},
{
"epoch": 0.35843660629170637,
"grad_norm": 0.3726259768009186,
"learning_rate": 0.0002840593706870279,
"loss": 1.9194,
"step": 47
},
{
"epoch": 0.36606291706387034,
"grad_norm": 0.370515912771225,
"learning_rate": 0.00028326946271637986,
"loss": 1.8939,
"step": 48
},
{
"epoch": 0.3736892278360343,
"grad_norm": 0.42510128021240234,
"learning_rate": 0.00028246174379967606,
"loss": 1.9624,
"step": 49
},
{
"epoch": 0.3813155386081983,
"grad_norm": 0.531184732913971,
"learning_rate": 0.0002816363355764142,
"loss": 2.0973,
"step": 50
},
{
"epoch": 0.3813155386081983,
"eval_loss": 1.9253315925598145,
"eval_runtime": 0.2792,
"eval_samples_per_second": 179.063,
"eval_steps_per_second": 46.557,
"step": 50
},
{
"epoch": 0.38894184938036225,
"grad_norm": 1.3231308460235596,
"learning_rate": 0.00028079336235003674,
"loss": 1.8672,
"step": 51
},
{
"epoch": 0.3965681601525262,
"grad_norm": 0.3953985869884491,
"learning_rate": 0.0002799329510692108,
"loss": 1.7821,
"step": 52
},
{
"epoch": 0.4041944709246902,
"grad_norm": 0.34664496779441833,
"learning_rate": 0.0002790552313087104,
"loss": 1.8357,
"step": 53
},
{
"epoch": 0.41182078169685415,
"grad_norm": 0.32160669565200806,
"learning_rate": 0.0002781603352499031,
"loss": 1.8114,
"step": 54
},
{
"epoch": 0.4194470924690181,
"grad_norm": 0.35647809505462646,
"learning_rate": 0.0002772483976608436,
"loss": 1.9044,
"step": 55
},
{
"epoch": 0.4270734032411821,
"grad_norm": 0.5951141715049744,
"learning_rate": 0.0002763195558759784,
"loss": 1.8211,
"step": 56
},
{
"epoch": 0.43469971401334606,
"grad_norm": 0.5895670652389526,
"learning_rate": 0.00027537394977546377,
"loss": 1.8736,
"step": 57
},
{
"epoch": 0.44232602478551003,
"grad_norm": 0.3808022439479828,
"learning_rate": 0.00027441172176410027,
"loss": 1.8487,
"step": 58
},
{
"epoch": 0.449952335557674,
"grad_norm": 0.3616071343421936,
"learning_rate": 0.000273433016749887,
"loss": 1.86,
"step": 59
},
{
"epoch": 0.45757864632983797,
"grad_norm": 0.34098148345947266,
"learning_rate": 0.00027243798212219926,
"loss": 1.8849,
"step": 60
},
{
"epoch": 0.4652049571020019,
"grad_norm": 0.3677496910095215,
"learning_rate": 0.0002714267677295918,
"loss": 1.9066,
"step": 61
},
{
"epoch": 0.47283126787416585,
"grad_norm": 0.42355066537857056,
"learning_rate": 0.0002703995258572327,
"loss": 1.8943,
"step": 62
},
{
"epoch": 0.4804575786463298,
"grad_norm": 0.8111021518707275,
"learning_rate": 0.0002693564112039695,
"loss": 1.8816,
"step": 63
},
{
"epoch": 0.4880838894184938,
"grad_norm": 0.5244035720825195,
"learning_rate": 0.00026829758085903196,
"loss": 1.7596,
"step": 64
},
{
"epoch": 0.49571020019065776,
"grad_norm": 0.49128928780555725,
"learning_rate": 0.0002672231942783754,
"loss": 1.7882,
"step": 65
},
{
"epoch": 0.5033365109628217,
"grad_norm": 0.35969385504722595,
"learning_rate": 0.000266133413260667,
"loss": 1.7607,
"step": 66
},
{
"epoch": 0.5109628217349858,
"grad_norm": 0.30560654401779175,
"learning_rate": 0.0002650284019229195,
"loss": 1.7378,
"step": 67
},
{
"epoch": 0.5185891325071497,
"grad_norm": 0.2821063697338104,
"learning_rate": 0.0002639083266757757,
"loss": 1.7452,
"step": 68
},
{
"epoch": 0.5262154432793136,
"grad_norm": 0.30434897541999817,
"learning_rate": 0.000262773356198448,
"loss": 1.757,
"step": 69
},
{
"epoch": 0.5338417540514776,
"grad_norm": 0.35866251587867737,
"learning_rate": 0.0002616236614133155,
"loss": 1.8456,
"step": 70
},
{
"epoch": 0.5414680648236415,
"grad_norm": 0.36527636647224426,
"learning_rate": 0.0002604594154601839,
"loss": 1.7636,
"step": 71
},
{
"epoch": 0.5490943755958055,
"grad_norm": 0.4293094575405121,
"learning_rate": 0.00025928079367021134,
"loss": 1.7983,
"step": 72
},
{
"epoch": 0.5567206863679695,
"grad_norm": 0.4558382034301758,
"learning_rate": 0.000258087973539504,
"loss": 1.8167,
"step": 73
},
{
"epoch": 0.5643469971401335,
"grad_norm": 0.46237555146217346,
"learning_rate": 0.00025688113470238616,
"loss": 1.8516,
"step": 74
},
{
"epoch": 0.5719733079122974,
"grad_norm": 0.5830234885215759,
"learning_rate": 0.00025566045890434747,
"loss": 1.8979,
"step": 75
},
{
"epoch": 0.5719733079122974,
"eval_loss": 1.8215560913085938,
"eval_runtime": 0.2804,
"eval_samples_per_second": 178.348,
"eval_steps_per_second": 46.371,
"step": 75
},
{
"epoch": 0.5795996186844614,
"grad_norm": 0.4658236503601074,
"learning_rate": 0.00025442612997467315,
"loss": 1.7275,
"step": 76
},
{
"epoch": 0.5872259294566253,
"grad_norm": 0.45749977231025696,
"learning_rate": 0.0002531783337987598,
"loss": 1.7482,
"step": 77
},
{
"epoch": 0.5948522402287894,
"grad_norm": 0.4462372660636902,
"learning_rate": 0.0002519172582901218,
"loss": 1.7561,
"step": 78
},
{
"epoch": 0.6024785510009533,
"grad_norm": 0.38078993558883667,
"learning_rate": 0.00025064309336209214,
"loss": 1.7398,
"step": 79
},
{
"epoch": 0.6101048617731173,
"grad_norm": 0.3360918164253235,
"learning_rate": 0.00024935603089922215,
"loss": 1.7546,
"step": 80
},
{
"epoch": 0.6177311725452812,
"grad_norm": 0.30906444787979126,
"learning_rate": 0.0002480562647283846,
"loss": 1.7487,
"step": 81
},
{
"epoch": 0.6253574833174452,
"grad_norm": 0.36905303597450256,
"learning_rate": 0.00024674399058958394,
"loss": 1.7589,
"step": 82
},
{
"epoch": 0.6329837940896091,
"grad_norm": 0.3760243058204651,
"learning_rate": 0.0002454194061064785,
"loss": 1.7732,
"step": 83
},
{
"epoch": 0.6406101048617732,
"grad_norm": 0.41075772047042847,
"learning_rate": 0.0002440827107566192,
"loss": 1.7812,
"step": 84
},
{
"epoch": 0.6482364156339371,
"grad_norm": 0.41905277967453003,
"learning_rate": 0.00024273410584140913,
"loss": 1.7692,
"step": 85
},
{
"epoch": 0.655862726406101,
"grad_norm": 0.41411152482032776,
"learning_rate": 0.00024137379445578774,
"loss": 1.8508,
"step": 86
},
{
"epoch": 0.663489037178265,
"grad_norm": 0.47408804297447205,
"learning_rate": 0.0002400019814576463,
"loss": 1.847,
"step": 87
},
{
"epoch": 0.6711153479504289,
"grad_norm": 0.4235435724258423,
"learning_rate": 0.00023861887343697624,
"loss": 1.8122,
"step": 88
},
{
"epoch": 0.678741658722593,
"grad_norm": 0.3859807252883911,
"learning_rate": 0.00023722467868475812,
"loss": 1.6975,
"step": 89
},
{
"epoch": 0.6863679694947569,
"grad_norm": 0.3468429148197174,
"learning_rate": 0.0002358196071615933,
"loss": 1.694,
"step": 90
},
{
"epoch": 0.6939942802669209,
"grad_norm": 0.3533165156841278,
"learning_rate": 0.00023440387046608487,
"loss": 1.6882,
"step": 91
},
{
"epoch": 0.7016205910390848,
"grad_norm": 0.3266445994377136,
"learning_rate": 0.00023297768180297187,
"loss": 1.6909,
"step": 92
},
{
"epoch": 0.7092469018112488,
"grad_norm": 0.3379365801811218,
"learning_rate": 0.00023154125595102083,
"loss": 1.7055,
"step": 93
},
{
"epoch": 0.7168732125834127,
"grad_norm": 0.32845547795295715,
"learning_rate": 0.00023009480923068157,
"loss": 1.7529,
"step": 94
},
{
"epoch": 0.7244995233555768,
"grad_norm": 0.3393206000328064,
"learning_rate": 0.00022863855947150968,
"loss": 1.7702,
"step": 95
},
{
"epoch": 0.7321258341277407,
"grad_norm": 0.3422520160675049,
"learning_rate": 0.0002271727259793624,
"loss": 1.7063,
"step": 96
},
{
"epoch": 0.7397521448999047,
"grad_norm": 0.378302663564682,
"learning_rate": 0.0002256975295033719,
"loss": 1.7602,
"step": 97
},
{
"epoch": 0.7473784556720686,
"grad_norm": 0.41780129075050354,
"learning_rate": 0.0002242131922027012,
"loss": 1.8039,
"step": 98
},
{
"epoch": 0.7550047664442326,
"grad_norm": 0.44365987181663513,
"learning_rate": 0.00022271993761308807,
"loss": 1.7738,
"step": 99
},
{
"epoch": 0.7626310772163966,
"grad_norm": 0.5752303600311279,
"learning_rate": 0.00022121799061318104,
"loss": 1.9044,
"step": 100
},
{
"epoch": 0.7626310772163966,
"eval_loss": 1.7645323276519775,
"eval_runtime": 0.28,
"eval_samples_per_second": 178.583,
"eval_steps_per_second": 46.431,
"step": 100
},
{
"epoch": 0.7702573879885606,
"grad_norm": 0.5027046203613281,
"learning_rate": 0.00021970757739067358,
"loss": 1.6627,
"step": 101
},
{
"epoch": 0.7778836987607245,
"grad_norm": 0.4292294681072235,
"learning_rate": 0.00021818892540824148,
"loss": 1.6495,
"step": 102
},
{
"epoch": 0.7855100095328885,
"grad_norm": 0.37569937109947205,
"learning_rate": 0.00021666226336928708,
"loss": 1.6692,
"step": 103
},
{
"epoch": 0.7931363203050524,
"grad_norm": 0.3786998391151428,
"learning_rate": 0.00021512782118349806,
"loss": 1.6581,
"step": 104
},
{
"epoch": 0.8007626310772163,
"grad_norm": 0.36832037568092346,
"learning_rate": 0.0002135858299322234,
"loss": 1.6714,
"step": 105
},
{
"epoch": 0.8083889418493804,
"grad_norm": 0.336896687746048,
"learning_rate": 0.00021203652183367363,
"loss": 1.7207,
"step": 106
},
{
"epoch": 0.8160152526215443,
"grad_norm": 0.35588911175727844,
"learning_rate": 0.00021048013020794968,
"loss": 1.7085,
"step": 107
},
{
"epoch": 0.8236415633937083,
"grad_norm": 0.4056905508041382,
"learning_rate": 0.00020891688944190548,
"loss": 1.7094,
"step": 108
},
{
"epoch": 0.8312678741658722,
"grad_norm": 0.40606462955474854,
"learning_rate": 0.00020734703495385037,
"loss": 1.7239,
"step": 109
},
{
"epoch": 0.8388941849380362,
"grad_norm": 0.40182796120643616,
"learning_rate": 0.0002057708031580958,
"loss": 1.7333,
"step": 110
},
{
"epoch": 0.8465204957102002,
"grad_norm": 0.44995224475860596,
"learning_rate": 0.00020418843142935237,
"loss": 1.7049,
"step": 111
},
{
"epoch": 0.8541468064823642,
"grad_norm": 0.4388667047023773,
"learning_rate": 0.00020260015806698213,
"loss": 1.783,
"step": 112
},
{
"epoch": 0.8617731172545281,
"grad_norm": 0.44865381717681885,
"learning_rate": 0.00020100622225911128,
"loss": 1.7508,
"step": 113
},
{
"epoch": 0.8693994280266921,
"grad_norm": 0.4294881522655487,
"learning_rate": 0.00019940686404660947,
"loss": 1.6571,
"step": 114
},
{
"epoch": 0.877025738798856,
"grad_norm": 0.3841921091079712,
"learning_rate": 0.00019780232428694063,
"loss": 1.695,
"step": 115
},
{
"epoch": 0.8846520495710201,
"grad_norm": 0.3785157799720764,
"learning_rate": 0.0001961928446178906,
"loss": 1.6545,
"step": 116
},
{
"epoch": 0.892278360343184,
"grad_norm": 0.33887144923210144,
"learning_rate": 0.00019457866742117737,
"loss": 1.6715,
"step": 117
},
{
"epoch": 0.899904671115348,
"grad_norm": 0.32863056659698486,
"learning_rate": 0.00019296003578594948,
"loss": 1.6952,
"step": 118
},
{
"epoch": 0.9075309818875119,
"grad_norm": 0.3387095034122467,
"learning_rate": 0.00019133719347217733,
"loss": 1.6291,
"step": 119
},
{
"epoch": 0.9151572926596759,
"grad_norm": 0.3665367066860199,
"learning_rate": 0.00018971038487394402,
"loss": 1.7321,
"step": 120
},
{
"epoch": 0.9227836034318398,
"grad_norm": 0.3495505452156067,
"learning_rate": 0.00018807985498264066,
"loss": 1.6587,
"step": 121
},
{
"epoch": 0.9304099142040038,
"grad_norm": 0.40355828404426575,
"learning_rate": 0.00018644584935007127,
"loss": 1.7027,
"step": 122
},
{
"epoch": 0.9380362249761678,
"grad_norm": 0.4357530474662781,
"learning_rate": 0.0001848086140514738,
"loss": 1.7724,
"step": 123
},
{
"epoch": 0.9456625357483317,
"grad_norm": 0.44958412647247314,
"learning_rate": 0.000183168395648462,
"loss": 1.7454,
"step": 124
},
{
"epoch": 0.9532888465204957,
"grad_norm": 0.5644071102142334,
"learning_rate": 0.00018152544115189416,
"loss": 1.8156,
"step": 125
},
{
"epoch": 0.9532888465204957,
"eval_loss": 1.7262933254241943,
"eval_runtime": 0.2797,
"eval_samples_per_second": 178.792,
"eval_steps_per_second": 46.486,
"step": 125
},
{
"epoch": 0.9609151572926596,
"grad_norm": 0.45155230164527893,
"learning_rate": 0.0001798799979846742,
"loss": 1.6338,
"step": 126
},
{
"epoch": 0.9685414680648237,
"grad_norm": 0.41884180903434753,
"learning_rate": 0.00017823231394449072,
"loss": 1.6829,
"step": 127
},
{
"epoch": 0.9761677788369876,
"grad_norm": 0.3723445534706116,
"learning_rate": 0.0001765826371664994,
"loss": 1.6707,
"step": 128
},
{
"epoch": 0.9837940896091516,
"grad_norm": 0.3832674026489258,
"learning_rate": 0.00017493121608595511,
"loss": 1.7397,
"step": 129
},
{
"epoch": 0.9914204003813155,
"grad_norm": 0.37408822774887085,
"learning_rate": 0.00017327829940079817,
"loss": 1.6765,
"step": 130
},
{
"epoch": 0.9990467111534795,
"grad_norm": 0.42527204751968384,
"learning_rate": 0.00017162413603420142,
"loss": 1.791,
"step": 131
},
{
"epoch": 1.0066730219256435,
"grad_norm": 1.3834341764450073,
"learning_rate": 0.00016996897509708345,
"loss": 3.4039,
"step": 132
},
{
"epoch": 1.0142993326978074,
"grad_norm": 0.43991145491600037,
"learning_rate": 0.00016831306585059317,
"loss": 1.6506,
"step": 133
},
{
"epoch": 1.0219256434699715,
"grad_norm": 0.37817224860191345,
"learning_rate": 0.0001666566576685722,
"loss": 1.5943,
"step": 134
},
{
"epoch": 1.0295519542421354,
"grad_norm": 0.34133297204971313,
"learning_rate": 0.000165,
"loss": 1.5756,
"step": 135
},
{
"epoch": 1.0371782650142993,
"grad_norm": 0.32997646927833557,
"learning_rate": 0.0001633433423314278,
"loss": 1.6236,
"step": 136
},
{
"epoch": 1.0448045757864632,
"grad_norm": 0.3591316342353821,
"learning_rate": 0.00016168693414940683,
"loss": 1.6221,
"step": 137
},
{
"epoch": 1.0524308865586272,
"grad_norm": 0.3701683580875397,
"learning_rate": 0.00016003102490291655,
"loss": 1.6099,
"step": 138
},
{
"epoch": 1.0600571973307913,
"grad_norm": 0.3921620845794678,
"learning_rate": 0.00015837586396579858,
"loss": 1.6507,
"step": 139
},
{
"epoch": 1.0676835081029552,
"grad_norm": 0.411425918340683,
"learning_rate": 0.00015672170059920183,
"loss": 1.6658,
"step": 140
},
{
"epoch": 1.0753098188751191,
"grad_norm": 0.4283716082572937,
"learning_rate": 0.00015506878391404488,
"loss": 1.6525,
"step": 141
},
{
"epoch": 1.082936129647283,
"grad_norm": 0.43306857347488403,
"learning_rate": 0.00015341736283350064,
"loss": 1.6808,
"step": 142
},
{
"epoch": 1.0905624404194472,
"grad_norm": 0.46868017315864563,
"learning_rate": 0.0001517676860555093,
"loss": 1.7022,
"step": 143
},
{
"epoch": 1.098188751191611,
"grad_norm": 0.3939070403575897,
"learning_rate": 0.0001501200020153258,
"loss": 1.608,
"step": 144
},
{
"epoch": 1.105815061963775,
"grad_norm": 0.5079172253608704,
"learning_rate": 0.00014847455884810581,
"loss": 1.664,
"step": 145
},
{
"epoch": 1.113441372735939,
"grad_norm": 0.4701906740665436,
"learning_rate": 0.00014683160435153796,
"loss": 1.5924,
"step": 146
},
{
"epoch": 1.121067683508103,
"grad_norm": 0.4438985288143158,
"learning_rate": 0.00014519138594852615,
"loss": 1.6186,
"step": 147
},
{
"epoch": 1.128693994280267,
"grad_norm": 0.40085965394973755,
"learning_rate": 0.00014355415064992873,
"loss": 1.6421,
"step": 148
},
{
"epoch": 1.1363203050524309,
"grad_norm": 0.3875059485435486,
"learning_rate": 0.00014192014501735934,
"loss": 1.5903,
"step": 149
},
{
"epoch": 1.1439466158245948,
"grad_norm": 0.3794068992137909,
"learning_rate": 0.00014028961512605598,
"loss": 1.6741,
"step": 150
},
{
"epoch": 1.1439466158245948,
"eval_loss": 1.7033360004425049,
"eval_runtime": 0.2798,
"eval_samples_per_second": 178.678,
"eval_steps_per_second": 46.456,
"step": 150
},
{
"epoch": 1.1515729265967587,
"grad_norm": 0.38320392370224,
"learning_rate": 0.00013866280652782267,
"loss": 1.6258,
"step": 151
},
{
"epoch": 1.1591992373689228,
"grad_norm": 0.3931479752063751,
"learning_rate": 0.00013703996421405052,
"loss": 1.6313,
"step": 152
},
{
"epoch": 1.1668255481410867,
"grad_norm": 0.4570615589618683,
"learning_rate": 0.00013542133257882257,
"loss": 1.6801,
"step": 153
},
{
"epoch": 1.1744518589132507,
"grad_norm": 0.47714921832084656,
"learning_rate": 0.0001338071553821094,
"loss": 1.6307,
"step": 154
},
{
"epoch": 1.1820781696854148,
"grad_norm": 0.4591121971607208,
"learning_rate": 0.00013219767571305937,
"loss": 1.7064,
"step": 155
},
{
"epoch": 1.1897044804575787,
"grad_norm": 0.5732882022857666,
"learning_rate": 0.00013059313595339053,
"loss": 1.7405,
"step": 156
},
{
"epoch": 1.1973307912297426,
"grad_norm": 0.4108792543411255,
"learning_rate": 0.00012899377774088872,
"loss": 1.6063,
"step": 157
},
{
"epoch": 1.2049571020019065,
"grad_norm": 0.42478087544441223,
"learning_rate": 0.00012739984193301784,
"loss": 1.5782,
"step": 158
},
{
"epoch": 1.2125834127740704,
"grad_norm": 0.44489574432373047,
"learning_rate": 0.0001258115685706476,
"loss": 1.5959,
"step": 159
},
{
"epoch": 1.2202097235462346,
"grad_norm": 0.41875404119491577,
"learning_rate": 0.0001242291968419042,
"loss": 1.6163,
"step": 160
},
{
"epoch": 1.2278360343183985,
"grad_norm": 0.3827251195907593,
"learning_rate": 0.00012265296504614963,
"loss": 1.6228,
"step": 161
},
{
"epoch": 1.2354623450905624,
"grad_norm": 0.3817841112613678,
"learning_rate": 0.0001210831105580945,
"loss": 1.5694,
"step": 162
},
{
"epoch": 1.2430886558627263,
"grad_norm": 0.3716193735599518,
"learning_rate": 0.00011951986979205029,
"loss": 1.6367,
"step": 163
},
{
"epoch": 1.2507149666348902,
"grad_norm": 0.37516316771507263,
"learning_rate": 0.00011796347816632634,
"loss": 1.6157,
"step": 164
},
{
"epoch": 1.2583412774070544,
"grad_norm": 0.4124738276004791,
"learning_rate": 0.00011641417006777658,
"loss": 1.5697,
"step": 165
},
{
"epoch": 1.2659675881792183,
"grad_norm": 0.4279733896255493,
"learning_rate": 0.00011487217881650195,
"loss": 1.6447,
"step": 166
},
{
"epoch": 1.2735938989513822,
"grad_norm": 0.4881094992160797,
"learning_rate": 0.00011333773663071288,
"loss": 1.6122,
"step": 167
},
{
"epoch": 1.2812202097235463,
"grad_norm": 0.521618127822876,
"learning_rate": 0.00011181107459175851,
"loss": 1.7202,
"step": 168
},
{
"epoch": 1.2888465204957102,
"grad_norm": 0.3777306079864502,
"learning_rate": 0.00011029242260932638,
"loss": 1.5756,
"step": 169
},
{
"epoch": 1.2964728312678742,
"grad_norm": 0.4733025133609772,
"learning_rate": 0.000108782009386819,
"loss": 1.6479,
"step": 170
},
{
"epoch": 1.304099142040038,
"grad_norm": 0.41860291361808777,
"learning_rate": 0.00010728006238691194,
"loss": 1.5983,
"step": 171
},
{
"epoch": 1.311725452812202,
"grad_norm": 0.46844813227653503,
"learning_rate": 0.00010578680779729879,
"loss": 1.578,
"step": 172
},
{
"epoch": 1.3193517635843661,
"grad_norm": 0.40656524896621704,
"learning_rate": 0.0001043024704966281,
"loss": 1.6255,
"step": 173
},
{
"epoch": 1.32697807435653,
"grad_norm": 0.38256990909576416,
"learning_rate": 0.00010282727402063758,
"loss": 1.5675,
"step": 174
},
{
"epoch": 1.334604385128694,
"grad_norm": 0.3779941201210022,
"learning_rate": 0.00010136144052849031,
"loss": 1.5789,
"step": 175
},
{
"epoch": 1.334604385128694,
"eval_loss": 1.695977807044983,
"eval_runtime": 0.2808,
"eval_samples_per_second": 178.081,
"eval_steps_per_second": 46.301,
"step": 175
},
{
"epoch": 1.342230695900858,
"grad_norm": 0.3875720798969269,
"learning_rate": 9.990519076931843e-05,
"loss": 1.656,
"step": 176
},
{
"epoch": 1.349857006673022,
"grad_norm": 0.38068655133247375,
"learning_rate": 9.845874404897915e-05,
"loss": 1.623,
"step": 177
},
{
"epoch": 1.357483317445186,
"grad_norm": 0.4605511724948883,
"learning_rate": 9.702231819702814e-05,
"loss": 1.627,
"step": 178
},
{
"epoch": 1.3651096282173498,
"grad_norm": 0.4176296889781952,
"learning_rate": 9.559612953391507e-05,
"loss": 1.6706,
"step": 179
},
{
"epoch": 1.3727359389895137,
"grad_norm": 0.4767864942550659,
"learning_rate": 9.418039283840671e-05,
"loss": 1.6709,
"step": 180
},
{
"epoch": 1.3803622497616779,
"grad_norm": 0.567336916923523,
"learning_rate": 9.27753213152419e-05,
"loss": 1.8214,
"step": 181
},
{
"epoch": 1.3879885605338418,
"grad_norm": 0.4051623046398163,
"learning_rate": 9.138112656302376e-05,
"loss": 1.6248,
"step": 182
},
{
"epoch": 1.3956148713060057,
"grad_norm": 0.380164235830307,
"learning_rate": 8.999801854235373e-05,
"loss": 1.5668,
"step": 183
},
{
"epoch": 1.4032411820781696,
"grad_norm": 0.37853559851646423,
"learning_rate": 8.862620554421221e-05,
"loss": 1.6079,
"step": 184
},
{
"epoch": 1.4108674928503335,
"grad_norm": 0.38022464513778687,
"learning_rate": 8.726589415859088e-05,
"loss": 1.6109,
"step": 185
},
{
"epoch": 1.4184938036224977,
"grad_norm": 0.3726414442062378,
"learning_rate": 8.591728924338075e-05,
"loss": 1.5726,
"step": 186
},
{
"epoch": 1.4261201143946616,
"grad_norm": 0.39313751459121704,
"learning_rate": 8.45805938935215e-05,
"loss": 1.5881,
"step": 187
},
{
"epoch": 1.4337464251668255,
"grad_norm": 0.40941789746284485,
"learning_rate": 8.325600941041607e-05,
"loss": 1.6375,
"step": 188
},
{
"epoch": 1.4413727359389896,
"grad_norm": 0.38843002915382385,
"learning_rate": 8.194373527161539e-05,
"loss": 1.5911,
"step": 189
},
{
"epoch": 1.4489990467111535,
"grad_norm": 0.38351744413375854,
"learning_rate": 8.064396910077785e-05,
"loss": 1.6153,
"step": 190
},
{
"epoch": 1.4566253574833175,
"grad_norm": 0.42547914385795593,
"learning_rate": 7.935690663790787e-05,
"loss": 1.5872,
"step": 191
},
{
"epoch": 1.4642516682554814,
"grad_norm": 0.45052269101142883,
"learning_rate": 7.808274170987818e-05,
"loss": 1.6048,
"step": 192
},
{
"epoch": 1.4718779790276453,
"grad_norm": 0.5102285742759705,
"learning_rate": 7.682166620124017e-05,
"loss": 1.6611,
"step": 193
},
{
"epoch": 1.4795042897998094,
"grad_norm": 0.3973918855190277,
"learning_rate": 7.55738700253268e-05,
"loss": 1.6591,
"step": 194
},
{
"epoch": 1.4871306005719733,
"grad_norm": 0.4221573770046234,
"learning_rate": 7.43395410956525e-05,
"loss": 1.5788,
"step": 195
},
{
"epoch": 1.4947569113441372,
"grad_norm": 0.38037997484207153,
"learning_rate": 7.311886529761383e-05,
"loss": 1.543,
"step": 196
},
{
"epoch": 1.5023832221163014,
"grad_norm": 0.3961423635482788,
"learning_rate": 7.191202646049596e-05,
"loss": 1.5559,
"step": 197
},
{
"epoch": 1.510009532888465,
"grad_norm": 0.40945950150489807,
"learning_rate": 7.071920632978867e-05,
"loss": 1.6016,
"step": 198
},
{
"epoch": 1.5176358436606292,
"grad_norm": 0.3885650038719177,
"learning_rate": 6.954058453981609e-05,
"loss": 1.587,
"step": 199
},
{
"epoch": 1.5252621544327931,
"grad_norm": 0.4135299623012543,
"learning_rate": 6.837633858668448e-05,
"loss": 1.6103,
"step": 200
},
{
"epoch": 1.5252621544327931,
"eval_loss": 1.6848325729370117,
"eval_runtime": 0.2814,
"eval_samples_per_second": 177.685,
"eval_steps_per_second": 46.198,
"step": 200
},
{
"epoch": 1.532888465204957,
"grad_norm": 0.382758229970932,
"learning_rate": 6.722664380155198e-05,
"loss": 1.6259,
"step": 201
},
{
"epoch": 1.5405147759771212,
"grad_norm": 0.3803237974643707,
"learning_rate": 6.609167332422427e-05,
"loss": 1.5547,
"step": 202
},
{
"epoch": 1.548141086749285,
"grad_norm": 0.4076535701751709,
"learning_rate": 6.497159807708055e-05,
"loss": 1.5846,
"step": 203
},
{
"epoch": 1.555767397521449,
"grad_norm": 0.41619113087654114,
"learning_rate": 6.386658673933301e-05,
"loss": 1.6648,
"step": 204
},
{
"epoch": 1.5633937082936131,
"grad_norm": 0.4555559754371643,
"learning_rate": 6.277680572162459e-05,
"loss": 1.6636,
"step": 205
},
{
"epoch": 1.5710200190657768,
"grad_norm": 0.5715373158454895,
"learning_rate": 6.170241914096804e-05,
"loss": 1.7265,
"step": 206
},
{
"epoch": 1.578646329837941,
"grad_norm": 0.39986926317214966,
"learning_rate": 6.06435887960305e-05,
"loss": 1.614,
"step": 207
},
{
"epoch": 1.5862726406101049,
"grad_norm": 0.4014238119125366,
"learning_rate": 5.960047414276724e-05,
"loss": 1.5169,
"step": 208
},
{
"epoch": 1.5938989513822688,
"grad_norm": 0.40676450729370117,
"learning_rate": 5.857323227040816e-05,
"loss": 1.5836,
"step": 209
},
{
"epoch": 1.601525262154433,
"grad_norm": 0.38130614161491394,
"learning_rate": 5.756201787780074e-05,
"loss": 1.5636,
"step": 210
},
{
"epoch": 1.6091515729265966,
"grad_norm": 0.4001769721508026,
"learning_rate": 5.656698325011295e-05,
"loss": 1.5641,
"step": 211
},
{
"epoch": 1.6167778836987607,
"grad_norm": 0.3762960433959961,
"learning_rate": 5.5588278235899724e-05,
"loss": 1.615,
"step": 212
},
{
"epoch": 1.6244041944709247,
"grad_norm": 0.372916042804718,
"learning_rate": 5.462605022453621e-05,
"loss": 1.6307,
"step": 213
},
{
"epoch": 1.6320305052430886,
"grad_norm": 0.4022381603717804,
"learning_rate": 5.368044412402161e-05,
"loss": 1.5634,
"step": 214
},
{
"epoch": 1.6396568160152527,
"grad_norm": 0.39602571725845337,
"learning_rate": 5.275160233915637e-05,
"loss": 1.6328,
"step": 215
},
{
"epoch": 1.6472831267874166,
"grad_norm": 0.4366743266582489,
"learning_rate": 5.183966475009686e-05,
"loss": 1.6038,
"step": 216
},
{
"epoch": 1.6549094375595805,
"grad_norm": 0.458132803440094,
"learning_rate": 5.0944768691289534e-05,
"loss": 1.6384,
"step": 217
},
{
"epoch": 1.6625357483317447,
"grad_norm": 0.4853437840938568,
"learning_rate": 5.0067048930789196e-05,
"loss": 1.6787,
"step": 218
},
{
"epoch": 1.6701620591039084,
"grad_norm": 0.4034062325954437,
"learning_rate": 4.920663764996328e-05,
"loss": 1.5721,
"step": 219
},
{
"epoch": 1.6777883698760725,
"grad_norm": 0.4523112177848816,
"learning_rate": 4.8363664423585795e-05,
"loss": 1.6327,
"step": 220
},
{
"epoch": 1.6854146806482364,
"grad_norm": 0.4193178415298462,
"learning_rate": 4.753825620032397e-05,
"loss": 1.5354,
"step": 221
},
{
"epoch": 1.6930409914204003,
"grad_norm": 0.3819790482521057,
"learning_rate": 4.673053728362012e-05,
"loss": 1.5833,
"step": 222
},
{
"epoch": 1.7006673021925645,
"grad_norm": 0.3742893636226654,
"learning_rate": 4.5940629312972085e-05,
"loss": 1.5805,
"step": 223
},
{
"epoch": 1.7082936129647281,
"grad_norm": 0.37698328495025635,
"learning_rate": 4.516865124561473e-05,
"loss": 1.5632,
"step": 224
},
{
"epoch": 1.7159199237368923,
"grad_norm": 0.39410701394081116,
"learning_rate": 4.4414719338605445e-05,
"loss": 1.6016,
"step": 225
},
{
"epoch": 1.7159199237368923,
"eval_loss": 1.6763724088668823,
"eval_runtime": 0.281,
"eval_samples_per_second": 177.963,
"eval_steps_per_second": 46.27,
"step": 225
},
{
"epoch": 1.7235462345090562,
"grad_norm": 0.41081416606903076,
"learning_rate": 4.367894713131622e-05,
"loss": 1.5998,
"step": 226
},
{
"epoch": 1.73117254528122,
"grad_norm": 0.4001181721687317,
"learning_rate": 4.296144542833515e-05,
"loss": 1.6213,
"step": 227
},
{
"epoch": 1.7387988560533842,
"grad_norm": 0.4368586242198944,
"learning_rate": 4.226232228277948e-05,
"loss": 1.6338,
"step": 228
},
{
"epoch": 1.7464251668255482,
"grad_norm": 0.4104710817337036,
"learning_rate": 4.1581682980023354e-05,
"loss": 1.6433,
"step": 229
},
{
"epoch": 1.754051477597712,
"grad_norm": 0.4876128137111664,
"learning_rate": 4.0919630021842204e-05,
"loss": 1.6381,
"step": 230
},
{
"epoch": 1.7616777883698762,
"grad_norm": 0.5875245332717896,
"learning_rate": 4.027626311097629e-05,
"loss": 1.7134,
"step": 231
},
{
"epoch": 1.76930409914204,
"grad_norm": 0.40997183322906494,
"learning_rate": 3.965167913611591e-05,
"loss": 1.5599,
"step": 232
},
{
"epoch": 1.776930409914204,
"grad_norm": 0.42695024609565735,
"learning_rate": 3.9045972157310256e-05,
"loss": 1.5685,
"step": 233
},
{
"epoch": 1.784556720686368,
"grad_norm": 0.4017082452774048,
"learning_rate": 3.845923339180239e-05,
"loss": 1.5493,
"step": 234
},
{
"epoch": 1.7921830314585319,
"grad_norm": 0.38347816467285156,
"learning_rate": 3.78915512002922e-05,
"loss": 1.5464,
"step": 235
},
{
"epoch": 1.799809342230696,
"grad_norm": 0.40365535020828247,
"learning_rate": 3.734301107362964e-05,
"loss": 1.6257,
"step": 236
},
{
"epoch": 1.80743565300286,
"grad_norm": 0.3748120963573456,
"learning_rate": 3.681369561994005e-05,
"loss": 1.5456,
"step": 237
},
{
"epoch": 1.8150619637750238,
"grad_norm": 0.36911335587501526,
"learning_rate": 3.6303684552183827e-05,
"loss": 1.5886,
"step": 238
},
{
"epoch": 1.822688274547188,
"grad_norm": 0.3796713948249817,
"learning_rate": 3.581305467615181e-05,
"loss": 1.5858,
"step": 239
},
{
"epoch": 1.8303145853193517,
"grad_norm": 0.4003843665122986,
"learning_rate": 3.5341879878898615e-05,
"loss": 1.6126,
"step": 240
},
{
"epoch": 1.8379408960915158,
"grad_norm": 0.4427483379840851,
"learning_rate": 3.489023111761562e-05,
"loss": 1.6487,
"step": 241
},
{
"epoch": 1.8455672068636797,
"grad_norm": 0.4299188554286957,
"learning_rate": 3.445817640894497e-05,
"loss": 1.6723,
"step": 242
},
{
"epoch": 1.8531935176358436,
"grad_norm": 0.5241357684135437,
"learning_rate": 3.404578081873656e-05,
"loss": 1.6198,
"step": 243
},
{
"epoch": 1.8608198284080077,
"grad_norm": 0.40499526262283325,
"learning_rate": 3.365310645224939e-05,
"loss": 1.5758,
"step": 244
},
{
"epoch": 1.8684461391801714,
"grad_norm": 0.43495872616767883,
"learning_rate": 3.328021244479866e-05,
"loss": 1.5897,
"step": 245
},
{
"epoch": 1.8760724499523356,
"grad_norm": 0.39778581261634827,
"learning_rate": 3.292715495285028e-05,
"loss": 1.5267,
"step": 246
},
{
"epoch": 1.8836987607244995,
"grad_norm": 0.3671037256717682,
"learning_rate": 3.259398714556389e-05,
"loss": 1.5499,
"step": 247
},
{
"epoch": 1.8913250714966634,
"grad_norm": 0.38671308755874634,
"learning_rate": 3.2280759196785803e-05,
"loss": 1.628,
"step": 248
},
{
"epoch": 1.8989513822688275,
"grad_norm": 0.41623926162719727,
"learning_rate": 3.1987518277492934e-05,
"loss": 1.5699,
"step": 249
},
{
"epoch": 1.9065776930409915,
"grad_norm": 0.38484105467796326,
"learning_rate": 3.171430854868911e-05,
"loss": 1.5702,
"step": 250
},
{
"epoch": 1.9065776930409915,
"eval_loss": 1.6708096265792847,
"eval_runtime": 0.2807,
"eval_samples_per_second": 178.136,
"eval_steps_per_second": 46.315,
"step": 250
}
],
"logging_steps": 1,
"max_steps": 263,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.11070068867072e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}