{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999581764951903,
"eval_steps": 500,
"global_step": 1195,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000836470096194061,
"grad_norm": 109.33856201171875,
"learning_rate": 2.4999999999999998e-06,
"loss": 24.0328,
"step": 1
},
{
"epoch": 0.004182350480970306,
"grad_norm": 97.91401672363281,
"learning_rate": 1.2499999999999999e-05,
"loss": 22.4374,
"step": 5
},
{
"epoch": 0.008364700961940611,
"grad_norm": 36.984405517578125,
"learning_rate": 2.4999999999999998e-05,
"loss": 20.9539,
"step": 10
},
{
"epoch": 0.012547051442910916,
"grad_norm": 20.562854766845703,
"learning_rate": 3.75e-05,
"loss": 18.0157,
"step": 15
},
{
"epoch": 0.016729401923881223,
"grad_norm": 7.916600227355957,
"learning_rate": 4.9999999999999996e-05,
"loss": 15.4241,
"step": 20
},
{
"epoch": 0.020911752404851526,
"grad_norm": 6.871030807495117,
"learning_rate": 6.25e-05,
"loss": 14.6159,
"step": 25
},
{
"epoch": 0.025094102885821833,
"grad_norm": 6.986784934997559,
"learning_rate": 7.5e-05,
"loss": 13.6058,
"step": 30
},
{
"epoch": 0.029276453366792136,
"grad_norm": 3.074172019958496,
"learning_rate": 8.75e-05,
"loss": 12.7264,
"step": 35
},
{
"epoch": 0.033458803847762446,
"grad_norm": 2.548049211502075,
"learning_rate": 9.999999999999999e-05,
"loss": 12.7985,
"step": 40
},
{
"epoch": 0.037641154328732745,
"grad_norm": 3.184255361557007,
"learning_rate": 0.0001125,
"loss": 12.2921,
"step": 45
},
{
"epoch": 0.04182350480970305,
"grad_norm": 4.834357738494873,
"learning_rate": 0.000125,
"loss": 11.7192,
"step": 50
},
{
"epoch": 0.04600585529067336,
"grad_norm": 8.06936264038086,
"learning_rate": 0.00013749999999999998,
"loss": 11.0815,
"step": 55
},
{
"epoch": 0.050188205771643665,
"grad_norm": 12.874434471130371,
"learning_rate": 0.00015,
"loss": 9.8728,
"step": 60
},
{
"epoch": 0.05437055625261397,
"grad_norm": 18.794525146484375,
"learning_rate": 0.00016249999999999997,
"loss": 7.7488,
"step": 65
},
{
"epoch": 0.05855290673358427,
"grad_norm": 21.42744255065918,
"learning_rate": 0.000175,
"loss": 4.8002,
"step": 70
},
{
"epoch": 0.06273525721455459,
"grad_norm": 7.021483898162842,
"learning_rate": 0.00018749999999999998,
"loss": 2.2667,
"step": 75
},
{
"epoch": 0.06691760769552489,
"grad_norm": 4.35729455947876,
"learning_rate": 0.00019999999999999998,
"loss": 1.7236,
"step": 80
},
{
"epoch": 0.07109995817649518,
"grad_norm": 2.531404972076416,
"learning_rate": 0.0002125,
"loss": 1.4388,
"step": 85
},
{
"epoch": 0.07528230865746549,
"grad_norm": 1.7126120328903198,
"learning_rate": 0.000225,
"loss": 1.2603,
"step": 90
},
{
"epoch": 0.0794646591384358,
"grad_norm": 1.713827133178711,
"learning_rate": 0.00023749999999999997,
"loss": 1.1149,
"step": 95
},
{
"epoch": 0.0836470096194061,
"grad_norm": 0.6418541073799133,
"learning_rate": 0.00025,
"loss": 1.0645,
"step": 100
},
{
"epoch": 0.08782936010037641,
"grad_norm": 0.9687772989273071,
"learning_rate": 0.0002625,
"loss": 1.0047,
"step": 105
},
{
"epoch": 0.09201171058134672,
"grad_norm": 0.8204954266548157,
"learning_rate": 0.00027499999999999996,
"loss": 0.986,
"step": 110
},
{
"epoch": 0.09619406106231702,
"grad_norm": 0.5046463012695312,
"learning_rate": 0.0002875,
"loss": 0.9229,
"step": 115
},
{
"epoch": 0.10037641154328733,
"grad_norm": 1.1709442138671875,
"learning_rate": 0.0003,
"loss": 0.9344,
"step": 120
},
{
"epoch": 0.10455876202425764,
"grad_norm": 0.9831328392028809,
"learning_rate": 0.0002999839868651235,
"loss": 0.8812,
"step": 125
},
{
"epoch": 0.10874111250522794,
"grad_norm": 1.4019944667816162,
"learning_rate": 0.0002999359508794339,
"loss": 0.8814,
"step": 130
},
{
"epoch": 0.11292346298619825,
"grad_norm": 0.78233802318573,
"learning_rate": 0.00029985590229902073,
"loss": 0.8701,
"step": 135
},
{
"epoch": 0.11710581346716854,
"grad_norm": 1.4517076015472412,
"learning_rate": 0.0002997438582149335,
"loss": 0.8753,
"step": 140
},
{
"epoch": 0.12128816394813885,
"grad_norm": 2.320331335067749,
"learning_rate": 0.0002995998425495327,
"loss": 0.8464,
"step": 145
},
{
"epoch": 0.12547051442910917,
"grad_norm": 1.0486135482788086,
"learning_rate": 0.000299423886051382,
"loss": 0.8498,
"step": 150
},
{
"epoch": 0.12965286491007946,
"grad_norm": 0.9131810069084167,
"learning_rate": 0.0002992160262886831,
"loss": 0.8468,
"step": 155
},
{
"epoch": 0.13383521539104978,
"grad_norm": 1.4284504652023315,
"learning_rate": 0.0002989763076412549,
"loss": 0.8088,
"step": 160
},
{
"epoch": 0.13801756587202008,
"grad_norm": 0.6106438040733337,
"learning_rate": 0.000298704781291058,
"loss": 0.8215,
"step": 165
},
{
"epoch": 0.14219991635299037,
"grad_norm": 0.5358127951622009,
"learning_rate": 0.0002984015052112665,
"loss": 0.8201,
"step": 170
},
{
"epoch": 0.1463822668339607,
"grad_norm": 1.5017443895339966,
"learning_rate": 0.0002980665441538907,
"loss": 0.7957,
"step": 175
},
{
"epoch": 0.15056461731493098,
"grad_norm": 1.1214942932128906,
"learning_rate": 0.00029769996963595184,
"loss": 0.8083,
"step": 180
},
{
"epoch": 0.1547469677959013,
"grad_norm": 2.1036202907562256,
"learning_rate": 0.0002973018599242125,
"loss": 0.7929,
"step": 185
},
{
"epoch": 0.1589293182768716,
"grad_norm": 1.0557819604873657,
"learning_rate": 0.0002968723000184662,
"loss": 0.7868,
"step": 190
},
{
"epoch": 0.16311166875784192,
"grad_norm": 0.9558168649673462,
"learning_rate": 0.00029641138163338907,
"loss": 0.7812,
"step": 195
},
{
"epoch": 0.1672940192388122,
"grad_norm": 0.771851122379303,
"learning_rate": 0.0002959192031789579,
"loss": 0.7846,
"step": 200
},
{
"epoch": 0.17147636971978253,
"grad_norm": 1.288865089416504,
"learning_rate": 0.0002953958697394391,
"loss": 0.777,
"step": 205
},
{
"epoch": 0.17565872020075282,
"grad_norm": 2.001302480697632,
"learning_rate": 0.000294841493050952,
"loss": 0.7797,
"step": 210
},
{
"epoch": 0.17984107068172314,
"grad_norm": 0.7828574776649475,
"learning_rate": 0.0002942561914776124,
"loss": 0.7815,
"step": 215
},
{
"epoch": 0.18402342116269343,
"grad_norm": 1.4854490756988525,
"learning_rate": 0.00029364008998626086,
"loss": 0.7608,
"step": 220
},
{
"epoch": 0.18820577164366373,
"grad_norm": 1.1406800746917725,
"learning_rate": 0.00029299332011978107,
"loss": 0.747,
"step": 225
},
{
"epoch": 0.19238812212463405,
"grad_norm": 1.7346460819244385,
"learning_rate": 0.00029231601996901433,
"loss": 0.7555,
"step": 230
},
{
"epoch": 0.19657047260560434,
"grad_norm": 1.7754813432693481,
"learning_rate": 0.0002916083341432763,
"loss": 0.7626,
"step": 235
},
{
"epoch": 0.20075282308657466,
"grad_norm": 1.2126661539077759,
"learning_rate": 0.00029087041373948135,
"loss": 0.7237,
"step": 240
},
{
"epoch": 0.20493517356754495,
"grad_norm": 1.930538535118103,
"learning_rate": 0.00029010241630988217,
"loss": 0.7672,
"step": 245
},
{
"epoch": 0.20911752404851527,
"grad_norm": 2.1792216300964355,
"learning_rate": 0.0002893045058284311,
"loss": 0.7416,
"step": 250
},
{
"epoch": 0.21329987452948557,
"grad_norm": 1.416754961013794,
"learning_rate": 0.0002884768526557703,
"loss": 0.7196,
"step": 255
},
{
"epoch": 0.2174822250104559,
"grad_norm": 1.6103583574295044,
"learning_rate": 0.0002876196335028581,
"loss": 0.7397,
"step": 260
},
{
"epoch": 0.22166457549142618,
"grad_norm": 1.0755459070205688,
"learning_rate": 0.0002867330313932402,
"loss": 0.7644,
"step": 265
},
{
"epoch": 0.2258469259723965,
"grad_norm": 0.8303298354148865,
"learning_rate": 0.000285817235623972,
"loss": 0.7393,
"step": 270
},
{
"epoch": 0.2300292764533668,
"grad_norm": 1.4747998714447021,
"learning_rate": 0.00028487244172520246,
"loss": 0.7121,
"step": 275
},
{
"epoch": 0.23421162693433709,
"grad_norm": 2.582953929901123,
"learning_rate": 0.0002838988514184267,
"loss": 0.7361,
"step": 280
},
{
"epoch": 0.2383939774153074,
"grad_norm": 2.413325309753418,
"learning_rate": 0.0002828966725734167,
"loss": 0.74,
"step": 285
},
{
"epoch": 0.2425763278962777,
"grad_norm": 0.7637856006622314,
"learning_rate": 0.0002818661191638393,
"loss": 0.7096,
"step": 290
},
{
"epoch": 0.24675867837724802,
"grad_norm": 1.757056713104248,
"learning_rate": 0.0002808074112215711,
"loss": 0.7205,
"step": 295
},
{
"epoch": 0.25094102885821834,
"grad_norm": 0.8766753077507019,
"learning_rate": 0.0002797207747897198,
"loss": 0.7098,
"step": 300
},
{
"epoch": 0.2551233793391886,
"grad_norm": 1.449209213256836,
"learning_rate": 0.00027860644187436195,
"loss": 0.725,
"step": 305
},
{
"epoch": 0.2593057298201589,
"grad_norm": 0.6825206875801086,
"learning_rate": 0.0002774646503950078,
"loss": 0.6938,
"step": 310
},
{
"epoch": 0.26348808030112925,
"grad_norm": 1.119585394859314,
"learning_rate": 0.0002762956441338036,
"loss": 0.698,
"step": 315
},
{
"epoch": 0.26767043078209957,
"grad_norm": 0.9425824880599976,
"learning_rate": 0.0002750996726834817,
"loss": 0.7189,
"step": 320
},
{
"epoch": 0.27185278126306983,
"grad_norm": 0.5979897975921631,
"learning_rate": 0.0002738769913940706,
"loss": 0.7039,
"step": 325
},
{
"epoch": 0.27603513174404015,
"grad_norm": 1.8769757747650146,
"learning_rate": 0.00027262786131837573,
"loss": 0.7035,
"step": 330
},
{
"epoch": 0.2802174822250105,
"grad_norm": 1.1395800113677979,
"learning_rate": 0.0002713525491562421,
"loss": 0.6898,
"step": 335
},
{
"epoch": 0.28439983270598074,
"grad_norm": 1.0573526620864868,
"learning_rate": 0.0002700513271976119,
"loss": 0.7042,
"step": 340
},
{
"epoch": 0.28858218318695106,
"grad_norm": 0.5185459852218628,
"learning_rate": 0.0002687244732643881,
"loss": 0.6914,
"step": 345
},
{
"epoch": 0.2927645336679214,
"grad_norm": 2.7914602756500244,
"learning_rate": 0.0002673722706511174,
"loss": 0.7049,
"step": 350
},
{
"epoch": 0.2969468841488917,
"grad_norm": 3.0459792613983154,
"learning_rate": 0.000265995008064504,
"loss": 0.7148,
"step": 355
},
{
"epoch": 0.30112923462986196,
"grad_norm": 2.1906723976135254,
"learning_rate": 0.00026459297956176885,
"loss": 0.7074,
"step": 360
},
{
"epoch": 0.3053115851108323,
"grad_norm": 1.6257227659225464,
"learning_rate": 0.00026316648448786536,
"loss": 0.6985,
"step": 365
},
{
"epoch": 0.3094939355918026,
"grad_norm": 0.7152910828590393,
"learning_rate": 0.00026171582741156725,
"loss": 0.6875,
"step": 370
},
{
"epoch": 0.3136762860727729,
"grad_norm": 2.4449851512908936,
"learning_rate": 0.0002602413180604401,
"loss": 0.6787,
"step": 375
},
{
"epoch": 0.3178586365537432,
"grad_norm": 0.5180588960647583,
"learning_rate": 0.000258743271254712,
"loss": 0.6724,
"step": 380
},
{
"epoch": 0.3220409870347135,
"grad_norm": 1.5739381313323975,
"learning_rate": 0.00025722200684005715,
"loss": 0.7076,
"step": 385
},
{
"epoch": 0.32622333751568383,
"grad_norm": 0.8701817989349365,
"learning_rate": 0.00025567784961930546,
"loss": 0.6841,
"step": 390
},
{
"epoch": 0.3304056879966541,
"grad_norm": 1.474747896194458,
"learning_rate": 0.0002541111292830951,
"loss": 0.713,
"step": 395
},
{
"epoch": 0.3345880384776244,
"grad_norm": 1.8884798288345337,
"learning_rate": 0.00025252218033947993,
"loss": 0.6893,
"step": 400
},
{
"epoch": 0.33877038895859474,
"grad_norm": 0.8834472894668579,
"learning_rate": 0.00025091134204250997,
"loss": 0.6966,
"step": 405
},
{
"epoch": 0.34295273943956506,
"grad_norm": 0.6324520707130432,
"learning_rate": 0.00024927895831979745,
"loss": 0.6882,
"step": 410
},
{
"epoch": 0.3471350899205353,
"grad_norm": 2.353163480758667,
"learning_rate": 0.00024762537769908535,
"loss": 0.6829,
"step": 415
},
{
"epoch": 0.35131744040150564,
"grad_norm": 1.3682096004486084,
"learning_rate": 0.00024595095323383365,
"loss": 0.6912,
"step": 420
},
{
"epoch": 0.35549979088247596,
"grad_norm": 0.9962055087089539,
"learning_rate": 0.0002442560424278399,
"loss": 0.6857,
"step": 425
},
{
"epoch": 0.3596821413634463,
"grad_norm": 1.1282930374145508,
"learning_rate": 0.00024254100715890846,
"loss": 0.6696,
"step": 430
},
{
"epoch": 0.36386449184441655,
"grad_norm": 0.934388279914856,
"learning_rate": 0.00024080621360158717,
"loss": 0.6841,
"step": 435
},
{
"epoch": 0.36804684232538687,
"grad_norm": 1.4339077472686768,
"learning_rate": 0.00023905203214898558,
"loss": 0.6705,
"step": 440
},
{
"epoch": 0.3722291928063572,
"grad_norm": 1.0309265851974487,
"learning_rate": 0.00023727883733369292,
"loss": 0.6706,
"step": 445
},
{
"epoch": 0.37641154328732745,
"grad_norm": 1.9208811521530151,
"learning_rate": 0.00023548700774781242,
"loss": 0.6637,
"step": 450
},
{
"epoch": 0.3805938937682978,
"grad_norm": 1.0379974842071533,
"learning_rate": 0.00023367692596212858,
"loss": 0.68,
"step": 455
},
{
"epoch": 0.3847762442492681,
"grad_norm": 1.852662444114685,
"learning_rate": 0.00023184897844442495,
"loss": 0.6589,
"step": 460
},
{
"epoch": 0.3889585947302384,
"grad_norm": 1.1750479936599731,
"learning_rate": 0.00023000355547697027,
"loss": 0.6675,
"step": 465
},
{
"epoch": 0.3931409452112087,
"grad_norm": 1.6473002433776855,
"learning_rate": 0.00022814105107318952,
"loss": 0.6709,
"step": 470
},
{
"epoch": 0.397323295692179,
"grad_norm": 1.2356650829315186,
"learning_rate": 0.00022626186289353913,
"loss": 0.6652,
"step": 475
},
{
"epoch": 0.4015056461731493,
"grad_norm": 1.1605840921401978,
"learning_rate": 0.00022436639216060275,
"loss": 0.6698,
"step": 480
},
{
"epoch": 0.40568799665411964,
"grad_norm": 1.5935866832733154,
"learning_rate": 0.00022245504357342716,
"loss": 0.6688,
"step": 485
},
{
"epoch": 0.4098703471350899,
"grad_norm": 0.810558557510376,
"learning_rate": 0.00022052822522111522,
"loss": 0.6524,
"step": 490
},
{
"epoch": 0.41405269761606023,
"grad_norm": 0.7008018493652344,
"learning_rate": 0.00021858634849569576,
"loss": 0.6924,
"step": 495
},
{
"epoch": 0.41823504809703055,
"grad_norm": 1.7558863162994385,
"learning_rate": 0.0002166298280042877,
"loss": 0.6711,
"step": 500
},
{
"epoch": 0.4224173985780008,
"grad_norm": 1.573688268661499,
"learning_rate": 0.00021465908148057787,
"loss": 0.6674,
"step": 505
},
{
"epoch": 0.42659974905897113,
"grad_norm": 1.4761265516281128,
"learning_rate": 0.00021267452969563153,
"loss": 0.6706,
"step": 510
},
{
"epoch": 0.43078209953994145,
"grad_norm": 1.7749208211898804,
"learning_rate": 0.00021067659636805403,
"loss": 0.6469,
"step": 515
},
{
"epoch": 0.4349644500209118,
"grad_norm": 1.0164939165115356,
"learning_rate": 0.00020866570807352337,
"loss": 0.6764,
"step": 520
},
{
"epoch": 0.43914680050188204,
"grad_norm": 1.6237319707870483,
"learning_rate": 0.00020664229415371266,
"loss": 0.6694,
"step": 525
},
{
"epoch": 0.44332915098285236,
"grad_norm": 1.5586035251617432,
"learning_rate": 0.00020460678662462194,
"loss": 0.6562,
"step": 530
},
{
"epoch": 0.4475115014638227,
"grad_norm": 1.771645188331604,
"learning_rate": 0.0002025596200843394,
"loss": 0.6622,
"step": 535
},
{
"epoch": 0.451693851944793,
"grad_norm": 0.5951160788536072,
"learning_rate": 0.0002005012316202506,
"loss": 0.651,
"step": 540
},
{
"epoch": 0.45587620242576327,
"grad_norm": 0.793093740940094,
"learning_rate": 0.00019843206071571692,
"loss": 0.6634,
"step": 545
},
{
"epoch": 0.4600585529067336,
"grad_norm": 1.0352814197540283,
"learning_rate": 0.0001963525491562421,
"loss": 0.6636,
"step": 550
},
{
"epoch": 0.4642409033877039,
"grad_norm": 0.843008816242218,
"learning_rate": 0.00019426314093514717,
"loss": 0.6407,
"step": 555
},
{
"epoch": 0.46842325386867417,
"grad_norm": 1.7540709972381592,
"learning_rate": 0.00019216428215877425,
"loss": 0.638,
"step": 560
},
{
"epoch": 0.4726056043496445,
"grad_norm": 0.5828922390937805,
"learning_rate": 0.00019005642095123895,
"loss": 0.6625,
"step": 565
},
{
"epoch": 0.4767879548306148,
"grad_norm": 0.7700462937355042,
"learning_rate": 0.00018794000735875208,
"loss": 0.6428,
"step": 570
},
{
"epoch": 0.48097030531158513,
"grad_norm": 0.8344655632972717,
"learning_rate": 0.00018581549325353126,
"loss": 0.6553,
"step": 575
},
{
"epoch": 0.4851526557925554,
"grad_norm": 1.2676873207092285,
"learning_rate": 0.000183683332237322,
"loss": 0.6645,
"step": 580
},
{
"epoch": 0.4893350062735257,
"grad_norm": 1.4888837337493896,
"learning_rate": 0.00018154397954454993,
"loss": 0.6859,
"step": 585
},
{
"epoch": 0.49351735675449604,
"grad_norm": 0.7020601034164429,
"learning_rate": 0.00017939789194512472,
"loss": 0.6456,
"step": 590
},
{
"epoch": 0.49769970723546636,
"grad_norm": 1.1964813470840454,
"learning_rate": 0.00017724552764691545,
"loss": 0.6594,
"step": 595
},
{
"epoch": 0.5018820577164367,
"grad_norm": 1.1332772970199585,
"learning_rate": 0.00017508734619791966,
"loss": 0.6606,
"step": 600
},
{
"epoch": 0.506064408197407,
"grad_norm": 1.6122368574142456,
"learning_rate": 0.00017292380838814577,
"loss": 0.6468,
"step": 605
},
{
"epoch": 0.5102467586783772,
"grad_norm": 0.8950415849685669,
"learning_rate": 0.00017075537615123042,
"loss": 0.6615,
"step": 610
},
{
"epoch": 0.5144291091593476,
"grad_norm": 1.9138753414154053,
"learning_rate": 0.00016858251246581216,
"loss": 0.6683,
"step": 615
},
{
"epoch": 0.5186114596403179,
"grad_norm": 0.9320158362388611,
"learning_rate": 0.00016640568125668117,
"loss": 0.6734,
"step": 620
},
{
"epoch": 0.5227938101212881,
"grad_norm": 1.2331713438034058,
"learning_rate": 0.00016422534729572738,
"loss": 0.6582,
"step": 625
},
{
"epoch": 0.5269761606022585,
"grad_norm": 1.1182340383529663,
"learning_rate": 0.00016204197610270816,
"loss": 0.6533,
"step": 630
},
{
"epoch": 0.5311585110832288,
"grad_norm": 0.6500148773193359,
"learning_rate": 0.00015985603384585542,
"loss": 0.6396,
"step": 635
},
{
"epoch": 0.5353408615641991,
"grad_norm": 0.9531376361846924,
"learning_rate": 0.00015766798724234506,
"loss": 0.6337,
"step": 640
},
{
"epoch": 0.5395232120451694,
"grad_norm": 0.7729771733283997,
"learning_rate": 0.00015547830345864885,
"loss": 0.6498,
"step": 645
},
{
"epoch": 0.5437055625261397,
"grad_norm": 0.6831007599830627,
"learning_rate": 0.0001532874500107902,
"loss": 0.6404,
"step": 650
},
{
"epoch": 0.54788791300711,
"grad_norm": 1.2160038948059082,
"learning_rate": 0.00015109589466452594,
"loss": 0.658,
"step": 655
},
{
"epoch": 0.5520702634880803,
"grad_norm": 1.015773057937622,
"learning_rate": 0.00014890410533547404,
"loss": 0.6507,
"step": 660
},
{
"epoch": 0.5562526139690506,
"grad_norm": 1.208256721496582,
"learning_rate": 0.00014671254998920976,
"loss": 0.6399,
"step": 665
},
{
"epoch": 0.560434964450021,
"grad_norm": 0.7431871294975281,
"learning_rate": 0.00014452169654135115,
"loss": 0.6534,
"step": 670
},
{
"epoch": 0.5646173149309912,
"grad_norm": 0.6661595702171326,
"learning_rate": 0.00014233201275765494,
"loss": 0.6343,
"step": 675
},
{
"epoch": 0.5687996654119615,
"grad_norm": 1.2753201723098755,
"learning_rate": 0.00014014396615414458,
"loss": 0.6296,
"step": 680
},
{
"epoch": 0.5729820158929319,
"grad_norm": 1.4110949039459229,
"learning_rate": 0.00013795802389729184,
"loss": 0.6452,
"step": 685
},
{
"epoch": 0.5771643663739021,
"grad_norm": 1.4824358224868774,
"learning_rate": 0.00013577465270427262,
"loss": 0.6348,
"step": 690
},
{
"epoch": 0.5813467168548725,
"grad_norm": 1.8900264501571655,
"learning_rate": 0.00013359431874331886,
"loss": 0.6509,
"step": 695
},
{
"epoch": 0.5855290673358428,
"grad_norm": 1.652632236480713,
"learning_rate": 0.0001314174875341878,
"loss": 0.6206,
"step": 700
},
{
"epoch": 0.589711417816813,
"grad_norm": 1.1248772144317627,
"learning_rate": 0.00012924462384876953,
"loss": 0.6299,
"step": 705
},
{
"epoch": 0.5938937682977834,
"grad_norm": 0.7448098659515381,
"learning_rate": 0.00012707619161185423,
"loss": 0.6483,
"step": 710
},
{
"epoch": 0.5980761187787537,
"grad_norm": 0.6708864569664001,
"learning_rate": 0.00012491265380208032,
"loss": 0.6473,
"step": 715
},
{
"epoch": 0.6022584692597239,
"grad_norm": 0.8381022810935974,
"learning_rate": 0.00012275447235308453,
"loss": 0.6356,
"step": 720
},
{
"epoch": 0.6064408197406943,
"grad_norm": 1.3462333679199219,
"learning_rate": 0.00012060210805487529,
"loss": 0.6388,
"step": 725
},
{
"epoch": 0.6106231702216646,
"grad_norm": 1.2015129327774048,
"learning_rate": 0.00011845602045545008,
"loss": 0.6258,
"step": 730
},
{
"epoch": 0.6148055207026348,
"grad_norm": 0.7825962901115417,
"learning_rate": 0.00011631666776267803,
"loss": 0.6401,
"step": 735
},
{
"epoch": 0.6189878711836052,
"grad_norm": 0.9470372200012207,
"learning_rate": 0.00011418450674646868,
"loss": 0.6501,
"step": 740
},
{
"epoch": 0.6231702216645755,
"grad_norm": 0.9243600368499756,
"learning_rate": 0.00011205999264124786,
"loss": 0.6195,
"step": 745
},
{
"epoch": 0.6273525721455459,
"grad_norm": 1.931402325630188,
"learning_rate": 0.00010994357904876106,
"loss": 0.6264,
"step": 750
},
{
"epoch": 0.6315349226265161,
"grad_norm": 1.7754958868026733,
"learning_rate": 0.00010783571784122577,
"loss": 0.6351,
"step": 755
},
{
"epoch": 0.6357172731074864,
"grad_norm": 0.627479076385498,
"learning_rate": 0.00010573685906485282,
"loss": 0.6395,
"step": 760
},
{
"epoch": 0.6398996235884568,
"grad_norm": 2.4568421840667725,
"learning_rate": 0.0001036474508437579,
"loss": 0.6257,
"step": 765
},
{
"epoch": 0.644081974069427,
"grad_norm": 1.9016671180725098,
"learning_rate": 0.0001015679392842831,
"loss": 0.6446,
"step": 770
},
{
"epoch": 0.6482643245503973,
"grad_norm": 0.7173120975494385,
"learning_rate": 9.949876837974944e-05,
"loss": 0.6312,
"step": 775
},
{
"epoch": 0.6524466750313677,
"grad_norm": 1.9048292636871338,
"learning_rate": 9.744037991566058e-05,
"loss": 0.622,
"step": 780
},
{
"epoch": 0.6566290255123379,
"grad_norm": 1.2454332113265991,
"learning_rate": 9.5393213375378e-05,
"loss": 0.6219,
"step": 785
},
{
"epoch": 0.6608113759933082,
"grad_norm": 1.0283215045928955,
"learning_rate": 9.33577058462873e-05,
"loss": 0.6236,
"step": 790
},
{
"epoch": 0.6649937264742786,
"grad_norm": 1.8116226196289062,
"learning_rate": 9.133429192647661e-05,
"loss": 0.6244,
"step": 795
},
{
"epoch": 0.6691760769552488,
"grad_norm": 0.8519335389137268,
"learning_rate": 8.932340363194595e-05,
"loss": 0.6253,
"step": 800
},
{
"epoch": 0.6733584274362192,
"grad_norm": 1.004647135734558,
"learning_rate": 8.73254703043685e-05,
"loss": 0.6278,
"step": 805
},
{
"epoch": 0.6775407779171895,
"grad_norm": 0.945331871509552,
"learning_rate": 8.534091851942214e-05,
"loss": 0.6251,
"step": 810
},
{
"epoch": 0.6817231283981597,
"grad_norm": 0.4643709659576416,
"learning_rate": 8.337017199571235e-05,
"loss": 0.6298,
"step": 815
},
{
"epoch": 0.6859054788791301,
"grad_norm": 0.8933060169219971,
"learning_rate": 8.141365150430421e-05,
"loss": 0.6419,
"step": 820
},
{
"epoch": 0.6900878293601004,
"grad_norm": 2.5518875122070312,
"learning_rate": 7.947177477888472e-05,
"loss": 0.6424,
"step": 825
},
{
"epoch": 0.6942701798410706,
"grad_norm": 0.7830976247787476,
"learning_rate": 7.754495642657282e-05,
"loss": 0.6292,
"step": 830
},
{
"epoch": 0.698452530322041,
"grad_norm": 1.2565546035766602,
"learning_rate": 7.563360783939722e-05,
"loss": 0.6308,
"step": 835
},
{
"epoch": 0.7026348808030113,
"grad_norm": 1.588156819343567,
"learning_rate": 7.373813710646083e-05,
"loss": 0.6249,
"step": 840
},
{
"epoch": 0.7068172312839816,
"grad_norm": 0.6486766934394836,
"learning_rate": 7.185894892681048e-05,
"loss": 0.6308,
"step": 845
},
{
"epoch": 0.7109995817649519,
"grad_norm": 0.7454473972320557,
"learning_rate": 6.999644452302975e-05,
"loss": 0.6267,
"step": 850
},
{
"epoch": 0.7151819322459222,
"grad_norm": 0.6658061146736145,
"learning_rate": 6.815102155557501e-05,
"loss": 0.6162,
"step": 855
},
{
"epoch": 0.7193642827268926,
"grad_norm": 1.0052908658981323,
"learning_rate": 6.632307403787138e-05,
"loss": 0.644,
"step": 860
},
{
"epoch": 0.7235466332078628,
"grad_norm": 0.7472626566886902,
"learning_rate": 6.451299225218754e-05,
"loss": 0.616,
"step": 865
},
{
"epoch": 0.7277289836888331,
"grad_norm": 0.587893009185791,
"learning_rate": 6.27211626663071e-05,
"loss": 0.6318,
"step": 870
},
{
"epoch": 0.7319113341698035,
"grad_norm": 0.898607611656189,
"learning_rate": 6.0947967851014405e-05,
"loss": 0.6409,
"step": 875
},
{
"epoch": 0.7360936846507737,
"grad_norm": 0.7444003224372864,
"learning_rate": 5.919378639841281e-05,
"loss": 0.6214,
"step": 880
},
{
"epoch": 0.740276035131744,
"grad_norm": 1.199029564857483,
"learning_rate": 5.745899284109154e-05,
"loss": 0.6184,
"step": 885
},
{
"epoch": 0.7444583856127144,
"grad_norm": 1.2874826192855835,
"learning_rate": 5.57439575721601e-05,
"loss": 0.6233,
"step": 890
},
{
"epoch": 0.7486407360936846,
"grad_norm": 1.3848364353179932,
"learning_rate": 5.4049046766166335e-05,
"loss": 0.6043,
"step": 895
},
{
"epoch": 0.7528230865746549,
"grad_norm": 0.6463631987571716,
"learning_rate": 5.237462230091467e-05,
"loss": 0.6361,
"step": 900
},
{
"epoch": 0.7570054370556253,
"grad_norm": 1.0429089069366455,
"learning_rate": 5.07210416802025e-05,
"loss": 0.6206,
"step": 905
},
{
"epoch": 0.7611877875365956,
"grad_norm": 0.7253705263137817,
"learning_rate": 4.908865795748999e-05,
"loss": 0.6312,
"step": 910
},
{
"epoch": 0.7653701380175659,
"grad_norm": 0.498542457818985,
"learning_rate": 4.74778196605201e-05,
"loss": 0.6421,
"step": 915
},
{
"epoch": 0.7695524884985362,
"grad_norm": 0.6464426517486572,
"learning_rate": 4.58888707169049e-05,
"loss": 0.6047,
"step": 920
},
{
"epoch": 0.7737348389795065,
"grad_norm": 0.6658442616462708,
"learning_rate": 4.432215038069449e-05,
"loss": 0.623,
"step": 925
},
{
"epoch": 0.7779171894604768,
"grad_norm": 0.6936389207839966,
"learning_rate": 4.277799315994286e-05,
"loss": 0.6226,
"step": 930
},
{
"epoch": 0.7820995399414471,
"grad_norm": 0.5900949835777283,
"learning_rate": 4.125672874528797e-05,
"loss": 0.6314,
"step": 935
},
{
"epoch": 0.7862818904224174,
"grad_norm": 0.9340611100196838,
"learning_rate": 3.97586819395599e-05,
"loss": 0.6252,
"step": 940
},
{
"epoch": 0.7904642409033877,
"grad_norm": 1.272733211517334,
"learning_rate": 3.8284172588432716e-05,
"loss": 0.6236,
"step": 945
},
{
"epoch": 0.794646591384358,
"grad_norm": 0.7782835364341736,
"learning_rate": 3.6833515512134606e-05,
"loss": 0.6096,
"step": 950
},
{
"epoch": 0.7988289418653283,
"grad_norm": 0.6020464301109314,
"learning_rate": 3.540702043823113e-05,
"loss": 0.6124,
"step": 955
},
{
"epoch": 0.8030112923462986,
"grad_norm": 0.675359845161438,
"learning_rate": 3.4004991935496004e-05,
"loss": 0.5955,
"step": 960
},
{
"epoch": 0.8071936428272689,
"grad_norm": 0.5639395117759705,
"learning_rate": 3.262772934888265e-05,
"loss": 0.6069,
"step": 965
},
{
"epoch": 0.8113759933082393,
"grad_norm": 0.4711320698261261,
"learning_rate": 3.1275526735611896e-05,
"loss": 0.6102,
"step": 970
},
{
"epoch": 0.8155583437892095,
"grad_norm": 1.0711911916732788,
"learning_rate": 2.9948672802388135e-05,
"loss": 0.6391,
"step": 975
},
{
"epoch": 0.8197406942701798,
"grad_norm": 0.4709874987602234,
"learning_rate": 2.8647450843757897e-05,
"loss": 0.6186,
"step": 980
},
{
"epoch": 0.8239230447511502,
"grad_norm": 0.5402533411979675,
"learning_rate": 2.7372138681624244e-05,
"loss": 0.613,
"step": 985
},
{
"epoch": 0.8281053952321205,
"grad_norm": 0.6864106059074402,
"learning_rate": 2.6123008605929375e-05,
"loss": 0.6215,
"step": 990
},
{
"epoch": 0.8322877457130907,
"grad_norm": 0.5939123630523682,
"learning_rate": 2.4900327316518326e-05,
"loss": 0.6168,
"step": 995
},
{
"epoch": 0.8364700961940611,
"grad_norm": 0.7122395038604736,
"learning_rate": 2.3704355866196373e-05,
"loss": 0.6053,
"step": 1000
},
{
"epoch": 0.8406524466750314,
"grad_norm": 0.6920621395111084,
"learning_rate": 2.2535349604992153e-05,
"loss": 0.6097,
"step": 1005
},
{
"epoch": 0.8448347971560016,
"grad_norm": 1.7864691019058228,
"learning_rate": 2.1393558125638066e-05,
"loss": 0.6382,
"step": 1010
},
{
"epoch": 0.849017147636972,
"grad_norm": 0.7472600936889648,
"learning_rate": 2.027922521028018e-05,
"loss": 0.6159,
"step": 1015
},
{
"epoch": 0.8531994981179423,
"grad_norm": 1.722579836845398,
"learning_rate": 1.9192588778428842e-05,
"loss": 0.6011,
"step": 1020
},
{
"epoch": 0.8573818485989126,
"grad_norm": 0.5121240019798279,
"learning_rate": 1.813388083616068e-05,
"loss": 0.6031,
"step": 1025
},
{
"epoch": 0.8615641990798829,
"grad_norm": 0.5083288550376892,
"learning_rate": 1.7103327426583265e-05,
"loss": 0.5845,
"step": 1030
},
{
"epoch": 0.8657465495608532,
"grad_norm": 0.46453267335891724,
"learning_rate": 1.6101148581573274e-05,
"loss": 0.6031,
"step": 1035
},
{
"epoch": 0.8699289000418235,
"grad_norm": 0.8352246880531311,
"learning_rate": 1.5127558274797535e-05,
"loss": 0.6024,
"step": 1040
},
{
"epoch": 0.8741112505227938,
"grad_norm": 0.8040021061897278,
"learning_rate": 1.4182764376028006e-05,
"loss": 0.635,
"step": 1045
},
{
"epoch": 0.8782936010037641,
"grad_norm": 0.7152721881866455,
"learning_rate": 1.326696860675981e-05,
"loss": 0.6162,
"step": 1050
},
{
"epoch": 0.8824759514847345,
"grad_norm": 0.6833348274230957,
"learning_rate": 1.2380366497141886e-05,
"loss": 0.6217,
"step": 1055
},
{
"epoch": 0.8866583019657047,
"grad_norm": 0.6803585886955261,
"learning_rate": 1.1523147344229716e-05,
"loss": 0.6218,
"step": 1060
},
{
"epoch": 0.890840652446675,
"grad_norm": 0.9396490454673767,
"learning_rate": 1.069549417156887e-05,
"loss": 0.6176,
"step": 1065
},
{
"epoch": 0.8950230029276454,
"grad_norm": 0.7006051540374756,
"learning_rate": 9.89758369011781e-06,
"loss": 0.6123,
"step": 1070
},
{
"epoch": 0.8992053534086156,
"grad_norm": 0.5886880159378052,
"learning_rate": 9.129586260518634e-06,
"loss": 0.5923,
"step": 1075
},
{
"epoch": 0.903387703889586,
"grad_norm": 0.5032349228858948,
"learning_rate": 8.391665856723655e-06,
"loss": 0.619,
"step": 1080
},
{
"epoch": 0.9075700543705563,
"grad_norm": 0.51763916015625,
"learning_rate": 7.683980030985654e-06,
"loss": 0.6039,
"step": 1085
},
{
"epoch": 0.9117524048515265,
"grad_norm": 0.588211715221405,
"learning_rate": 7.006679880218974e-06,
"loss": 0.6057,
"step": 1090
},
{
"epoch": 0.9159347553324969,
"grad_norm": 0.6631506681442261,
"learning_rate": 6.359910013739122e-06,
"loss": 0.6106,
"step": 1095
},
{
"epoch": 0.9201171058134672,
"grad_norm": 0.5523665547370911,
"learning_rate": 5.743808522387544e-06,
"loss": 0.6058,
"step": 1100
},
{
"epoch": 0.9242994562944374,
"grad_norm": 0.5903011560440063,
"learning_rate": 5.158506949047975e-06,
"loss": 0.6321,
"step": 1105
},
{
"epoch": 0.9284818067754078,
"grad_norm": 0.6151677966117859,
"learning_rate": 4.604130260560873e-06,
"loss": 0.6171,
"step": 1110
},
{
"epoch": 0.9326641572563781,
"grad_norm": 0.5324861407279968,
"learning_rate": 4.080796821042082e-06,
"loss": 0.6184,
"step": 1115
},
{
"epoch": 0.9368465077373483,
"grad_norm": 0.5442612171173096,
"learning_rate": 3.5886183666109405e-06,
"loss": 0.6069,
"step": 1120
},
{
"epoch": 0.9410288582183187,
"grad_norm": 0.4981847405433655,
"learning_rate": 3.1276999815337544e-06,
"loss": 0.62,
"step": 1125
},
{
"epoch": 0.945211208699289,
"grad_norm": 0.45795848965644836,
"learning_rate": 2.6981400757874584e-06,
"loss": 0.6027,
"step": 1130
},
{
"epoch": 0.9493935591802594,
"grad_norm": 0.78538978099823,
"learning_rate": 2.3000303640481386e-06,
"loss": 0.6084,
"step": 1135
},
{
"epoch": 0.9535759096612296,
"grad_norm": 0.5453292727470398,
"learning_rate": 1.9334558461092663e-06,
"loss": 0.6043,
"step": 1140
},
{
"epoch": 0.9577582601421999,
"grad_norm": 0.5924518704414368,
"learning_rate": 1.598494788733462e-06,
"loss": 0.6033,
"step": 1145
},
{
"epoch": 0.9619406106231703,
"grad_norm": 0.5484139323234558,
"learning_rate": 1.2952187089419642e-06,
"loss": 0.616,
"step": 1150
},
{
"epoch": 0.9661229611041405,
"grad_norm": 0.5363529324531555,
"learning_rate": 1.0236923587450263e-06,
"loss": 0.6196,
"step": 1155
},
{
"epoch": 0.9703053115851108,
"grad_norm": 0.6777392625808716,
"learning_rate": 7.839737113168931e-07,
"loss": 0.6102,
"step": 1160
},
{
"epoch": 0.9744876620660812,
"grad_norm": 0.5920884609222412,
"learning_rate": 5.761139486180178e-07,
"loss": 0.6075,
"step": 1165
},
{
"epoch": 0.9786700125470514,
"grad_norm": 0.4196658730506897,
"learning_rate": 4.0015745046725336e-07,
"loss": 0.6017,
"step": 1170
},
{
"epoch": 0.9828523630280217,
"grad_norm": 0.6661626100540161,
"learning_rate": 2.5614178506644934e-07,
"loss": 0.5787,
"step": 1175
},
{
"epoch": 0.9870347135089921,
"grad_norm": 0.6183582544326782,
"learning_rate": 1.4409770097926765e-07,
"loss": 0.6188,
"step": 1180
},
{
"epoch": 0.9912170639899623,
"grad_norm": 0.5524072051048279,
"learning_rate": 6.40491205661009e-08,
"loss": 0.6089,
"step": 1185
},
{
"epoch": 0.9953994144709327,
"grad_norm": 0.5690078735351562,
"learning_rate": 1.6013134876491362e-08,
"loss": 0.6079,
"step": 1190
},
{
"epoch": 0.999581764951903,
"grad_norm": 0.4828049838542938,
"learning_rate": 0.0,
"loss": 0.5901,
"step": 1195
},
{
"epoch": 0.999581764951903,
"eval_loss": 1.2799842357635498,
"eval_runtime": 0.8401,
"eval_samples_per_second": 5.951,
"eval_steps_per_second": 1.19,
"step": 1195
},
{
"epoch": 0.999581764951903,
"step": 1195,
"total_flos": 9.109418934146171e+17,
"train_loss": 1.440422640086218,
"train_runtime": 6570.4384,
"train_samples_per_second": 2.911,
"train_steps_per_second": 0.182
}
],
"logging_steps": 5,
"max_steps": 1195,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.109418934146171e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}