{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.4144271570014144,
"eval_steps": 42,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002828854314002829,
"eval_loss": 11.916790962219238,
"eval_runtime": 6.6784,
"eval_samples_per_second": 89.243,
"eval_steps_per_second": 11.23,
"step": 1
},
{
"epoch": 0.008486562942008486,
"grad_norm": 0.5188692212104797,
"learning_rate": 3e-05,
"loss": 11.92,
"step": 3
},
{
"epoch": 0.016973125884016973,
"grad_norm": 0.48136115074157715,
"learning_rate": 6e-05,
"loss": 11.9198,
"step": 6
},
{
"epoch": 0.02545968882602546,
"grad_norm": 0.5018041729927063,
"learning_rate": 9e-05,
"loss": 11.9077,
"step": 9
},
{
"epoch": 0.033946251768033946,
"grad_norm": 0.6158789992332458,
"learning_rate": 9.999588943391597e-05,
"loss": 11.9074,
"step": 12
},
{
"epoch": 0.042432814710042434,
"grad_norm": 0.6823853850364685,
"learning_rate": 9.99743108100344e-05,
"loss": 11.9042,
"step": 15
},
{
"epoch": 0.05091937765205092,
"grad_norm": 0.8695465922355652,
"learning_rate": 9.993424445916923e-05,
"loss": 11.8991,
"step": 18
},
{
"epoch": 0.0594059405940594,
"grad_norm": 0.9028195142745972,
"learning_rate": 9.987570520365104e-05,
"loss": 11.8853,
"step": 21
},
{
"epoch": 0.06789250353606789,
"grad_norm": 0.9360626935958862,
"learning_rate": 9.979871469976196e-05,
"loss": 11.8781,
"step": 24
},
{
"epoch": 0.07637906647807638,
"grad_norm": 0.8196120858192444,
"learning_rate": 9.970330142972401e-05,
"loss": 11.8677,
"step": 27
},
{
"epoch": 0.08486562942008487,
"grad_norm": 0.7162870168685913,
"learning_rate": 9.95895006911623e-05,
"loss": 11.8614,
"step": 30
},
{
"epoch": 0.09335219236209336,
"grad_norm": 0.5132609605789185,
"learning_rate": 9.945735458404681e-05,
"loss": 11.8498,
"step": 33
},
{
"epoch": 0.10183875530410184,
"grad_norm": 0.4584226608276367,
"learning_rate": 9.930691199511775e-05,
"loss": 11.8467,
"step": 36
},
{
"epoch": 0.11032531824611033,
"grad_norm": 0.29786884784698486,
"learning_rate": 9.91382285798002e-05,
"loss": 11.8511,
"step": 39
},
{
"epoch": 0.1188118811881188,
"grad_norm": 0.2889010012149811,
"learning_rate": 9.895136674161465e-05,
"loss": 11.8408,
"step": 42
},
{
"epoch": 0.1188118811881188,
"eval_loss": 11.839421272277832,
"eval_runtime": 6.332,
"eval_samples_per_second": 94.125,
"eval_steps_per_second": 11.845,
"step": 42
},
{
"epoch": 0.1272984441301273,
"grad_norm": 0.20692946016788483,
"learning_rate": 9.874639560909117e-05,
"loss": 11.8396,
"step": 45
},
{
"epoch": 0.13578500707213578,
"grad_norm": 0.23685254156589508,
"learning_rate": 9.852339101019574e-05,
"loss": 11.8316,
"step": 48
},
{
"epoch": 0.14427157001414428,
"grad_norm": 0.2432631552219391,
"learning_rate": 9.828243544427796e-05,
"loss": 11.8341,
"step": 51
},
{
"epoch": 0.15275813295615276,
"grad_norm": 0.11745542287826538,
"learning_rate": 9.802361805155097e-05,
"loss": 11.8301,
"step": 54
},
{
"epoch": 0.16124469589816123,
"grad_norm": 0.134097158908844,
"learning_rate": 9.774703458011453e-05,
"loss": 11.8359,
"step": 57
},
{
"epoch": 0.16973125884016974,
"grad_norm": 0.19680051505565643,
"learning_rate": 9.745278735053343e-05,
"loss": 11.8378,
"step": 60
},
{
"epoch": 0.1782178217821782,
"grad_norm": 0.10566498339176178,
"learning_rate": 9.714098521798465e-05,
"loss": 11.832,
"step": 63
},
{
"epoch": 0.1867043847241867,
"grad_norm": 0.1530551165342331,
"learning_rate": 9.681174353198687e-05,
"loss": 11.8363,
"step": 66
},
{
"epoch": 0.19519094766619519,
"grad_norm": 0.1929464042186737,
"learning_rate": 9.64651840937276e-05,
"loss": 11.8284,
"step": 69
},
{
"epoch": 0.2036775106082037,
"grad_norm": 0.17411480844020844,
"learning_rate": 9.610143511100354e-05,
"loss": 11.8314,
"step": 72
},
{
"epoch": 0.21216407355021216,
"grad_norm": 0.14971987903118134,
"learning_rate": 9.572063115079063e-05,
"loss": 11.832,
"step": 75
},
{
"epoch": 0.22065063649222066,
"grad_norm": 0.18370923399925232,
"learning_rate": 9.53229130894619e-05,
"loss": 11.8275,
"step": 78
},
{
"epoch": 0.22913719943422914,
"grad_norm": 0.26103201508522034,
"learning_rate": 9.490842806067095e-05,
"loss": 11.8278,
"step": 81
},
{
"epoch": 0.2376237623762376,
"grad_norm": 0.21483545005321503,
"learning_rate": 9.44773294009206e-05,
"loss": 11.825,
"step": 84
},
{
"epoch": 0.2376237623762376,
"eval_loss": 11.825268745422363,
"eval_runtime": 6.402,
"eval_samples_per_second": 93.097,
"eval_steps_per_second": 11.715,
"step": 84
},
{
"epoch": 0.24611032531824611,
"grad_norm": 0.28861185908317566,
"learning_rate": 9.40297765928369e-05,
"loss": 11.8219,
"step": 87
},
{
"epoch": 0.2545968882602546,
"grad_norm": 0.13039568066596985,
"learning_rate": 9.356593520616948e-05,
"loss": 11.8245,
"step": 90
},
{
"epoch": 0.26308345120226306,
"grad_norm": 0.1711033284664154,
"learning_rate": 9.308597683653975e-05,
"loss": 11.8246,
"step": 93
},
{
"epoch": 0.27157001414427157,
"grad_norm": 0.2108013778924942,
"learning_rate": 9.259007904196023e-05,
"loss": 11.8228,
"step": 96
},
{
"epoch": 0.28005657708628007,
"grad_norm": 0.1204076036810875,
"learning_rate": 9.207842527714767e-05,
"loss": 11.82,
"step": 99
},
{
"epoch": 0.28854314002828857,
"grad_norm": 0.1477670669555664,
"learning_rate": 9.155120482565521e-05,
"loss": 11.8189,
"step": 102
},
{
"epoch": 0.297029702970297,
"grad_norm": 0.10705884546041489,
"learning_rate": 9.10086127298478e-05,
"loss": 11.8169,
"step": 105
},
{
"epoch": 0.3055162659123055,
"grad_norm": 0.29956066608428955,
"learning_rate": 9.045084971874738e-05,
"loss": 11.8205,
"step": 108
},
{
"epoch": 0.314002828854314,
"grad_norm": 0.13027921319007874,
"learning_rate": 8.987812213377424e-05,
"loss": 11.8168,
"step": 111
},
{
"epoch": 0.32248939179632247,
"grad_norm": 0.13590934872627258,
"learning_rate": 8.929064185241213e-05,
"loss": 11.819,
"step": 114
},
{
"epoch": 0.33097595473833097,
"grad_norm": 0.09333682060241699,
"learning_rate": 8.868862620982534e-05,
"loss": 11.8267,
"step": 117
},
{
"epoch": 0.33946251768033947,
"grad_norm": 0.12400602549314499,
"learning_rate": 8.807229791845673e-05,
"loss": 11.818,
"step": 120
},
{
"epoch": 0.347949080622348,
"grad_norm": 0.12071343511343002,
"learning_rate": 8.744188498563641e-05,
"loss": 11.8166,
"step": 123
},
{
"epoch": 0.3564356435643564,
"grad_norm": 0.1693616658449173,
"learning_rate": 8.679762062923175e-05,
"loss": 11.8183,
"step": 126
},
{
"epoch": 0.3564356435643564,
"eval_loss": 11.819117546081543,
"eval_runtime": 6.7651,
"eval_samples_per_second": 88.1,
"eval_steps_per_second": 11.086,
"step": 126
},
{
"epoch": 0.3649222065063649,
"grad_norm": 0.17696824669837952,
"learning_rate": 8.613974319136958e-05,
"loss": 11.8171,
"step": 129
},
{
"epoch": 0.3734087694483734,
"grad_norm": 0.11144156008958817,
"learning_rate": 8.54684960502629e-05,
"loss": 11.815,
"step": 132
},
{
"epoch": 0.38189533239038187,
"grad_norm": 0.16119325160980225,
"learning_rate": 8.478412753017433e-05,
"loss": 11.8134,
"step": 135
},
{
"epoch": 0.39038189533239037,
"grad_norm": 0.10234789550304413,
"learning_rate": 8.408689080954998e-05,
"loss": 11.8138,
"step": 138
},
{
"epoch": 0.3988684582743989,
"grad_norm": 0.12111084908246994,
"learning_rate": 8.33770438273574e-05,
"loss": 11.8152,
"step": 141
},
{
"epoch": 0.4073550212164074,
"grad_norm": 0.07516127824783325,
"learning_rate": 8.265484918766243e-05,
"loss": 11.8128,
"step": 144
},
{
"epoch": 0.4158415841584158,
"grad_norm": 0.23707596957683563,
"learning_rate": 8.192057406248028e-05,
"loss": 11.8202,
"step": 147
},
{
"epoch": 0.4243281471004243,
"grad_norm": 0.1228451356291771,
"learning_rate": 8.117449009293668e-05,
"loss": 11.8201,
"step": 150
},
{
"epoch": 0.4328147100424328,
"grad_norm": 0.16358840465545654,
"learning_rate": 8.041687328877567e-05,
"loss": 11.8141,
"step": 153
},
{
"epoch": 0.44130127298444133,
"grad_norm": 0.10192089527845383,
"learning_rate": 7.964800392625129e-05,
"loss": 11.8128,
"step": 156
},
{
"epoch": 0.4497878359264498,
"grad_norm": 0.07679455727338791,
"learning_rate": 7.886816644444098e-05,
"loss": 11.8124,
"step": 159
},
{
"epoch": 0.4582743988684583,
"grad_norm": 0.10075189918279648,
"learning_rate": 7.807764934001874e-05,
"loss": 11.8119,
"step": 162
},
{
"epoch": 0.4667609618104668,
"grad_norm": 0.1872919499874115,
"learning_rate": 7.727674506052743e-05,
"loss": 11.8203,
"step": 165
},
{
"epoch": 0.4752475247524752,
"grad_norm": 0.12166598439216614,
"learning_rate": 7.646574989618938e-05,
"loss": 11.8202,
"step": 168
},
{
"epoch": 0.4752475247524752,
"eval_loss": 11.818514823913574,
"eval_runtime": 6.4158,
"eval_samples_per_second": 92.895,
"eval_steps_per_second": 11.69,
"step": 168
},
{
"epoch": 0.4837340876944837,
"grad_norm": 0.13749000430107117,
"learning_rate": 7.564496387029532e-05,
"loss": 11.8156,
"step": 171
},
{
"epoch": 0.49222065063649223,
"grad_norm": 0.07802052795886993,
"learning_rate": 7.481469062821252e-05,
"loss": 11.8182,
"step": 174
},
{
"epoch": 0.5007072135785007,
"grad_norm": 0.150814950466156,
"learning_rate": 7.39752373250527e-05,
"loss": 11.8179,
"step": 177
},
{
"epoch": 0.5091937765205092,
"grad_norm": 0.1514790952205658,
"learning_rate": 7.312691451204178e-05,
"loss": 11.8099,
"step": 180
},
{
"epoch": 0.5176803394625177,
"grad_norm": 0.13362684845924377,
"learning_rate": 7.227003602163295e-05,
"loss": 11.8172,
"step": 183
},
{
"epoch": 0.5261669024045261,
"grad_norm": 0.09337490051984787,
"learning_rate": 7.14049188514063e-05,
"loss": 11.8184,
"step": 186
},
{
"epoch": 0.5346534653465347,
"grad_norm": 0.08015663921833038,
"learning_rate": 7.05318830467969e-05,
"loss": 11.8158,
"step": 189
},
{
"epoch": 0.5431400282885431,
"grad_norm": 0.16405069828033447,
"learning_rate": 6.965125158269619e-05,
"loss": 11.816,
"step": 192
},
{
"epoch": 0.5516265912305516,
"grad_norm": 0.14057497680187225,
"learning_rate": 6.876335024396872e-05,
"loss": 11.8147,
"step": 195
},
{
"epoch": 0.5601131541725601,
"grad_norm": 0.1409187614917755,
"learning_rate": 6.786850750493006e-05,
"loss": 11.8157,
"step": 198
},
{
"epoch": 0.5685997171145686,
"grad_norm": 0.1987845003604889,
"learning_rate": 6.696705440782938e-05,
"loss": 11.8185,
"step": 201
},
{
"epoch": 0.5770862800565771,
"grad_norm": 0.10339465737342834,
"learning_rate": 6.605932444038229e-05,
"loss": 11.815,
"step": 204
},
{
"epoch": 0.5855728429985856,
"grad_norm": 0.12926329672336578,
"learning_rate": 6.514565341239861e-05,
"loss": 11.818,
"step": 207
},
{
"epoch": 0.594059405940594,
"grad_norm": 0.07949727028608322,
"learning_rate": 6.422637933155162e-05,
"loss": 11.8151,
"step": 210
},
{
"epoch": 0.594059405940594,
"eval_loss": 11.818094253540039,
"eval_runtime": 6.6444,
"eval_samples_per_second": 89.7,
"eval_steps_per_second": 11.288,
"step": 210
},
{
"epoch": 0.6025459688826026,
"grad_norm": 0.18179267644882202,
"learning_rate": 6.330184227833376e-05,
"loss": 11.8178,
"step": 213
},
{
"epoch": 0.611032531824611,
"grad_norm": 0.12221992760896683,
"learning_rate": 6.237238428024572e-05,
"loss": 11.8145,
"step": 216
},
{
"epoch": 0.6195190947666195,
"grad_norm": 0.07228324562311172,
"learning_rate": 6.143834918526527e-05,
"loss": 11.8137,
"step": 219
},
{
"epoch": 0.628005657708628,
"grad_norm": 0.11851081997156143,
"learning_rate": 6.0500082534642464e-05,
"loss": 11.8156,
"step": 222
},
{
"epoch": 0.6364922206506365,
"grad_norm": 0.08459550887346268,
"learning_rate": 5.955793143506863e-05,
"loss": 11.8108,
"step": 225
},
{
"epoch": 0.6449787835926449,
"grad_norm": 0.07968215644359589,
"learning_rate": 5.861224443026595e-05,
"loss": 11.8181,
"step": 228
},
{
"epoch": 0.6534653465346535,
"grad_norm": 0.1503295600414276,
"learning_rate": 5.766337137204579e-05,
"loss": 11.8167,
"step": 231
},
{
"epoch": 0.6619519094766619,
"grad_norm": 0.08652139455080032,
"learning_rate": 5.6711663290882776e-05,
"loss": 11.8122,
"step": 234
},
{
"epoch": 0.6704384724186704,
"grad_norm": 0.08518210798501968,
"learning_rate": 5.575747226605298e-05,
"loss": 11.8164,
"step": 237
},
{
"epoch": 0.6789250353606789,
"grad_norm": 0.12115988880395889,
"learning_rate": 5.480115129538409e-05,
"loss": 11.8114,
"step": 240
},
{
"epoch": 0.6874115983026874,
"grad_norm": 0.13773323595523834,
"learning_rate": 5.384305416466584e-05,
"loss": 11.815,
"step": 243
},
{
"epoch": 0.695898161244696,
"grad_norm": 0.10262436419725418,
"learning_rate": 5.288353531676873e-05,
"loss": 11.8163,
"step": 246
},
{
"epoch": 0.7043847241867044,
"grad_norm": 0.12103616446256638,
"learning_rate": 5.192294972051992e-05,
"loss": 11.8176,
"step": 249
},
{
"epoch": 0.7128712871287128,
"grad_norm": 0.09017164260149002,
"learning_rate": 5.0961652739384356e-05,
"loss": 11.8221,
"step": 252
},
{
"epoch": 0.7128712871287128,
"eval_loss": 11.818002700805664,
"eval_runtime": 6.3716,
"eval_samples_per_second": 93.539,
"eval_steps_per_second": 11.771,
"step": 252
},
{
"epoch": 0.7213578500707214,
"grad_norm": 0.050791915506124496,
"learning_rate": 5e-05,
"loss": 11.8143,
"step": 255
},
{
"epoch": 0.7298444130127298,
"grad_norm": 0.1059621125459671,
"learning_rate": 4.903834726061565e-05,
"loss": 11.8175,
"step": 258
},
{
"epoch": 0.7383309759547383,
"grad_norm": 0.07910209894180298,
"learning_rate": 4.807705027948008e-05,
"loss": 11.8104,
"step": 261
},
{
"epoch": 0.7468175388967468,
"grad_norm": 0.11326657980680466,
"learning_rate": 4.711646468323129e-05,
"loss": 11.8178,
"step": 264
},
{
"epoch": 0.7553041018387553,
"grad_norm": 0.15818099677562714,
"learning_rate": 4.6156945835334184e-05,
"loss": 11.8152,
"step": 267
},
{
"epoch": 0.7637906647807637,
"grad_norm": 0.08803381770849228,
"learning_rate": 4.5198848704615914e-05,
"loss": 11.8165,
"step": 270
},
{
"epoch": 0.7722772277227723,
"grad_norm": 0.13214413821697235,
"learning_rate": 4.424252773394704e-05,
"loss": 11.8183,
"step": 273
},
{
"epoch": 0.7807637906647807,
"grad_norm": 0.0714588537812233,
"learning_rate": 4.328833670911724e-05,
"loss": 11.8207,
"step": 276
},
{
"epoch": 0.7892503536067893,
"grad_norm": 0.3612169325351715,
"learning_rate": 4.23366286279542e-05,
"loss": 11.81,
"step": 279
},
{
"epoch": 0.7977369165487977,
"grad_norm": 0.10489798337221146,
"learning_rate": 4.138775556973406e-05,
"loss": 11.8155,
"step": 282
},
{
"epoch": 0.8062234794908062,
"grad_norm": 0.07869977504014969,
"learning_rate": 4.04420685649314e-05,
"loss": 11.8183,
"step": 285
},
{
"epoch": 0.8147100424328148,
"grad_norm": 0.1785990297794342,
"learning_rate": 3.9499917465357534e-05,
"loss": 11.8151,
"step": 288
},
{
"epoch": 0.8231966053748232,
"grad_norm": 0.08096058666706085,
"learning_rate": 3.856165081473474e-05,
"loss": 11.8138,
"step": 291
},
{
"epoch": 0.8316831683168316,
"grad_norm": 0.11269628256559372,
"learning_rate": 3.762761571975429e-05,
"loss": 11.8173,
"step": 294
},
{
"epoch": 0.8316831683168316,
"eval_loss": 11.817937850952148,
"eval_runtime": 6.604,
"eval_samples_per_second": 90.248,
"eval_steps_per_second": 11.357,
"step": 294
},
{
"epoch": 0.8401697312588402,
"grad_norm": 0.11685353517532349,
"learning_rate": 3.6698157721666246e-05,
"loss": 11.8211,
"step": 297
},
{
"epoch": 0.8486562942008486,
"grad_norm": 0.10467521101236343,
"learning_rate": 3.5773620668448384e-05,
"loss": 11.8131,
"step": 300
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.08696059882640839,
"learning_rate": 3.48543465876014e-05,
"loss": 11.8151,
"step": 303
},
{
"epoch": 0.8656294200848657,
"grad_norm": 0.21422286331653595,
"learning_rate": 3.3940675559617724e-05,
"loss": 11.817,
"step": 306
},
{
"epoch": 0.8741159830268741,
"grad_norm": 0.09098684042692184,
"learning_rate": 3.303294559217063e-05,
"loss": 11.8177,
"step": 309
},
{
"epoch": 0.8826025459688827,
"grad_norm": 0.11003749072551727,
"learning_rate": 3.213149249506997e-05,
"loss": 11.8127,
"step": 312
},
{
"epoch": 0.8910891089108911,
"grad_norm": 0.10718018561601639,
"learning_rate": 3.12366497560313e-05,
"loss": 11.8152,
"step": 315
},
{
"epoch": 0.8995756718528995,
"grad_norm": 0.058545950800180435,
"learning_rate": 3.0348748417303823e-05,
"loss": 11.8117,
"step": 318
},
{
"epoch": 0.9080622347949081,
"grad_norm": 0.15917275846004486,
"learning_rate": 2.9468116953203107e-05,
"loss": 11.816,
"step": 321
},
{
"epoch": 0.9165487977369166,
"grad_norm": 0.08903782814741135,
"learning_rate": 2.8595081148593738e-05,
"loss": 11.8119,
"step": 324
},
{
"epoch": 0.925035360678925,
"grad_norm": 0.10488823056221008,
"learning_rate": 2.772996397836704e-05,
"loss": 11.8185,
"step": 327
},
{
"epoch": 0.9335219236209336,
"grad_norm": 0.09897799789905548,
"learning_rate": 2.687308548795825e-05,
"loss": 11.8194,
"step": 330
},
{
"epoch": 0.942008486562942,
"grad_norm": 0.10232014954090118,
"learning_rate": 2.6024762674947313e-05,
"loss": 11.8139,
"step": 333
},
{
"epoch": 0.9504950495049505,
"grad_norm": 0.07098989188671112,
"learning_rate": 2.5185309371787513e-05,
"loss": 11.8159,
"step": 336
},
{
"epoch": 0.9504950495049505,
"eval_loss": 11.817768096923828,
"eval_runtime": 6.6321,
"eval_samples_per_second": 89.866,
"eval_steps_per_second": 11.309,
"step": 336
},
{
"epoch": 0.958981612446959,
"grad_norm": 0.1141035333275795,
"learning_rate": 2.43550361297047e-05,
"loss": 11.8135,
"step": 339
},
{
"epoch": 0.9674681753889675,
"grad_norm": 0.15841831266880035,
"learning_rate": 2.353425010381063e-05,
"loss": 11.8191,
"step": 342
},
{
"epoch": 0.9759547383309759,
"grad_norm": 0.1034071147441864,
"learning_rate": 2.272325493947257e-05,
"loss": 11.8196,
"step": 345
},
{
"epoch": 0.9844413012729845,
"grad_norm": 0.1345403790473938,
"learning_rate": 2.192235065998126e-05,
"loss": 11.8162,
"step": 348
},
{
"epoch": 0.9929278642149929,
"grad_norm": 0.06921133399009705,
"learning_rate": 2.1131833555559037e-05,
"loss": 11.8178,
"step": 351
},
{
"epoch": 1.0014144271570014,
"grad_norm": 0.22766686975955963,
"learning_rate": 2.0351996073748713e-05,
"loss": 13.7864,
"step": 354
},
{
"epoch": 1.00990099009901,
"grad_norm": 0.15477143228054047,
"learning_rate": 1.9583126711224343e-05,
"loss": 11.817,
"step": 357
},
{
"epoch": 1.0183875530410185,
"grad_norm": 0.23170539736747742,
"learning_rate": 1.8825509907063327e-05,
"loss": 11.8147,
"step": 360
},
{
"epoch": 1.0268741159830268,
"grad_norm": 0.06929183006286621,
"learning_rate": 1.807942593751973e-05,
"loss": 11.8198,
"step": 363
},
{
"epoch": 1.0353606789250354,
"grad_norm": 0.15376947820186615,
"learning_rate": 1.7345150812337564e-05,
"loss": 11.8151,
"step": 366
},
{
"epoch": 1.043847241867044,
"grad_norm": 0.2093440443277359,
"learning_rate": 1.66229561726426e-05,
"loss": 11.8205,
"step": 369
},
{
"epoch": 1.0523338048090523,
"grad_norm": 0.1123834103345871,
"learning_rate": 1.5913109190450032e-05,
"loss": 11.8159,
"step": 372
},
{
"epoch": 1.0608203677510608,
"grad_norm": 0.22159625589847565,
"learning_rate": 1.5215872469825682e-05,
"loss": 11.8144,
"step": 375
},
{
"epoch": 1.0693069306930694,
"grad_norm": 0.12495719641447067,
"learning_rate": 1.4531503949737108e-05,
"loss": 11.8175,
"step": 378
},
{
"epoch": 1.0693069306930694,
"eval_loss": 11.817733764648438,
"eval_runtime": 6.6332,
"eval_samples_per_second": 89.851,
"eval_steps_per_second": 11.307,
"step": 378
},
{
"epoch": 1.0777934936350777,
"grad_norm": 0.12858428061008453,
"learning_rate": 1.3860256808630428e-05,
"loss": 11.8168,
"step": 381
},
{
"epoch": 1.0862800565770863,
"grad_norm": 0.1034870445728302,
"learning_rate": 1.3202379370768252e-05,
"loss": 11.8184,
"step": 384
},
{
"epoch": 1.0947666195190948,
"grad_norm": 0.19289083778858185,
"learning_rate": 1.2558115014363592e-05,
"loss": 11.8211,
"step": 387
},
{
"epoch": 1.1032531824611032,
"grad_norm": 0.15419146418571472,
"learning_rate": 1.1927702081543279e-05,
"loss": 11.8142,
"step": 390
},
{
"epoch": 1.1117397454031117,
"grad_norm": 0.15567056834697723,
"learning_rate": 1.1311373790174657e-05,
"loss": 11.8155,
"step": 393
},
{
"epoch": 1.1202263083451203,
"grad_norm": 0.13290712237358093,
"learning_rate": 1.0709358147587884e-05,
"loss": 11.8208,
"step": 396
},
{
"epoch": 1.1287128712871288,
"grad_norm": 0.09749293327331543,
"learning_rate": 1.0121877866225781e-05,
"loss": 11.8177,
"step": 399
},
{
"epoch": 1.1371994342291372,
"grad_norm": 0.120842345058918,
"learning_rate": 9.549150281252633e-06,
"loss": 11.8156,
"step": 402
},
{
"epoch": 1.1456859971711457,
"grad_norm": 0.09248703718185425,
"learning_rate": 8.991387270152201e-06,
"loss": 11.8186,
"step": 405
},
{
"epoch": 1.154172560113154,
"grad_norm": 0.12557213008403778,
"learning_rate": 8.448795174344804e-06,
"loss": 11.8199,
"step": 408
},
{
"epoch": 1.1626591230551626,
"grad_norm": 0.17817071080207825,
"learning_rate": 7.921574722852343e-06,
"loss": 11.8154,
"step": 411
},
{
"epoch": 1.1711456859971712,
"grad_norm": 0.10258757323026657,
"learning_rate": 7.409920958039795e-06,
"loss": 11.8124,
"step": 414
},
{
"epoch": 1.1796322489391797,
"grad_norm": 0.16313178837299347,
"learning_rate": 6.9140231634602485e-06,
"loss": 11.815,
"step": 417
},
{
"epoch": 1.188118811881188,
"grad_norm": 0.10050709545612335,
"learning_rate": 6.43406479383053e-06,
"loss": 11.8149,
"step": 420
},
{
"epoch": 1.188118811881188,
"eval_loss": 11.817734718322754,
"eval_runtime": 6.4421,
"eval_samples_per_second": 92.516,
"eval_steps_per_second": 11.642,
"step": 420
},
{
"epoch": 1.1966053748231966,
"grad_norm": 0.09405164420604706,
"learning_rate": 5.9702234071631e-06,
"loss": 11.8103,
"step": 423
},
{
"epoch": 1.2050919377652052,
"grad_norm": 0.07892228662967682,
"learning_rate": 5.5226705990794155e-06,
"loss": 11.8166,
"step": 426
},
{
"epoch": 1.2135785007072135,
"grad_norm": 0.09944994747638702,
"learning_rate": 5.091571939329048e-06,
"loss": 11.8157,
"step": 429
},
{
"epoch": 1.222065063649222,
"grad_norm": 0.09870926290750504,
"learning_rate": 4.677086910538092e-06,
"loss": 11.8141,
"step": 432
},
{
"epoch": 1.2305516265912306,
"grad_norm": 0.16009728610515594,
"learning_rate": 4.279368849209381e-06,
"loss": 11.8126,
"step": 435
},
{
"epoch": 1.239038189533239,
"grad_norm": 0.10148247331380844,
"learning_rate": 3.898564888996476e-06,
"loss": 11.8173,
"step": 438
},
{
"epoch": 1.2475247524752475,
"grad_norm": 0.057802699506282806,
"learning_rate": 3.534815906272404e-06,
"loss": 11.8136,
"step": 441
},
{
"epoch": 1.256011315417256,
"grad_norm": 0.059853699058294296,
"learning_rate": 3.18825646801314e-06,
"loss": 11.8178,
"step": 444
},
{
"epoch": 1.2644978783592644,
"grad_norm": 0.11142271012067795,
"learning_rate": 2.8590147820153513e-06,
"loss": 11.8121,
"step": 447
},
{
"epoch": 1.272984441301273,
"grad_norm": 0.12580835819244385,
"learning_rate": 2.547212649466568e-06,
"loss": 11.8092,
"step": 450
},
{
"epoch": 1.2814710042432815,
"grad_norm": 0.09984395653009415,
"learning_rate": 2.2529654198854835e-06,
"loss": 11.8177,
"step": 453
},
{
"epoch": 1.28995756718529,
"grad_norm": 0.06916932761669159,
"learning_rate": 1.9763819484490355e-06,
"loss": 11.8155,
"step": 456
},
{
"epoch": 1.2984441301272984,
"grad_norm": 0.15921108424663544,
"learning_rate": 1.7175645557220566e-06,
"loss": 11.813,
"step": 459
},
{
"epoch": 1.306930693069307,
"grad_norm": 0.08516174554824829,
"learning_rate": 1.4766089898042678e-06,
"loss": 11.81,
"step": 462
},
{
"epoch": 1.306930693069307,
"eval_loss": 11.817733764648438,
"eval_runtime": 6.4426,
"eval_samples_per_second": 92.509,
"eval_steps_per_second": 11.641,
"step": 462
},
{
"epoch": 1.3154172560113153,
"grad_norm": 0.09070340543985367,
"learning_rate": 1.2536043909088191e-06,
"loss": 11.8145,
"step": 465
},
{
"epoch": 1.3239038189533239,
"grad_norm": 0.0519736111164093,
"learning_rate": 1.0486332583853563e-06,
"loss": 11.8139,
"step": 468
},
{
"epoch": 1.3323903818953324,
"grad_norm": 0.09727457165718079,
"learning_rate": 8.617714201998084e-07,
"loss": 11.8145,
"step": 471
},
{
"epoch": 1.340876944837341,
"grad_norm": 0.0795770063996315,
"learning_rate": 6.93088004882253e-07,
"loss": 11.8154,
"step": 474
},
{
"epoch": 1.3493635077793493,
"grad_norm": 0.09474173188209534,
"learning_rate": 5.426454159531913e-07,
"loss": 11.8169,
"step": 477
},
{
"epoch": 1.3578500707213579,
"grad_norm": 0.13217829167842865,
"learning_rate": 4.104993088376974e-07,
"loss": 11.8165,
"step": 480
},
{
"epoch": 1.3663366336633662,
"grad_norm": 0.11465727537870407,
"learning_rate": 2.966985702759828e-07,
"loss": 11.8173,
"step": 483
},
{
"epoch": 1.3748231966053748,
"grad_norm": 0.12922833859920502,
"learning_rate": 2.012853002380466e-07,
"loss": 11.8154,
"step": 486
},
{
"epoch": 1.3833097595473833,
"grad_norm": 0.05712132155895233,
"learning_rate": 1.2429479634897267e-07,
"loss": 11.814,
"step": 489
},
{
"epoch": 1.391796322489392,
"grad_norm": 0.09231238812208176,
"learning_rate": 6.575554083078084e-08,
"loss": 11.8135,
"step": 492
},
{
"epoch": 1.4002828854314002,
"grad_norm": 0.08613143861293793,
"learning_rate": 2.568918996560532e-08,
"loss": 11.819,
"step": 495
},
{
"epoch": 1.4087694483734088,
"grad_norm": 0.11459596455097198,
"learning_rate": 4.110566084036816e-09,
"loss": 11.8119,
"step": 498
}
],
"logging_steps": 3,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 42,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3067553218560.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}