{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 808,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024752475247524753,
"grad_norm": 38.88609991114745,
"learning_rate": 2.469135802469136e-07,
"loss": 1.8604,
"step": 1
},
{
"epoch": 0.012376237623762377,
"grad_norm": 41.27939914036261,
"learning_rate": 1.234567901234568e-06,
"loss": 1.8561,
"step": 5
},
{
"epoch": 0.024752475247524754,
"grad_norm": 5.095317934955837,
"learning_rate": 2.469135802469136e-06,
"loss": 1.6195,
"step": 10
},
{
"epoch": 0.03712871287128713,
"grad_norm": 2.761116338631256,
"learning_rate": 3.7037037037037037e-06,
"loss": 1.3811,
"step": 15
},
{
"epoch": 0.04950495049504951,
"grad_norm": 1.6927067748367093,
"learning_rate": 4.938271604938272e-06,
"loss": 1.1736,
"step": 20
},
{
"epoch": 0.06188118811881188,
"grad_norm": 1.637997489475765,
"learning_rate": 6.17283950617284e-06,
"loss": 1.0247,
"step": 25
},
{
"epoch": 0.07425742574257425,
"grad_norm": 0.9921453622709451,
"learning_rate": 7.4074074074074075e-06,
"loss": 1.0275,
"step": 30
},
{
"epoch": 0.08663366336633663,
"grad_norm": 0.9409275181475548,
"learning_rate": 8.641975308641975e-06,
"loss": 0.9832,
"step": 35
},
{
"epoch": 0.09900990099009901,
"grad_norm": 1.0325275844641013,
"learning_rate": 9.876543209876543e-06,
"loss": 0.9685,
"step": 40
},
{
"epoch": 0.11138613861386139,
"grad_norm": 1.1218773632200145,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.9528,
"step": 45
},
{
"epoch": 0.12376237623762376,
"grad_norm": 0.9572180193224005,
"learning_rate": 1.234567901234568e-05,
"loss": 0.9201,
"step": 50
},
{
"epoch": 0.13613861386138615,
"grad_norm": 1.0009650827934435,
"learning_rate": 1.3580246913580248e-05,
"loss": 0.9542,
"step": 55
},
{
"epoch": 0.1485148514851485,
"grad_norm": 1.3065165374147407,
"learning_rate": 1.4814814814814815e-05,
"loss": 0.9437,
"step": 60
},
{
"epoch": 0.1608910891089109,
"grad_norm": 1.0022123167697776,
"learning_rate": 1.6049382716049385e-05,
"loss": 0.9228,
"step": 65
},
{
"epoch": 0.17326732673267325,
"grad_norm": 1.2184130543835685,
"learning_rate": 1.728395061728395e-05,
"loss": 0.9772,
"step": 70
},
{
"epoch": 0.18564356435643564,
"grad_norm": 0.9408942641640691,
"learning_rate": 1.851851851851852e-05,
"loss": 0.9284,
"step": 75
},
{
"epoch": 0.19801980198019803,
"grad_norm": 0.9712584145776969,
"learning_rate": 1.9753086419753087e-05,
"loss": 0.9484,
"step": 80
},
{
"epoch": 0.2103960396039604,
"grad_norm": 0.9980192527761912,
"learning_rate": 1.999850613931615e-05,
"loss": 0.9536,
"step": 85
},
{
"epoch": 0.22277227722772278,
"grad_norm": 0.9890748697192007,
"learning_rate": 1.9992438095219886e-05,
"loss": 0.9539,
"step": 90
},
{
"epoch": 0.23514851485148514,
"grad_norm": 0.9155960993755162,
"learning_rate": 1.9981705331953295e-05,
"loss": 0.936,
"step": 95
},
{
"epoch": 0.24752475247524752,
"grad_norm": 0.8051611539712368,
"learning_rate": 1.996631285983779e-05,
"loss": 0.9456,
"step": 100
},
{
"epoch": 0.2599009900990099,
"grad_norm": 0.9230091255662991,
"learning_rate": 1.9946267864463027e-05,
"loss": 0.9136,
"step": 105
},
{
"epoch": 0.2722772277227723,
"grad_norm": 0.8382334460112006,
"learning_rate": 1.9921579703332475e-05,
"loss": 0.93,
"step": 110
},
{
"epoch": 0.28465346534653463,
"grad_norm": 0.8774240428827245,
"learning_rate": 1.989225990149512e-05,
"loss": 0.9245,
"step": 115
},
{
"epoch": 0.297029702970297,
"grad_norm": 0.8987251464829349,
"learning_rate": 1.9858322146165272e-05,
"loss": 0.9341,
"step": 120
},
{
"epoch": 0.3094059405940594,
"grad_norm": 0.9049306018652525,
"learning_rate": 1.981978228033304e-05,
"loss": 0.9257,
"step": 125
},
{
"epoch": 0.3217821782178218,
"grad_norm": 0.9029524926074908,
"learning_rate": 1.977665829536842e-05,
"loss": 0.916,
"step": 130
},
{
"epoch": 0.3341584158415842,
"grad_norm": 0.8591727043131795,
"learning_rate": 1.9728970322622485e-05,
"loss": 0.9168,
"step": 135
},
{
"epoch": 0.3465346534653465,
"grad_norm": 0.8712548282993846,
"learning_rate": 1.9676740624029566e-05,
"loss": 0.9347,
"step": 140
},
{
"epoch": 0.3589108910891089,
"grad_norm": 0.8442378198048982,
"learning_rate": 1.961999358171482e-05,
"loss": 0.9286,
"step": 145
},
{
"epoch": 0.3712871287128713,
"grad_norm": 0.8448891617644472,
"learning_rate": 1.955875568661206e-05,
"loss": 0.9247,
"step": 150
},
{
"epoch": 0.38366336633663367,
"grad_norm": 0.8506188708401377,
"learning_rate": 1.94930555260971e-05,
"loss": 0.9089,
"step": 155
},
{
"epoch": 0.39603960396039606,
"grad_norm": 0.7615649650166222,
"learning_rate": 1.9422923770642494e-05,
"loss": 0.9121,
"step": 160
},
{
"epoch": 0.4084158415841584,
"grad_norm": 0.8438076926475072,
"learning_rate": 1.934839315949976e-05,
"loss": 0.9133,
"step": 165
},
{
"epoch": 0.4207920792079208,
"grad_norm": 0.9246754527016784,
"learning_rate": 1.9269498485415897e-05,
"loss": 0.9298,
"step": 170
},
{
"epoch": 0.43316831683168316,
"grad_norm": 0.9627433039386685,
"learning_rate": 1.9186276578391268e-05,
"loss": 0.9421,
"step": 175
},
{
"epoch": 0.44554455445544555,
"grad_norm": 0.8087976752191404,
"learning_rate": 1.9098766288486426e-05,
"loss": 0.9261,
"step": 180
},
{
"epoch": 0.45792079207920794,
"grad_norm": 0.774521835464554,
"learning_rate": 1.9007008467685947e-05,
"loss": 0.9328,
"step": 185
},
{
"epoch": 0.47029702970297027,
"grad_norm": 0.7725284205468596,
"learning_rate": 1.8911045950827693e-05,
"loss": 0.9093,
"step": 190
},
{
"epoch": 0.48267326732673266,
"grad_norm": 0.8054030551821266,
"learning_rate": 1.881092353560646e-05,
"loss": 0.9013,
"step": 195
},
{
"epoch": 0.49504950495049505,
"grad_norm": 0.7758400337225813,
"learning_rate": 1.870668796166129e-05,
"loss": 0.9142,
"step": 200
},
{
"epoch": 0.5074257425742574,
"grad_norm": 0.8158556574805117,
"learning_rate": 1.8598387888756224e-05,
"loss": 0.9222,
"step": 205
},
{
"epoch": 0.5198019801980198,
"grad_norm": 0.7595417385088049,
"learning_rate": 1.8486073874064745e-05,
"loss": 0.9061,
"step": 210
},
{
"epoch": 0.5321782178217822,
"grad_norm": 0.7472799231884507,
"learning_rate": 1.8369798348568403e-05,
"loss": 0.9083,
"step": 215
},
{
"epoch": 0.5445544554455446,
"grad_norm": 0.7606289538163604,
"learning_rate": 1.8249615592580733e-05,
"loss": 0.9328,
"step": 220
},
{
"epoch": 0.556930693069307,
"grad_norm": 0.805186253739664,
"learning_rate": 1.8125581710407864e-05,
"loss": 0.9138,
"step": 225
},
{
"epoch": 0.5693069306930693,
"grad_norm": 0.767139039892534,
"learning_rate": 1.7997754604157607e-05,
"loss": 0.9075,
"step": 230
},
{
"epoch": 0.5816831683168316,
"grad_norm": 0.7674924368854081,
"learning_rate": 1.786619394670933e-05,
"loss": 0.9094,
"step": 235
},
{
"epoch": 0.594059405940594,
"grad_norm": 0.7416157345263308,
"learning_rate": 1.7730961153857155e-05,
"loss": 0.9146,
"step": 240
},
{
"epoch": 0.6064356435643564,
"grad_norm": 0.7972512834290945,
"learning_rate": 1.7592119355639545e-05,
"loss": 0.8986,
"step": 245
},
{
"epoch": 0.6188118811881188,
"grad_norm": 0.7413943937305464,
"learning_rate": 1.744973336686862e-05,
"loss": 0.9261,
"step": 250
},
{
"epoch": 0.6311881188118812,
"grad_norm": 0.7288799636758897,
"learning_rate": 1.7303869656872994e-05,
"loss": 0.9004,
"step": 255
},
{
"epoch": 0.6435643564356436,
"grad_norm": 0.7554557666231643,
"learning_rate": 1.715459631846824e-05,
"loss": 0.9097,
"step": 260
},
{
"epoch": 0.655940594059406,
"grad_norm": 0.7267320540898269,
"learning_rate": 1.700198303616944e-05,
"loss": 0.902,
"step": 265
},
{
"epoch": 0.6683168316831684,
"grad_norm": 0.6823040717527771,
"learning_rate": 1.684610105366076e-05,
"loss": 0.8913,
"step": 270
},
{
"epoch": 0.6806930693069307,
"grad_norm": 0.6787438356187852,
"learning_rate": 1.6687023140537082e-05,
"loss": 0.882,
"step": 275
},
{
"epoch": 0.693069306930693,
"grad_norm": 0.7420049831517054,
"learning_rate": 1.6524823558333362e-05,
"loss": 0.8985,
"step": 280
},
{
"epoch": 0.7054455445544554,
"grad_norm": 0.7627405635266166,
"learning_rate": 1.6359578025857495e-05,
"loss": 0.8836,
"step": 285
},
{
"epoch": 0.7178217821782178,
"grad_norm": 0.8778998231303615,
"learning_rate": 1.6191363683842883e-05,
"loss": 0.8871,
"step": 290
},
{
"epoch": 0.7301980198019802,
"grad_norm": 0.8167069067691852,
"learning_rate": 1.6020259058937228e-05,
"loss": 0.8866,
"step": 295
},
{
"epoch": 0.7425742574257426,
"grad_norm": 0.7980390861743758,
"learning_rate": 1.5846344027044307e-05,
"loss": 0.9047,
"step": 300
},
{
"epoch": 0.754950495049505,
"grad_norm": 0.7563819762216695,
"learning_rate": 1.5669699776035958e-05,
"loss": 0.921,
"step": 305
},
{
"epoch": 0.7673267326732673,
"grad_norm": 0.7797762078681911,
"learning_rate": 1.5490408767851506e-05,
"loss": 0.8869,
"step": 310
},
{
"epoch": 0.7797029702970297,
"grad_norm": 0.7666252485199382,
"learning_rate": 1.530855470000251e-05,
"loss": 0.9151,
"step": 315
},
{
"epoch": 0.7920792079207921,
"grad_norm": 0.7051563046414802,
"learning_rate": 1.5124222466500665e-05,
"loss": 0.9024,
"step": 320
},
{
"epoch": 0.8044554455445545,
"grad_norm": 0.7203691895762679,
"learning_rate": 1.4937498118227156e-05,
"loss": 0.9098,
"step": 325
},
{
"epoch": 0.8168316831683168,
"grad_norm": 0.7125293698622911,
"learning_rate": 1.4748468822761974e-05,
"loss": 0.9076,
"step": 330
},
{
"epoch": 0.8292079207920792,
"grad_norm": 0.7428102709759479,
"learning_rate": 1.4557222823691913e-05,
"loss": 0.9082,
"step": 335
},
{
"epoch": 0.8415841584158416,
"grad_norm": 0.7230219909417678,
"learning_rate": 1.4363849399416254e-05,
"loss": 0.9004,
"step": 340
},
{
"epoch": 0.8539603960396039,
"grad_norm": 0.788928114591889,
"learning_rate": 1.4168438821469402e-05,
"loss": 0.8845,
"step": 345
},
{
"epoch": 0.8663366336633663,
"grad_norm": 0.7456926963995498,
"learning_rate": 1.3971082312379864e-05,
"loss": 0.9013,
"step": 350
},
{
"epoch": 0.8787128712871287,
"grad_norm": 0.7757067249831519,
"learning_rate": 1.3771872003085315e-05,
"loss": 0.8913,
"step": 355
},
{
"epoch": 0.8910891089108911,
"grad_norm": 0.7040237849520753,
"learning_rate": 1.3570900889923566e-05,
"loss": 0.9178,
"step": 360
},
{
"epoch": 0.9034653465346535,
"grad_norm": 0.7028463213970978,
"learning_rate": 1.3368262791219568e-05,
"loss": 0.8864,
"step": 365
},
{
"epoch": 0.9158415841584159,
"grad_norm": 0.6865489126472716,
"learning_rate": 1.3164052303488673e-05,
"loss": 0.8958,
"step": 370
},
{
"epoch": 0.9282178217821783,
"grad_norm": 0.7124826316517546,
"learning_rate": 1.2958364757276616e-05,
"loss": 0.8927,
"step": 375
},
{
"epoch": 0.9405940594059405,
"grad_norm": 0.71875227320249,
"learning_rate": 1.2751296172656862e-05,
"loss": 0.897,
"step": 380
},
{
"epoch": 0.9529702970297029,
"grad_norm": 0.7298632107047348,
"learning_rate": 1.2542943214406012e-05,
"loss": 0.9051,
"step": 385
},
{
"epoch": 0.9653465346534653,
"grad_norm": 0.6885898822393058,
"learning_rate": 1.23334031468783e-05,
"loss": 0.8546,
"step": 390
},
{
"epoch": 0.9777227722772277,
"grad_norm": 0.7078778884772863,
"learning_rate": 1.2122773788600164e-05,
"loss": 0.9019,
"step": 395
},
{
"epoch": 0.9900990099009901,
"grad_norm": 0.7533302944336393,
"learning_rate": 1.1911153466606105e-05,
"loss": 0.895,
"step": 400
},
{
"epoch": 1.0,
"eval_loss": 1.06302809715271,
"eval_runtime": 43.2706,
"eval_samples_per_second": 75.871,
"eval_steps_per_second": 1.202,
"step": 404
},
{
"epoch": 1.0024752475247525,
"grad_norm": 1.3960472434954319,
"learning_rate": 1.1698640970537195e-05,
"loss": 0.8479,
"step": 405
},
{
"epoch": 1.0148514851485149,
"grad_norm": 0.8222898474391185,
"learning_rate": 1.14853355065236e-05,
"loss": 0.7492,
"step": 410
},
{
"epoch": 1.0272277227722773,
"grad_norm": 1.146533181549737,
"learning_rate": 1.1271336650872687e-05,
"loss": 0.7352,
"step": 415
},
{
"epoch": 1.0396039603960396,
"grad_norm": 1.2500390362477898,
"learning_rate": 1.1056744303584322e-05,
"loss": 0.7107,
"step": 420
},
{
"epoch": 1.051980198019802,
"grad_norm": 0.8882156471951722,
"learning_rate": 1.0841658641715064e-05,
"loss": 0.7027,
"step": 425
},
{
"epoch": 1.0643564356435644,
"grad_norm": 0.8783518588023339,
"learning_rate": 1.0626180072613011e-05,
"loss": 0.7199,
"step": 430
},
{
"epoch": 1.0767326732673268,
"grad_norm": 0.7956936034755372,
"learning_rate": 1.0410409187045145e-05,
"loss": 0.6972,
"step": 435
},
{
"epoch": 1.0891089108910892,
"grad_norm": 0.7957451930016578,
"learning_rate": 1.0194446712239076e-05,
"loss": 0.7194,
"step": 440
},
{
"epoch": 1.1014851485148516,
"grad_norm": 0.8085539993808099,
"learning_rate": 9.978393464861036e-06,
"loss": 0.7085,
"step": 445
},
{
"epoch": 1.113861386138614,
"grad_norm": 0.7629426599829563,
"learning_rate": 9.76235030395215e-06,
"loss": 0.7221,
"step": 450
},
{
"epoch": 1.1262376237623761,
"grad_norm": 0.7099439929036686,
"learning_rate": 9.546418083844944e-06,
"loss": 0.7228,
"step": 455
},
{
"epoch": 1.1386138613861387,
"grad_norm": 0.768143602885412,
"learning_rate": 9.330697607081995e-06,
"loss": 0.7055,
"step": 460
},
{
"epoch": 1.150990099009901,
"grad_norm": 0.7399932448725586,
"learning_rate": 9.115289577358826e-06,
"loss": 0.7126,
"step": 465
},
{
"epoch": 1.1633663366336633,
"grad_norm": 0.7720020506192959,
"learning_rate": 8.900294552512878e-06,
"loss": 0.7095,
"step": 470
},
{
"epoch": 1.1757425742574257,
"grad_norm": 0.7452407090275405,
"learning_rate": 8.68581289758063e-06,
"loss": 0.7028,
"step": 475
},
{
"epoch": 1.188118811881188,
"grad_norm": 0.7571246453363882,
"learning_rate": 8.471944737944687e-06,
"loss": 0.7184,
"step": 480
},
{
"epoch": 1.2004950495049505,
"grad_norm": 0.7494453898237526,
"learning_rate": 8.25878991259276e-06,
"loss": 0.6982,
"step": 485
},
{
"epoch": 1.2128712871287128,
"grad_norm": 0.7254138409040641,
"learning_rate": 8.046447927510335e-06,
"loss": 0.7175,
"step": 490
},
{
"epoch": 1.2252475247524752,
"grad_norm": 0.7709180774648441,
"learning_rate": 7.835017909228801e-06,
"loss": 0.7075,
"step": 495
},
{
"epoch": 1.2376237623762376,
"grad_norm": 0.760113272357165,
"learning_rate": 7.624598558550707e-06,
"loss": 0.7224,
"step": 500
},
{
"epoch": 1.25,
"grad_norm": 0.7251746178009261,
"learning_rate": 7.415288104473774e-06,
"loss": 0.7059,
"step": 505
},
{
"epoch": 1.2623762376237624,
"grad_norm": 0.7604221994294873,
"learning_rate": 7.207184258335163e-06,
"loss": 0.7022,
"step": 510
},
{
"epoch": 1.2747524752475248,
"grad_norm": 0.7947902441474564,
"learning_rate": 7.000384168197354e-06,
"loss": 0.7076,
"step": 515
},
{
"epoch": 1.2871287128712872,
"grad_norm": 0.7054449499686205,
"learning_rate": 6.7949843734970475e-06,
"loss": 0.7133,
"step": 520
},
{
"epoch": 1.2995049504950495,
"grad_norm": 0.708545411017501,
"learning_rate": 6.5910807599781135e-06,
"loss": 0.7106,
"step": 525
},
{
"epoch": 1.311881188118812,
"grad_norm": 0.7262363459114348,
"learning_rate": 6.388768514929768e-06,
"loss": 0.7114,
"step": 530
},
{
"epoch": 1.3242574257425743,
"grad_norm": 0.6998186137578639,
"learning_rate": 6.18814208275075e-06,
"loss": 0.7122,
"step": 535
},
{
"epoch": 1.3366336633663367,
"grad_norm": 0.7095860494984699,
"learning_rate": 5.989295120860334e-06,
"loss": 0.7252,
"step": 540
},
{
"epoch": 1.349009900990099,
"grad_norm": 0.6915165561447487,
"learning_rate": 5.792320455976714e-06,
"loss": 0.7048,
"step": 545
},
{
"epoch": 1.3613861386138613,
"grad_norm": 0.7119909691342722,
"learning_rate": 5.597310040783161e-06,
"loss": 0.6962,
"step": 550
},
{
"epoch": 1.3737623762376239,
"grad_norm": 0.7504970044336818,
"learning_rate": 5.404354911002243e-06,
"loss": 0.707,
"step": 555
},
{
"epoch": 1.386138613861386,
"grad_norm": 0.7146866752363458,
"learning_rate": 5.213545142898061e-06,
"loss": 0.7223,
"step": 560
},
{
"epoch": 1.3985148514851486,
"grad_norm": 0.7356980622491117,
"learning_rate": 5.024969811226419e-06,
"loss": 0.6998,
"step": 565
},
{
"epoch": 1.4108910891089108,
"grad_norm": 0.7061674308512749,
"learning_rate": 4.838716947652485e-06,
"loss": 0.6958,
"step": 570
},
{
"epoch": 1.4232673267326732,
"grad_norm": 0.7284088021218859,
"learning_rate": 4.654873499655449e-06,
"loss": 0.6964,
"step": 575
},
{
"epoch": 1.4356435643564356,
"grad_norm": 0.6886931942419792,
"learning_rate": 4.4735252899392335e-06,
"loss": 0.7023,
"step": 580
},
{
"epoch": 1.448019801980198,
"grad_norm": 0.7166576506174707,
"learning_rate": 4.294756976368351e-06,
"loss": 0.7069,
"step": 585
},
{
"epoch": 1.4603960396039604,
"grad_norm": 0.7032864356512767,
"learning_rate": 4.118652012447486e-06,
"loss": 0.7211,
"step": 590
},
{
"epoch": 1.4727722772277227,
"grad_norm": 0.7242483728321348,
"learning_rate": 3.945292608363312e-06,
"loss": 0.7119,
"step": 595
},
{
"epoch": 1.4851485148514851,
"grad_norm": 0.6918844982337927,
"learning_rate": 3.7747596926067485e-06,
"loss": 0.7221,
"step": 600
},
{
"epoch": 1.4975247524752475,
"grad_norm": 0.7132050642871574,
"learning_rate": 3.6071328741934985e-06,
"loss": 0.7022,
"step": 605
},
{
"epoch": 1.50990099009901,
"grad_norm": 0.7347264550601227,
"learning_rate": 3.442490405500598e-06,
"loss": 0.6975,
"step": 610
},
{
"epoch": 1.5222772277227723,
"grad_norm": 0.6602507777994507,
"learning_rate": 3.2809091457362464e-06,
"loss": 0.7065,
"step": 615
},
{
"epoch": 1.5346534653465347,
"grad_norm": 0.6740627682629335,
"learning_rate": 3.122464525060013e-06,
"loss": 0.6978,
"step": 620
},
{
"epoch": 1.547029702970297,
"grad_norm": 0.7142221920558823,
"learning_rate": 2.96723050937015e-06,
"loss": 0.709,
"step": 625
},
{
"epoch": 1.5594059405940595,
"grad_norm": 0.7335799725765082,
"learning_rate": 2.8152795657744882e-06,
"loss": 0.6893,
"step": 630
},
{
"epoch": 1.5717821782178216,
"grad_norm": 0.7032104638603995,
"learning_rate": 2.666682628760958e-06,
"loss": 0.6961,
"step": 635
},
{
"epoch": 1.5841584158415842,
"grad_norm": 0.6921897923437655,
"learning_rate": 2.521509067083631e-06,
"loss": 0.6938,
"step": 640
},
{
"epoch": 1.5965346534653464,
"grad_norm": 0.6863228632248336,
"learning_rate": 2.379826651379632e-06,
"loss": 0.7033,
"step": 645
},
{
"epoch": 1.608910891089109,
"grad_norm": 0.6778666475373859,
"learning_rate": 2.241701522532136e-06,
"loss": 0.7077,
"step": 650
},
{
"epoch": 1.6212871287128712,
"grad_norm": 0.978566748682469,
"learning_rate": 2.107198160794136e-06,
"loss": 0.7102,
"step": 655
},
{
"epoch": 1.6336633663366338,
"grad_norm": 0.7103004954900359,
"learning_rate": 1.9763793556874655e-06,
"loss": 0.6983,
"step": 660
},
{
"epoch": 1.646039603960396,
"grad_norm": 0.676195777058198,
"learning_rate": 1.849306176691088e-06,
"loss": 0.7176,
"step": 665
},
{
"epoch": 1.6584158415841586,
"grad_norm": 0.7079808939239,
"learning_rate": 1.7260379447323327e-06,
"loss": 0.6998,
"step": 670
},
{
"epoch": 1.6707920792079207,
"grad_norm": 0.6910118308885802,
"learning_rate": 1.6066322044944126e-06,
"loss": 0.6847,
"step": 675
},
{
"epoch": 1.6831683168316833,
"grad_norm": 0.6904569392825494,
"learning_rate": 1.4911446975531329e-06,
"loss": 0.7014,
"step": 680
},
{
"epoch": 1.6955445544554455,
"grad_norm": 0.6979836145792466,
"learning_rate": 1.3796293363553259e-06,
"loss": 0.7252,
"step": 685
},
{
"epoch": 1.7079207920792079,
"grad_norm": 0.6898195721719471,
"learning_rate": 1.2721381790511832e-06,
"loss": 0.7096,
"step": 690
},
{
"epoch": 1.7202970297029703,
"grad_norm": 0.7872710836498652,
"learning_rate": 1.168721405192218e-06,
"loss": 0.7118,
"step": 695
},
{
"epoch": 1.7326732673267327,
"grad_norm": 0.6351928901398578,
"learning_rate": 1.0694272923061933e-06,
"loss": 0.7073,
"step": 700
},
{
"epoch": 1.745049504950495,
"grad_norm": 0.6550150479810684,
"learning_rate": 9.743021933599695e-07,
"loss": 0.6879,
"step": 705
},
{
"epoch": 1.7574257425742574,
"grad_norm": 0.6762595095414403,
"learning_rate": 8.833905151207833e-07,
"loss": 0.6972,
"step": 710
},
{
"epoch": 1.7698019801980198,
"grad_norm": 0.7046793104440422,
"learning_rate": 7.967346974260626e-07,
"loss": 0.7119,
"step": 715
},
{
"epoch": 1.7821782178217822,
"grad_norm": 0.6803563399348503,
"learning_rate": 7.143751933714583e-07,
"loss": 0.7064,
"step": 720
},
{
"epoch": 1.7945544554455446,
"grad_norm": 0.6569063272572411,
"learning_rate": 6.363504504263207e-07,
"loss": 0.694,
"step": 725
},
{
"epoch": 1.806930693069307,
"grad_norm": 0.672487356470194,
"learning_rate": 5.626968924854714e-07,
"loss": 0.7133,
"step": 730
},
{
"epoch": 1.8193069306930694,
"grad_norm": 0.6612411849360775,
"learning_rate": 4.934489028656164e-07,
"loss": 0.6933,
"step": 735
},
{
"epoch": 1.8316831683168315,
"grad_norm": 0.7156254594176831,
"learning_rate": 4.2863880825435687e-07,
"loss": 0.7057,
"step": 740
},
{
"epoch": 1.8440594059405941,
"grad_norm": 0.6833411888683371,
"learning_rate": 3.682968636192863e-07,
"loss": 0.7112,
"step": 745
},
{
"epoch": 1.8564356435643563,
"grad_norm": 0.6969326586876218,
"learning_rate": 3.124512380842204e-07,
"loss": 0.6912,
"step": 750
},
{
"epoch": 1.868811881188119,
"grad_norm": 0.6742873155450232,
"learning_rate": 2.61128001779144e-07,
"loss": 0.6823,
"step": 755
},
{
"epoch": 1.881188118811881,
"grad_norm": 0.6414949104474531,
"learning_rate": 2.1435111367002826e-07,
"loss": 0.7126,
"step": 760
},
{
"epoch": 1.8935643564356437,
"grad_norm": 0.6525583867438978,
"learning_rate": 1.7214241037418312e-07,
"loss": 0.6969,
"step": 765
},
{
"epoch": 1.9059405940594059,
"grad_norm": 0.65521211994519,
"learning_rate": 1.345215959663837e-07,
"loss": 0.6903,
"step": 770
},
{
"epoch": 1.9183168316831685,
"grad_norm": 0.6944361037558262,
"learning_rate": 1.0150623278051719e-07,
"loss": 0.6912,
"step": 775
},
{
"epoch": 1.9306930693069306,
"grad_norm": 0.648743159889204,
"learning_rate": 7.311173321104648e-08,
"loss": 0.683,
"step": 780
},
{
"epoch": 1.943069306930693,
"grad_norm": 0.6925205112702544,
"learning_rate": 4.935135251811995e-08,
"loss": 0.6934,
"step": 785
},
{
"epoch": 1.9554455445544554,
"grad_norm": 0.6504666964480196,
"learning_rate": 3.023618263968797e-08,
"loss": 0.7186,
"step": 790
},
{
"epoch": 1.9678217821782178,
"grad_norm": 0.7265261251377658,
"learning_rate": 1.577514701350591e-08,
"loss": 0.7216,
"step": 795
},
{
"epoch": 1.9801980198019802,
"grad_norm": 0.6739382260611008,
"learning_rate": 5.97499641145416e-09,
"loss": 0.6928,
"step": 800
},
{
"epoch": 1.9925742574257426,
"grad_norm": 0.6660006595388417,
"learning_rate": 8.403057881067877e-10,
"loss": 0.6941,
"step": 805
},
{
"epoch": 2.0,
"eval_loss": 1.07817542552948,
"eval_runtime": 43.2932,
"eval_samples_per_second": 75.832,
"eval_steps_per_second": 1.201,
"step": 808
},
{
"epoch": 2.0,
"step": 808,
"total_flos": 169178761789440.0,
"train_loss": 0.8271688128461933,
"train_runtime": 2411.6836,
"train_samples_per_second": 21.427,
"train_steps_per_second": 0.335
}
],
"logging_steps": 5,
"max_steps": 808,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 169178761789440.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}