{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6788866259334692,
"eval_steps": 125,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013577732518669382,
"eval_loss": 1.933146357536316,
"eval_runtime": 7.8521,
"eval_samples_per_second": 158.046,
"eval_steps_per_second": 19.867,
"step": 1
},
{
"epoch": 0.006788866259334691,
"grad_norm": 0.1681375950574875,
"learning_rate": 5e-05,
"loss": 1.7982,
"step": 5
},
{
"epoch": 0.013577732518669382,
"grad_norm": 0.18828649818897247,
"learning_rate": 0.0001,
"loss": 1.7923,
"step": 10
},
{
"epoch": 0.020366598778004074,
"grad_norm": 0.2104337513446808,
"learning_rate": 9.99743108100344e-05,
"loss": 1.8487,
"step": 15
},
{
"epoch": 0.027155465037338764,
"grad_norm": 0.2100468873977661,
"learning_rate": 9.989726963751682e-05,
"loss": 1.7487,
"step": 20
},
{
"epoch": 0.03394433129667346,
"grad_norm": 0.21163082122802734,
"learning_rate": 9.976895564745991e-05,
"loss": 1.8533,
"step": 25
},
{
"epoch": 0.04073319755600815,
"grad_norm": 0.23047874867916107,
"learning_rate": 9.95895006911623e-05,
"loss": 1.7436,
"step": 30
},
{
"epoch": 0.04752206381534284,
"grad_norm": 0.2215753197669983,
"learning_rate": 9.935908917072252e-05,
"loss": 1.7594,
"step": 35
},
{
"epoch": 0.05431093007467753,
"grad_norm": 0.25571998953819275,
"learning_rate": 9.907795784955327e-05,
"loss": 1.6957,
"step": 40
},
{
"epoch": 0.06109979633401222,
"grad_norm": 0.22950182855129242,
"learning_rate": 9.874639560909117e-05,
"loss": 1.6263,
"step": 45
},
{
"epoch": 0.06788866259334692,
"grad_norm": 0.24856191873550415,
"learning_rate": 9.836474315195147e-05,
"loss": 1.6548,
"step": 50
},
{
"epoch": 0.0746775288526816,
"grad_norm": 0.23954300582408905,
"learning_rate": 9.793339265183303e-05,
"loss": 1.6122,
"step": 55
},
{
"epoch": 0.0814663951120163,
"grad_norm": 0.21579493582248688,
"learning_rate": 9.745278735053343e-05,
"loss": 1.6289,
"step": 60
},
{
"epoch": 0.08825526137135098,
"grad_norm": 0.2218601107597351,
"learning_rate": 9.692342110248802e-05,
"loss": 1.6328,
"step": 65
},
{
"epoch": 0.09504412763068568,
"grad_norm": 0.23692508041858673,
"learning_rate": 9.63458378673011e-05,
"loss": 1.5982,
"step": 70
},
{
"epoch": 0.10183299389002037,
"grad_norm": 0.1912090927362442,
"learning_rate": 9.572063115079063e-05,
"loss": 1.6324,
"step": 75
},
{
"epoch": 0.10862186014935506,
"grad_norm": 0.23047535121440887,
"learning_rate": 9.504844339512095e-05,
"loss": 1.4911,
"step": 80
},
{
"epoch": 0.11541072640868975,
"grad_norm": 0.20515207946300507,
"learning_rate": 9.432996531865002e-05,
"loss": 1.5588,
"step": 85
},
{
"epoch": 0.12219959266802444,
"grad_norm": 0.21330514550209045,
"learning_rate": 9.356593520616948e-05,
"loss": 1.4904,
"step": 90
},
{
"epoch": 0.12898845892735913,
"grad_norm": 0.23363173007965088,
"learning_rate": 9.275713815026731e-05,
"loss": 1.6034,
"step": 95
},
{
"epoch": 0.13577732518669383,
"grad_norm": 0.17454369366168976,
"learning_rate": 9.190440524459203e-05,
"loss": 1.5246,
"step": 100
},
{
"epoch": 0.1425661914460285,
"grad_norm": 0.18648678064346313,
"learning_rate": 9.10086127298478e-05,
"loss": 1.5047,
"step": 105
},
{
"epoch": 0.1493550577053632,
"grad_norm": 0.19136574864387512,
"learning_rate": 9.007068109339784e-05,
"loss": 1.5417,
"step": 110
},
{
"epoch": 0.1561439239646979,
"grad_norm": 0.22987370193004608,
"learning_rate": 8.90915741234015e-05,
"loss": 1.5261,
"step": 115
},
{
"epoch": 0.1629327902240326,
"grad_norm": 0.20558615028858185,
"learning_rate": 8.807229791845673e-05,
"loss": 1.5108,
"step": 120
},
{
"epoch": 0.1697216564833673,
"grad_norm": 0.22430284321308136,
"learning_rate": 8.701389985376578e-05,
"loss": 1.533,
"step": 125
},
{
"epoch": 0.1697216564833673,
"eval_loss": 1.5494850873947144,
"eval_runtime": 7.7808,
"eval_samples_per_second": 159.495,
"eval_steps_per_second": 20.049,
"step": 125
},
{
"epoch": 0.17651052274270196,
"grad_norm": 0.20106786489486694,
"learning_rate": 8.591746750488639e-05,
"loss": 1.538,
"step": 130
},
{
"epoch": 0.18329938900203666,
"grad_norm": 0.21852199733257294,
"learning_rate": 8.478412753017433e-05,
"loss": 1.5041,
"step": 135
},
{
"epoch": 0.19008825526137135,
"grad_norm": 0.21438615024089813,
"learning_rate": 8.361504451306585e-05,
"loss": 1.4955,
"step": 140
},
{
"epoch": 0.19687712152070605,
"grad_norm": 0.26983126997947693,
"learning_rate": 8.241141976538943e-05,
"loss": 1.5475,
"step": 145
},
{
"epoch": 0.20366598778004075,
"grad_norm": 0.21069645881652832,
"learning_rate": 8.117449009293668e-05,
"loss": 1.5053,
"step": 150
},
{
"epoch": 0.21045485403937542,
"grad_norm": 0.19532069563865662,
"learning_rate": 7.990552652456081e-05,
"loss": 1.5054,
"step": 155
},
{
"epoch": 0.2172437202987101,
"grad_norm": 0.24983985722064972,
"learning_rate": 7.860583300610849e-05,
"loss": 1.4672,
"step": 160
},
{
"epoch": 0.2240325865580448,
"grad_norm": 0.23071178793907166,
"learning_rate": 7.727674506052743e-05,
"loss": 1.4741,
"step": 165
},
{
"epoch": 0.2308214528173795,
"grad_norm": 0.22894631326198578,
"learning_rate": 7.591962841552627e-05,
"loss": 1.5716,
"step": 170
},
{
"epoch": 0.23761031907671418,
"grad_norm": 0.2204715609550476,
"learning_rate": 7.45358776001969e-05,
"loss": 1.4912,
"step": 175
},
{
"epoch": 0.24439918533604887,
"grad_norm": 0.20580367743968964,
"learning_rate": 7.312691451204178e-05,
"loss": 1.5067,
"step": 180
},
{
"epoch": 0.25118805159538354,
"grad_norm": 0.22472696006298065,
"learning_rate": 7.169418695587791e-05,
"loss": 1.5726,
"step": 185
},
{
"epoch": 0.25797691785471827,
"grad_norm": 0.27027320861816406,
"learning_rate": 7.023916715611969e-05,
"loss": 1.4581,
"step": 190
},
{
"epoch": 0.26476578411405294,
"grad_norm": 0.23604366183280945,
"learning_rate": 6.876335024396872e-05,
"loss": 1.471,
"step": 195
},
{
"epoch": 0.27155465037338766,
"grad_norm": 0.2352292388677597,
"learning_rate": 6.726825272106538e-05,
"loss": 1.4378,
"step": 200
},
{
"epoch": 0.27834351663272233,
"grad_norm": 0.2463415563106537,
"learning_rate": 6.575541090118105e-05,
"loss": 1.4849,
"step": 205
},
{
"epoch": 0.285132382892057,
"grad_norm": 0.2296827733516693,
"learning_rate": 6.422637933155162e-05,
"loss": 1.3912,
"step": 210
},
{
"epoch": 0.2919212491513917,
"grad_norm": 0.2640281915664673,
"learning_rate": 6.268272919547537e-05,
"loss": 1.4078,
"step": 215
},
{
"epoch": 0.2987101154107264,
"grad_norm": 0.31660768389701843,
"learning_rate": 6.112604669781572e-05,
"loss": 1.4462,
"step": 220
},
{
"epoch": 0.3054989816700611,
"grad_norm": 0.22085396945476532,
"learning_rate": 5.955793143506863e-05,
"loss": 1.4574,
"step": 225
},
{
"epoch": 0.3122878479293958,
"grad_norm": 0.24354016780853271,
"learning_rate": 5.7979994751668964e-05,
"loss": 1.5296,
"step": 230
},
{
"epoch": 0.31907671418873046,
"grad_norm": 0.24986490607261658,
"learning_rate": 5.6393858084225305e-05,
"loss": 1.4725,
"step": 235
},
{
"epoch": 0.3258655804480652,
"grad_norm": 0.2531109154224396,
"learning_rate": 5.480115129538409e-05,
"loss": 1.4542,
"step": 240
},
{
"epoch": 0.33265444670739985,
"grad_norm": 0.19950221478939056,
"learning_rate": 5.320351099903565e-05,
"loss": 1.4582,
"step": 245
},
{
"epoch": 0.3394433129667346,
"grad_norm": 0.2342836707830429,
"learning_rate": 5.1602578878582776e-05,
"loss": 1.4385,
"step": 250
},
{
"epoch": 0.3394433129667346,
"eval_loss": 1.4789527654647827,
"eval_runtime": 7.722,
"eval_samples_per_second": 160.71,
"eval_steps_per_second": 20.202,
"step": 250
},
{
"epoch": 0.34623217922606925,
"grad_norm": 0.22694501280784607,
"learning_rate": 5e-05,
"loss": 1.4285,
"step": 255
},
{
"epoch": 0.3530210454854039,
"grad_norm": 0.23925504088401794,
"learning_rate": 4.839742112141724e-05,
"loss": 1.4588,
"step": 260
},
{
"epoch": 0.35980991174473864,
"grad_norm": 0.2735687494277954,
"learning_rate": 4.679648900096436e-05,
"loss": 1.4691,
"step": 265
},
{
"epoch": 0.3665987780040733,
"grad_norm": 0.29210373759269714,
"learning_rate": 4.5198848704615914e-05,
"loss": 1.441,
"step": 270
},
{
"epoch": 0.37338764426340804,
"grad_norm": 0.2414471060037613,
"learning_rate": 4.3606141915774693e-05,
"loss": 1.4356,
"step": 275
},
{
"epoch": 0.3801765105227427,
"grad_norm": 0.23838582634925842,
"learning_rate": 4.2020005248331054e-05,
"loss": 1.4412,
"step": 280
},
{
"epoch": 0.3869653767820774,
"grad_norm": 0.22352631390094757,
"learning_rate": 4.04420685649314e-05,
"loss": 1.459,
"step": 285
},
{
"epoch": 0.3937542430414121,
"grad_norm": 0.25490501523017883,
"learning_rate": 3.887395330218429e-05,
"loss": 1.4208,
"step": 290
},
{
"epoch": 0.40054310930074677,
"grad_norm": 0.26434803009033203,
"learning_rate": 3.731727080452464e-05,
"loss": 1.4264,
"step": 295
},
{
"epoch": 0.4073319755600815,
"grad_norm": 0.23617196083068848,
"learning_rate": 3.5773620668448384e-05,
"loss": 1.3935,
"step": 300
},
{
"epoch": 0.41412084181941616,
"grad_norm": 0.23081578314304352,
"learning_rate": 3.424458909881897e-05,
"loss": 1.4113,
"step": 305
},
{
"epoch": 0.42090970807875083,
"grad_norm": 0.24254988133907318,
"learning_rate": 3.273174727893463e-05,
"loss": 1.4908,
"step": 310
},
{
"epoch": 0.42769857433808556,
"grad_norm": 0.21009749174118042,
"learning_rate": 3.12366497560313e-05,
"loss": 1.4607,
"step": 315
},
{
"epoch": 0.4344874405974202,
"grad_norm": 0.2565441131591797,
"learning_rate": 2.976083284388031e-05,
"loss": 1.4664,
"step": 320
},
{
"epoch": 0.4412763068567549,
"grad_norm": 0.2448188215494156,
"learning_rate": 2.8305813044122097e-05,
"loss": 1.4369,
"step": 325
},
{
"epoch": 0.4480651731160896,
"grad_norm": 0.19102215766906738,
"learning_rate": 2.687308548795825e-05,
"loss": 1.3941,
"step": 330
},
{
"epoch": 0.4548540393754243,
"grad_norm": 0.23744595050811768,
"learning_rate": 2.5464122399803125e-05,
"loss": 1.4053,
"step": 335
},
{
"epoch": 0.461642905634759,
"grad_norm": 0.25465700030326843,
"learning_rate": 2.4080371584473748e-05,
"loss": 1.4054,
"step": 340
},
{
"epoch": 0.4684317718940937,
"grad_norm": 0.26104259490966797,
"learning_rate": 2.272325493947257e-05,
"loss": 1.4511,
"step": 345
},
{
"epoch": 0.47522063815342835,
"grad_norm": 0.22542956471443176,
"learning_rate": 2.139416699389153e-05,
"loss": 1.4673,
"step": 350
},
{
"epoch": 0.4820095044127631,
"grad_norm": 0.2605678141117096,
"learning_rate": 2.0094473475439202e-05,
"loss": 1.3932,
"step": 355
},
{
"epoch": 0.48879837067209775,
"grad_norm": 0.24848726391792297,
"learning_rate": 1.8825509907063327e-05,
"loss": 1.4204,
"step": 360
},
{
"epoch": 0.4955872369314325,
"grad_norm": 0.2906915843486786,
"learning_rate": 1.758858023461059e-05,
"loss": 1.3734,
"step": 365
},
{
"epoch": 0.5023761031907671,
"grad_norm": 0.25691258907318115,
"learning_rate": 1.6384955486934156e-05,
"loss": 1.4086,
"step": 370
},
{
"epoch": 0.5091649694501018,
"grad_norm": 0.2475077509880066,
"learning_rate": 1.5215872469825682e-05,
"loss": 1.4393,
"step": 375
},
{
"epoch": 0.5091649694501018,
"eval_loss": 1.4588358402252197,
"eval_runtime": 7.7286,
"eval_samples_per_second": 160.573,
"eval_steps_per_second": 20.185,
"step": 375
},
{
"epoch": 0.5159538357094365,
"grad_norm": 0.3304094076156616,
"learning_rate": 1.4082532495113626e-05,
"loss": 1.4469,
"step": 380
},
{
"epoch": 0.5227427019687713,
"grad_norm": 0.2766299545764923,
"learning_rate": 1.2986100146234232e-05,
"loss": 1.357,
"step": 385
},
{
"epoch": 0.5295315682281059,
"grad_norm": 0.29647424817085266,
"learning_rate": 1.1927702081543279e-05,
"loss": 1.47,
"step": 390
},
{
"epoch": 0.5363204344874406,
"grad_norm": 0.2745124399662018,
"learning_rate": 1.090842587659851e-05,
"loss": 1.398,
"step": 395
},
{
"epoch": 0.5431093007467753,
"grad_norm": 0.2885884642601013,
"learning_rate": 9.929318906602175e-06,
"loss": 1.4597,
"step": 400
},
{
"epoch": 0.5498981670061099,
"grad_norm": 0.24886402487754822,
"learning_rate": 8.991387270152201e-06,
"loss": 1.4669,
"step": 405
},
{
"epoch": 0.5566870332654447,
"grad_norm": 0.2334742695093155,
"learning_rate": 8.09559475540797e-06,
"loss": 1.4107,
"step": 410
},
{
"epoch": 0.5634758995247794,
"grad_norm": 0.27737849950790405,
"learning_rate": 7.242861849732696e-06,
"loss": 1.5048,
"step": 415
},
{
"epoch": 0.570264765784114,
"grad_norm": 0.23328500986099243,
"learning_rate": 6.43406479383053e-06,
"loss": 1.4487,
"step": 420
},
{
"epoch": 0.5770536320434487,
"grad_norm": 0.32235902547836304,
"learning_rate": 5.670034681349995e-06,
"loss": 1.4791,
"step": 425
},
{
"epoch": 0.5838424983027835,
"grad_norm": 0.3034408688545227,
"learning_rate": 4.951556604879048e-06,
"loss": 1.4643,
"step": 430
},
{
"epoch": 0.5906313645621182,
"grad_norm": 0.24768030643463135,
"learning_rate": 4.279368849209381e-06,
"loss": 1.4803,
"step": 435
},
{
"epoch": 0.5974202308214528,
"grad_norm": 0.23609156906604767,
"learning_rate": 3.654162132698918e-06,
"loss": 1.3877,
"step": 440
},
{
"epoch": 0.6042090970807875,
"grad_norm": 0.25662508606910706,
"learning_rate": 3.076578897511978e-06,
"loss": 1.4758,
"step": 445
},
{
"epoch": 0.6109979633401222,
"grad_norm": 0.24194221198558807,
"learning_rate": 2.547212649466568e-06,
"loss": 1.4027,
"step": 450
},
{
"epoch": 0.6177868295994569,
"grad_norm": 0.3545863628387451,
"learning_rate": 2.066607348166971e-06,
"loss": 1.4078,
"step": 455
},
{
"epoch": 0.6245756958587916,
"grad_norm": 0.2740108072757721,
"learning_rate": 1.6352568480485276e-06,
"loss": 1.4463,
"step": 460
},
{
"epoch": 0.6313645621181263,
"grad_norm": 0.2730073630809784,
"learning_rate": 1.2536043909088191e-06,
"loss": 1.4089,
"step": 465
},
{
"epoch": 0.6381534283774609,
"grad_norm": 0.2379709631204605,
"learning_rate": 9.220421504467281e-07,
"loss": 1.4428,
"step": 470
},
{
"epoch": 0.6449422946367956,
"grad_norm": 0.2579050362110138,
"learning_rate": 6.409108292774913e-07,
"loss": 1.4112,
"step": 475
},
{
"epoch": 0.6517311608961304,
"grad_norm": 0.2765941917896271,
"learning_rate": 4.104993088376974e-07,
"loss": 1.4722,
"step": 480
},
{
"epoch": 0.658520027155465,
"grad_norm": 0.27427244186401367,
"learning_rate": 2.310443525400885e-07,
"loss": 1.3907,
"step": 485
},
{
"epoch": 0.6653088934147997,
"grad_norm": 0.2611384689807892,
"learning_rate": 1.0273036248318324e-07,
"loss": 1.4292,
"step": 490
},
{
"epoch": 0.6720977596741344,
"grad_norm": 0.2796425223350525,
"learning_rate": 2.568918996560532e-08,
"loss": 1.4264,
"step": 495
},
{
"epoch": 0.6788866259334692,
"grad_norm": 0.21907183527946472,
"learning_rate": 0.0,
"loss": 1.4451,
"step": 500
},
{
"epoch": 0.6788866259334692,
"eval_loss": 1.4559390544891357,
"eval_runtime": 7.7312,
"eval_samples_per_second": 160.519,
"eval_steps_per_second": 20.178,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 125,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0682976996163584e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}