{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 352,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005681818181818182,
"grad_norm": 61.450797180202606,
"learning_rate": 5.555555555555555e-07,
"loss": 2.2212,
"step": 1
},
{
"epoch": 0.028409090909090908,
"grad_norm": 64.39059584640545,
"learning_rate": 2.7777777777777783e-06,
"loss": 2.1229,
"step": 5
},
{
"epoch": 0.056818181818181816,
"grad_norm": 6.91974733739602,
"learning_rate": 5.555555555555557e-06,
"loss": 1.6002,
"step": 10
},
{
"epoch": 0.08522727272727272,
"grad_norm": 4.765316545239553,
"learning_rate": 8.333333333333334e-06,
"loss": 1.0834,
"step": 15
},
{
"epoch": 0.11363636363636363,
"grad_norm": 2.7902519705279327,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.8354,
"step": 20
},
{
"epoch": 0.14204545454545456,
"grad_norm": 1.4800453018498552,
"learning_rate": 1.388888888888889e-05,
"loss": 0.763,
"step": 25
},
{
"epoch": 0.17045454545454544,
"grad_norm": 1.1251067723566994,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.7216,
"step": 30
},
{
"epoch": 0.19886363636363635,
"grad_norm": 1.0559075058285379,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.7044,
"step": 35
},
{
"epoch": 0.22727272727272727,
"grad_norm": 1.2534613751009074,
"learning_rate": 1.999209397227302e-05,
"loss": 0.6903,
"step": 40
},
{
"epoch": 0.2556818181818182,
"grad_norm": 1.6289307968117395,
"learning_rate": 1.995999715857997e-05,
"loss": 0.6679,
"step": 45
},
{
"epoch": 0.2840909090909091,
"grad_norm": 1.2476804276032436,
"learning_rate": 1.9903294664725023e-05,
"loss": 0.6615,
"step": 50
},
{
"epoch": 0.3125,
"grad_norm": 0.9819151769054784,
"learning_rate": 1.9822126571413616e-05,
"loss": 0.6576,
"step": 55
},
{
"epoch": 0.3409090909090909,
"grad_norm": 0.7874872470633155,
"learning_rate": 1.97166934004041e-05,
"loss": 0.644,
"step": 60
},
{
"epoch": 0.3693181818181818,
"grad_norm": 0.8685235909784279,
"learning_rate": 1.9587255619128648e-05,
"loss": 0.6417,
"step": 65
},
{
"epoch": 0.3977272727272727,
"grad_norm": 0.8355500376299562,
"learning_rate": 1.9434132997221347e-05,
"loss": 0.6415,
"step": 70
},
{
"epoch": 0.42613636363636365,
"grad_norm": 0.7019935692437175,
"learning_rate": 1.9257703816543144e-05,
"loss": 0.6351,
"step": 75
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.7150838153313119,
"learning_rate": 1.9058403936655235e-05,
"loss": 0.6301,
"step": 80
},
{
"epoch": 0.48295454545454547,
"grad_norm": 0.777998409699531,
"learning_rate": 1.8836725718049562e-05,
"loss": 0.6323,
"step": 85
},
{
"epoch": 0.5113636363636364,
"grad_norm": 0.9386535132438508,
"learning_rate": 1.8593216805796612e-05,
"loss": 0.6262,
"step": 90
},
{
"epoch": 0.5397727272727273,
"grad_norm": 0.7833891503314497,
"learning_rate": 1.8328478776615336e-05,
"loss": 0.6226,
"step": 95
},
{
"epoch": 0.5681818181818182,
"grad_norm": 0.6233679945069268,
"learning_rate": 1.804316565270765e-05,
"loss": 0.6215,
"step": 100
},
{
"epoch": 0.5965909090909091,
"grad_norm": 0.6035859026208489,
"learning_rate": 1.7737982286028938e-05,
"loss": 0.6145,
"step": 105
},
{
"epoch": 0.625,
"grad_norm": 0.6777356473856782,
"learning_rate": 1.7413682616986185e-05,
"loss": 0.6131,
"step": 110
},
{
"epoch": 0.6534090909090909,
"grad_norm": 0.6612579810251372,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.6126,
"step": 115
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.5914172959469909,
"learning_rate": 1.671098428359037e-05,
"loss": 0.6109,
"step": 120
},
{
"epoch": 0.7102272727272727,
"grad_norm": 0.6559176879020109,
"learning_rate": 1.6334321600700612e-05,
"loss": 0.6117,
"step": 125
},
{
"epoch": 0.7386363636363636,
"grad_norm": 0.6684982270631625,
"learning_rate": 1.5942010289717108e-05,
"loss": 0.6076,
"step": 130
},
{
"epoch": 0.7670454545454546,
"grad_norm": 0.6372604947073411,
"learning_rate": 1.5535019536322158e-05,
"loss": 0.6027,
"step": 135
},
{
"epoch": 0.7954545454545454,
"grad_norm": 0.6994385308149875,
"learning_rate": 1.5114354791034225e-05,
"loss": 0.6101,
"step": 140
},
{
"epoch": 0.8238636363636364,
"grad_norm": 0.595747522856983,
"learning_rate": 1.4681055285292138e-05,
"loss": 0.6029,
"step": 145
},
{
"epoch": 0.8522727272727273,
"grad_norm": 0.5480963244425426,
"learning_rate": 1.4236191464085286e-05,
"loss": 0.6015,
"step": 150
},
{
"epoch": 0.8806818181818182,
"grad_norm": 0.5398374925046976,
"learning_rate": 1.3780862341472183e-05,
"loss": 0.601,
"step": 155
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.5638894303413493,
"learning_rate": 1.331619278552068e-05,
"loss": 0.597,
"step": 160
},
{
"epoch": 0.9375,
"grad_norm": 0.6681163362607668,
"learning_rate": 1.2843330739377003e-05,
"loss": 0.5955,
"step": 165
},
{
"epoch": 0.9659090909090909,
"grad_norm": 0.5783323357337038,
"learning_rate": 1.2363444385329052e-05,
"loss": 0.5938,
"step": 170
},
{
"epoch": 0.9943181818181818,
"grad_norm": 0.5688860290477228,
"learning_rate": 1.1877719258869827e-05,
"loss": 0.5958,
"step": 175
},
{
"epoch": 1.0,
"eval_loss": 0.6028689742088318,
"eval_runtime": 4.9572,
"eval_samples_per_second": 70.806,
"eval_steps_per_second": 1.21,
"step": 176
},
{
"epoch": 1.0227272727272727,
"grad_norm": 0.5776530983447593,
"learning_rate": 1.1387355319890685e-05,
"loss": 0.5715,
"step": 180
},
{
"epoch": 1.0511363636363635,
"grad_norm": 0.5351985628101088,
"learning_rate": 1.0893563988239773e-05,
"loss": 0.5662,
"step": 185
},
{
"epoch": 1.0795454545454546,
"grad_norm": 0.555723989781456,
"learning_rate": 1.039756515096926e-05,
"loss": 0.5641,
"step": 190
},
{
"epoch": 1.1079545454545454,
"grad_norm": 0.5820199823963064,
"learning_rate": 9.900584148664705e-06,
"loss": 0.5663,
"step": 195
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.4816152928348661,
"learning_rate": 9.403848748301802e-06,
"loss": 0.5604,
"step": 200
},
{
"epoch": 1.1647727272727273,
"grad_norm": 0.5345678354575093,
"learning_rate": 8.908586110108794e-06,
"loss": 0.5643,
"step": 205
},
{
"epoch": 1.1931818181818181,
"grad_norm": 0.5415303447950298,
"learning_rate": 8.416019755927851e-06,
"loss": 0.5612,
"step": 210
},
{
"epoch": 1.2215909090909092,
"grad_norm": 0.5423360313378033,
"learning_rate": 7.927366546564911e-06,
"loss": 0.5615,
"step": 215
},
{
"epoch": 1.25,
"grad_norm": 0.4718754739107113,
"learning_rate": 7.443833675595254e-06,
"loss": 0.5606,
"step": 220
},
{
"epoch": 1.2784090909090908,
"grad_norm": 0.4930197840237037,
"learning_rate": 6.966615687051517e-06,
"loss": 0.5637,
"step": 225
},
{
"epoch": 1.3068181818181819,
"grad_norm": 0.48706485409767547,
"learning_rate": 6.496891524361757e-06,
"loss": 0.5628,
"step": 230
},
{
"epoch": 1.3352272727272727,
"grad_norm": 0.4651113637210652,
"learning_rate": 6.03582161782806e-06,
"loss": 0.5632,
"step": 235
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.5580807947723885,
"learning_rate": 5.584545017840886e-06,
"loss": 0.5592,
"step": 240
},
{
"epoch": 1.3920454545454546,
"grad_norm": 0.5164154281834604,
"learning_rate": 5.144176580911431e-06,
"loss": 0.5569,
"step": 245
},
{
"epoch": 1.4204545454545454,
"grad_norm": 0.5214361592191502,
"learning_rate": 4.7158042154738094e-06,
"loss": 0.5568,
"step": 250
},
{
"epoch": 1.4488636363636362,
"grad_norm": 0.4690952469954382,
"learning_rate": 4.3004861942610575e-06,
"loss": 0.5555,
"step": 255
},
{
"epoch": 1.4772727272727273,
"grad_norm": 0.4614475658830548,
"learning_rate": 3.899248539894756e-06,
"loss": 0.5577,
"step": 260
},
{
"epoch": 1.5056818181818183,
"grad_norm": 0.4420892345248393,
"learning_rate": 3.513082490146864e-06,
"loss": 0.554,
"step": 265
},
{
"epoch": 1.5340909090909092,
"grad_norm": 0.4142734694806993,
"learning_rate": 3.1429420491358696e-06,
"loss": 0.5552,
"step": 270
},
{
"epoch": 1.5625,
"grad_norm": 0.4431041177417911,
"learning_rate": 2.7897416305068325e-06,
"loss": 0.5533,
"step": 275
},
{
"epoch": 1.5909090909090908,
"grad_norm": 0.43134061646610855,
"learning_rate": 2.454353798417698e-06,
"loss": 0.5526,
"step": 280
},
{
"epoch": 1.6193181818181817,
"grad_norm": 0.4436282928131185,
"learning_rate": 2.137607111912734e-06,
"loss": 0.5516,
"step": 285
},
{
"epoch": 1.6477272727272727,
"grad_norm": 0.42887307203464214,
"learning_rate": 1.840284078008393e-06,
"loss": 0.5529,
"step": 290
},
{
"epoch": 1.6761363636363638,
"grad_norm": 0.4325445800220451,
"learning_rate": 1.5631192185484557e-06,
"loss": 0.5509,
"step": 295
},
{
"epoch": 1.7045454545454546,
"grad_norm": 0.42115802401305363,
"learning_rate": 1.3067972556041753e-06,
"loss": 0.5542,
"step": 300
},
{
"epoch": 1.7329545454545454,
"grad_norm": 0.41199771612659136,
"learning_rate": 1.0719514199022473e-06,
"loss": 0.5524,
"step": 305
},
{
"epoch": 1.7613636363636362,
"grad_norm": 0.3982652388798851,
"learning_rate": 8.591618864596541e-07,
"loss": 0.5516,
"step": 310
},
{
"epoch": 1.7897727272727273,
"grad_norm": 0.43027763841489375,
"learning_rate": 6.689543412899913e-07,
"loss": 0.5529,
"step": 315
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.4194862616952164,
"learning_rate": 5.017986827221733e-07,
"loss": 0.5511,
"step": 320
},
{
"epoch": 1.8465909090909092,
"grad_norm": 0.3966942779222067,
"learning_rate": 3.5810786053987025e-07,
"loss": 0.5494,
"step": 325
},
{
"epoch": 1.875,
"grad_norm": 0.41819439445707,
"learning_rate": 2.3823685580949273e-07,
"loss": 0.5546,
"step": 330
},
{
"epoch": 1.9034090909090908,
"grad_norm": 0.40593144742056747,
"learning_rate": 1.4248180391703614e-07,
"loss": 0.547,
"step": 335
},
{
"epoch": 1.9318181818181817,
"grad_norm": 0.4100887183324941,
"learning_rate": 7.10792629802659e-08,
"loss": 0.5527,
"step": 340
},
{
"epoch": 1.9602272727272727,
"grad_norm": 0.4017736667114781,
"learning_rate": 2.420562944358329e-08,
"loss": 0.55,
"step": 345
},
{
"epoch": 1.9886363636363638,
"grad_norm": 0.3955471531873619,
"learning_rate": 1.9767022993444353e-09,
"loss": 0.5494,
"step": 350
},
{
"epoch": 2.0,
"eval_loss": 0.5863133072853088,
"eval_runtime": 4.9616,
"eval_samples_per_second": 70.744,
"eval_steps_per_second": 1.209,
"step": 352
},
{
"epoch": 2.0,
"step": 352,
"total_flos": 73701638799360.0,
"train_loss": 0.6391877403313463,
"train_runtime": 1025.7787,
"train_samples_per_second": 21.89,
"train_steps_per_second": 0.343
}
],
"logging_steps": 5,
"max_steps": 352,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 73701638799360.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
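
For reference, a minimal sketch of reading this trainer state and pulling out the logged training and evaluation losses. It assumes the file is saved under the standard Hugging Face Trainer filename trainer_state.json; that filename and the downstream use are assumptions, not part of the saved state itself.

import json

# Load the saved trainer state (filename assumed to be trainer_state.json).
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes per-step training entries ("loss") with per-epoch
# evaluation entries ("eval_loss"); split them apart.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"last logged train loss: {train_log[-1]['loss']} at step {train_log[-1]['step']}")
for e in eval_log:
    print(f"epoch {e['epoch']:.1f}: eval_loss = {e['eval_loss']:.4f}")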