Mistral-Crab-SFT / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 612,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006535947712418301,
"grad_norm": 11.154335021972656,
"learning_rate": 8.064516129032259e-08,
"loss": 1.1172,
"step": 1
},
{
"epoch": 0.06535947712418301,
"grad_norm": 5.521430969238281,
"learning_rate": 8.064516129032258e-07,
"loss": 1.0538,
"step": 10
},
{
"epoch": 0.13071895424836602,
"grad_norm": 2.362562417984009,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.9643,
"step": 20
},
{
"epoch": 0.19607843137254902,
"grad_norm": 2.3052921295166016,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.9044,
"step": 30
},
{
"epoch": 0.26143790849673204,
"grad_norm": 1.4959567785263062,
"learning_rate": 3.225806451612903e-06,
"loss": 0.8643,
"step": 40
},
{
"epoch": 0.32679738562091504,
"grad_norm": 1.5943924188613892,
"learning_rate": 4.032258064516129e-06,
"loss": 0.8507,
"step": 50
},
{
"epoch": 0.39215686274509803,
"grad_norm": 1.6096584796905518,
"learning_rate": 4.838709677419355e-06,
"loss": 0.8438,
"step": 60
},
{
"epoch": 0.45751633986928103,
"grad_norm": 1.4947034120559692,
"learning_rate": 4.997390310845578e-06,
"loss": 0.8257,
"step": 70
},
{
"epoch": 0.5228758169934641,
"grad_norm": 1.7357966899871826,
"learning_rate": 4.986797785768296e-06,
"loss": 0.832,
"step": 80
},
{
"epoch": 0.5882352941176471,
"grad_norm": 2.4221644401550293,
"learning_rate": 4.968093843200407e-06,
"loss": 0.8168,
"step": 90
},
{
"epoch": 0.6535947712418301,
"grad_norm": 1.916853427886963,
"learning_rate": 4.9413394915149094e-06,
"loss": 0.8077,
"step": 100
},
{
"epoch": 0.7189542483660131,
"grad_norm": 1.7931920289993286,
"learning_rate": 4.9066219978460485e-06,
"loss": 0.7937,
"step": 110
},
{
"epoch": 0.7843137254901961,
"grad_norm": 1.5507404804229736,
"learning_rate": 4.864054603442063e-06,
"loss": 0.784,
"step": 120
},
{
"epoch": 0.8496732026143791,
"grad_norm": 1.6783676147460938,
"learning_rate": 4.813776154295767e-06,
"loss": 0.7874,
"step": 130
},
{
"epoch": 0.9150326797385621,
"grad_norm": 1.4907172918319702,
"learning_rate": 4.755950648257789e-06,
"loss": 0.7858,
"step": 140
},
{
"epoch": 0.9803921568627451,
"grad_norm": 1.4448139667510986,
"learning_rate": 4.690766700109659e-06,
"loss": 0.7849,
"step": 150
},
{
"epoch": 1.0457516339869282,
"grad_norm": 1.9426017999649048,
"learning_rate": 4.618436926341607e-06,
"loss": 0.6916,
"step": 160
},
{
"epoch": 1.1111111111111112,
"grad_norm": 1.858111023902893,
"learning_rate": 4.5391972516417545e-06,
"loss": 0.6377,
"step": 170
},
{
"epoch": 1.1764705882352942,
"grad_norm": 1.7554138898849487,
"learning_rate": 4.453306139358828e-06,
"loss": 0.6431,
"step": 180
},
{
"epoch": 1.2418300653594772,
"grad_norm": 1.6084789037704468,
"learning_rate": 4.36104374844843e-06,
"loss": 0.6474,
"step": 190
},
{
"epoch": 1.3071895424836601,
"grad_norm": 1.7224164009094238,
"learning_rate": 4.262711019652764e-06,
"loss": 0.6372,
"step": 200
},
{
"epoch": 1.3725490196078431,
"grad_norm": 1.5711984634399414,
"learning_rate": 4.15862869389448e-06,
"loss": 0.6379,
"step": 210
},
{
"epoch": 1.4379084967320261,
"grad_norm": 1.7631185054779053,
"learning_rate": 4.049136266086453e-06,
"loss": 0.6302,
"step": 220
},
{
"epoch": 1.5032679738562091,
"grad_norm": 1.8376095294952393,
"learning_rate": 3.934590877769944e-06,
"loss": 0.6378,
"step": 230
},
{
"epoch": 1.5686274509803921,
"grad_norm": 2.0489087104797363,
"learning_rate": 3.815366152193122e-06,
"loss": 0.6164,
"step": 240
},
{
"epoch": 1.6339869281045751,
"grad_norm": 1.8818341493606567,
"learning_rate": 3.6918509756296876e-06,
"loss": 0.6284,
"step": 250
},
{
"epoch": 1.6993464052287581,
"grad_norm": 1.636940598487854,
"learning_rate": 3.564448228912682e-06,
"loss": 0.6223,
"step": 260
},
{
"epoch": 1.7647058823529411,
"grad_norm": 1.699742078781128,
"learning_rate": 3.4335734733209457e-06,
"loss": 0.6212,
"step": 270
},
{
"epoch": 1.8300653594771243,
"grad_norm": 1.6848982572555542,
"learning_rate": 3.299653595104603e-06,
"loss": 0.6241,
"step": 280
},
{
"epoch": 1.8954248366013071,
"grad_norm": 1.8798364400863647,
"learning_rate": 3.1631254130708446e-06,
"loss": 0.6149,
"step": 290
},
{
"epoch": 1.9607843137254903,
"grad_norm": 2.14373517036438,
"learning_rate": 3.0244342537717735e-06,
"loss": 0.6124,
"step": 300
},
{
"epoch": 2.026143790849673,
"grad_norm": 2.7039153575897217,
"learning_rate": 2.8840324989417488e-06,
"loss": 0.5466,
"step": 310
},
{
"epoch": 2.0915032679738563,
"grad_norm": 2.615293025970459,
"learning_rate": 2.742378109922204e-06,
"loss": 0.4731,
"step": 320
},
{
"epoch": 2.156862745098039,
"grad_norm": 2.0649566650390625,
"learning_rate": 2.599933133886934e-06,
"loss": 0.4673,
"step": 330
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.7854645252227783,
"learning_rate": 2.457162196740252e-06,
"loss": 0.4639,
"step": 340
},
{
"epoch": 2.287581699346405,
"grad_norm": 1.954106330871582,
"learning_rate": 2.31453098760387e-06,
"loss": 0.4732,
"step": 350
},
{
"epoch": 2.3529411764705883,
"grad_norm": 1.7365140914916992,
"learning_rate": 2.1725047398357677e-06,
"loss": 0.468,
"step": 360
},
{
"epoch": 2.418300653594771,
"grad_norm": 1.9597340822219849,
"learning_rate": 2.031546713535688e-06,
"loss": 0.4646,
"step": 370
},
{
"epoch": 2.4836601307189543,
"grad_norm": 1.8259657621383667,
"learning_rate": 1.8921166844869762e-06,
"loss": 0.4584,
"step": 380
},
{
"epoch": 2.549019607843137,
"grad_norm": 1.9516103267669678,
"learning_rate": 1.7546694444635394e-06,
"loss": 0.4644,
"step": 390
},
{
"epoch": 2.6143790849673203,
"grad_norm": 1.826661229133606,
"learning_rate": 1.6196533177936132e-06,
"loss": 0.4674,
"step": 400
},
{
"epoch": 2.6797385620915035,
"grad_norm": 1.6825226545333862,
"learning_rate": 1.487508699018987e-06,
"loss": 0.4614,
"step": 410
},
{
"epoch": 2.7450980392156863,
"grad_norm": 1.7974435091018677,
"learning_rate": 1.358666616419544e-06,
"loss": 0.4676,
"step": 420
},
{
"epoch": 2.810457516339869,
"grad_norm": 1.6989048719406128,
"learning_rate": 1.2335473260886046e-06,
"loss": 0.4496,
"step": 430
},
{
"epoch": 2.8758169934640523,
"grad_norm": 1.7996526956558228,
"learning_rate": 1.1125589411448996e-06,
"loss": 0.4597,
"step": 440
},
{
"epoch": 2.9411764705882355,
"grad_norm": 1.746968388557434,
"learning_rate": 9.960961005524033e-07,
"loss": 0.4532,
"step": 450
},
{
"epoch": 3.0065359477124183,
"grad_norm": 6.076014995574951,
"learning_rate": 8.845386818900647e-07,
"loss": 0.4454,
"step": 460
},
{
"epoch": 3.0718954248366015,
"grad_norm": 2.6995885372161865,
"learning_rate": 7.782505622700964e-07,
"loss": 0.3719,
"step": 470
},
{
"epoch": 3.1372549019607843,
"grad_norm": 2.025956869125366,
"learning_rate": 6.775784314464717e-07,
"loss": 0.3699,
"step": 480
},
{
"epoch": 3.2026143790849675,
"grad_norm": 1.8959600925445557,
"learning_rate": 5.828506609850054e-07,
"loss": 0.3585,
"step": 490
},
{
"epoch": 3.2679738562091503,
"grad_norm": 1.9729666709899902,
"learning_rate": 4.943762331835622e-07,
"loss": 0.3579,
"step": 500
},
{
"epoch": 3.3333333333333335,
"grad_norm": 1.8463507890701294,
"learning_rate": 4.1244373323601874e-07,
"loss": 0.3572,
"step": 510
},
{
"epoch": 3.3986928104575163,
"grad_norm": 1.870890498161316,
"learning_rate": 3.3732040792734734e-07,
"loss": 0.3609,
"step": 520
},
{
"epoch": 3.4640522875816995,
"grad_norm": 1.8788135051727295,
"learning_rate": 2.6925129393015196e-07,
"loss": 0.3621,
"step": 530
},
{
"epoch": 3.5294117647058822,
"grad_norm": 1.8110865354537964,
"learning_rate": 2.0845841854597092e-07,
"loss": 0.3544,
"step": 540
},
{
"epoch": 3.5947712418300655,
"grad_norm": 1.8861949443817139,
"learning_rate": 1.5514007549836979e-07,
"loss": 0.3617,
"step": 550
},
{
"epoch": 3.6601307189542482,
"grad_norm": 1.814979910850525,
"learning_rate": 1.0947017814003258e-07,
"loss": 0.3664,
"step": 560
},
{
"epoch": 3.7254901960784315,
"grad_norm": 1.8255079984664917,
"learning_rate": 7.159769218354873e-08,
"loss": 0.3603,
"step": 570
},
{
"epoch": 3.7908496732026142,
"grad_norm": 1.795508623123169,
"learning_rate": 4.164614980622678e-08,
"loss": 0.3604,
"step": 580
},
{
"epoch": 3.8562091503267975,
"grad_norm": 1.8044642210006714,
"learning_rate": 1.9713246713805588e-08,
"loss": 0.3551,
"step": 590
},
{
"epoch": 3.9215686274509802,
"grad_norm": 1.8935906887054443,
"learning_rate": 5.87052347736844e-09,
"loss": 0.3599,
"step": 600
},
{
"epoch": 3.9869281045751634,
"grad_norm": 1.862278938293457,
"learning_rate": 1.6313218287128396e-10,
"loss": 0.3593,
"step": 610
},
{
"epoch": 4.0,
"step": 612,
"total_flos": 1.3695396597968404e+19,
"train_loss": 0.5748976978406407,
"train_runtime": 13335.1785,
"train_samples_per_second": 11.743,
"train_steps_per_second": 0.046
}
],
"logging_steps": 10,
"max_steps": 612,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3695396597968404e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
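
The JSON above is a standard Hugging Face Trainer state file: log_history holds one entry per logging interval (logging_steps = 10) plus a final end-of-training summary. What follows is a minimal sketch, not part of the original file, of one way to load it with the Python standard library and summarize the logged loss; the local path "trainer_state.json" and the printed fields are illustrative assumptions.

# Minimal sketch (assumption: the file is saved locally as trainer_state.json).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Step-level entries carry a "loss" key; the final summary entry does not.
steps = [e["step"] for e in state["log_history"] if "loss" in e]
losses = [e["loss"] for e in state["log_history"] if "loss" in e]

print(f"logged points: {len(steps)}")
print(f"first/last step loss: {losses[0]:.4f} -> {losses[-1]:.4f}")
print(f"reported average train_loss: {state['log_history'][-1].get('train_loss')}")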