{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 612,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006535947712418301,
      "grad_norm": 11.154335021972656,
      "learning_rate": 8.064516129032259e-08,
      "loss": 1.1172,
      "step": 1
    },
    {
      "epoch": 0.06535947712418301,
      "grad_norm": 5.521430969238281,
      "learning_rate": 8.064516129032258e-07,
      "loss": 1.0538,
      "step": 10
    },
    {
      "epoch": 0.13071895424836602,
      "grad_norm": 2.362562417984009,
      "learning_rate": 1.6129032258064516e-06,
      "loss": 0.9643,
      "step": 20
    },
    {
      "epoch": 0.19607843137254902,
      "grad_norm": 2.3052921295166016,
      "learning_rate": 2.4193548387096776e-06,
      "loss": 0.9044,
      "step": 30
    },
    {
      "epoch": 0.26143790849673204,
      "grad_norm": 1.4959567785263062,
      "learning_rate": 3.225806451612903e-06,
      "loss": 0.8643,
      "step": 40
    },
    {
      "epoch": 0.32679738562091504,
      "grad_norm": 1.5943924188613892,
      "learning_rate": 4.032258064516129e-06,
      "loss": 0.8507,
      "step": 50
    },
    {
      "epoch": 0.39215686274509803,
      "grad_norm": 1.6096584796905518,
      "learning_rate": 4.838709677419355e-06,
      "loss": 0.8438,
      "step": 60
    },
    {
      "epoch": 0.45751633986928103,
      "grad_norm": 1.4947034120559692,
      "learning_rate": 4.997390310845578e-06,
      "loss": 0.8257,
      "step": 70
    },
    {
      "epoch": 0.5228758169934641,
      "grad_norm": 1.7357966899871826,
      "learning_rate": 4.986797785768296e-06,
      "loss": 0.832,
      "step": 80
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 2.4221644401550293,
      "learning_rate": 4.968093843200407e-06,
      "loss": 0.8168,
      "step": 90
    },
    {
      "epoch": 0.6535947712418301,
      "grad_norm": 1.916853427886963,
      "learning_rate": 4.9413394915149094e-06,
      "loss": 0.8077,
      "step": 100
    },
    {
      "epoch": 0.7189542483660131,
      "grad_norm": 1.7931920289993286,
      "learning_rate": 4.9066219978460485e-06,
      "loss": 0.7937,
      "step": 110
    },
    {
      "epoch": 0.7843137254901961,
      "grad_norm": 1.5507404804229736,
      "learning_rate": 4.864054603442063e-06,
      "loss": 0.784,
      "step": 120
    },
    {
      "epoch": 0.8496732026143791,
      "grad_norm": 1.6783676147460938,
      "learning_rate": 4.813776154295767e-06,
      "loss": 0.7874,
      "step": 130
    },
    {
      "epoch": 0.9150326797385621,
      "grad_norm": 1.4907172918319702,
      "learning_rate": 4.755950648257789e-06,
      "loss": 0.7858,
      "step": 140
    },
    {
      "epoch": 0.9803921568627451,
      "grad_norm": 1.4448139667510986,
      "learning_rate": 4.690766700109659e-06,
      "loss": 0.7849,
      "step": 150
    },
    {
      "epoch": 1.0457516339869282,
      "grad_norm": 1.9426017999649048,
      "learning_rate": 4.618436926341607e-06,
      "loss": 0.6916,
      "step": 160
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 1.858111023902893,
      "learning_rate": 4.5391972516417545e-06,
      "loss": 0.6377,
      "step": 170
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 1.7554138898849487,
      "learning_rate": 4.453306139358828e-06,
      "loss": 0.6431,
      "step": 180
    },
    {
      "epoch": 1.2418300653594772,
      "grad_norm": 1.6084789037704468,
      "learning_rate": 4.36104374844843e-06,
      "loss": 0.6474,
      "step": 190
    },
    {
      "epoch": 1.3071895424836601,
      "grad_norm": 1.7224164009094238,
      "learning_rate": 4.262711019652764e-06,
      "loss": 0.6372,
      "step": 200
    },
    {
      "epoch": 1.3725490196078431,
      "grad_norm": 1.5711984634399414,
      "learning_rate": 4.15862869389448e-06,
      "loss": 0.6379,
      "step": 210
    },
    {
      "epoch": 1.4379084967320261,
      "grad_norm": 1.7631185054779053,
      "learning_rate": 4.049136266086453e-06,
      "loss": 0.6302,
      "step": 220
    },
    {
      "epoch": 1.5032679738562091,
      "grad_norm": 1.8376095294952393,
      "learning_rate": 3.934590877769944e-06,
      "loss": 0.6378,
      "step": 230
    },
    {
      "epoch": 1.5686274509803921,
      "grad_norm": 2.0489087104797363,
      "learning_rate": 3.815366152193122e-06,
      "loss": 0.6164,
      "step": 240
    },
    {
      "epoch": 1.6339869281045751,
      "grad_norm": 1.8818341493606567,
      "learning_rate": 3.6918509756296876e-06,
      "loss": 0.6284,
      "step": 250
    },
    {
      "epoch": 1.6993464052287581,
      "grad_norm": 1.636940598487854,
      "learning_rate": 3.564448228912682e-06,
      "loss": 0.6223,
      "step": 260
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 1.699742078781128,
      "learning_rate": 3.4335734733209457e-06,
      "loss": 0.6212,
      "step": 270
    },
    {
      "epoch": 1.8300653594771243,
      "grad_norm": 1.6848982572555542,
      "learning_rate": 3.299653595104603e-06,
      "loss": 0.6241,
      "step": 280
    },
    {
      "epoch": 1.8954248366013071,
      "grad_norm": 1.8798364400863647,
      "learning_rate": 3.1631254130708446e-06,
      "loss": 0.6149,
      "step": 290
    },
    {
      "epoch": 1.9607843137254903,
      "grad_norm": 2.14373517036438,
      "learning_rate": 3.0244342537717735e-06,
      "loss": 0.6124,
      "step": 300
    },
    {
      "epoch": 2.026143790849673,
      "grad_norm": 2.7039153575897217,
      "learning_rate": 2.8840324989417488e-06,
      "loss": 0.5466,
      "step": 310
    },
    {
      "epoch": 2.0915032679738563,
      "grad_norm": 2.615293025970459,
      "learning_rate": 2.742378109922204e-06,
      "loss": 0.4731,
      "step": 320
    },
    {
      "epoch": 2.156862745098039,
      "grad_norm": 2.0649566650390625,
      "learning_rate": 2.599933133886934e-06,
      "loss": 0.4673,
      "step": 330
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 1.7854645252227783,
      "learning_rate": 2.457162196740252e-06,
      "loss": 0.4639,
      "step": 340
    },
    {
      "epoch": 2.287581699346405,
      "grad_norm": 1.954106330871582,
      "learning_rate": 2.31453098760387e-06,
      "loss": 0.4732,
      "step": 350
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 1.7365140914916992,
      "learning_rate": 2.1725047398357677e-06,
      "loss": 0.468,
      "step": 360
    },
    {
      "epoch": 2.418300653594771,
      "grad_norm": 1.9597340822219849,
      "learning_rate": 2.031546713535688e-06,
      "loss": 0.4646,
      "step": 370
    },
    {
      "epoch": 2.4836601307189543,
      "grad_norm": 1.8259657621383667,
      "learning_rate": 1.8921166844869762e-06,
      "loss": 0.4584,
      "step": 380
    },
    {
      "epoch": 2.549019607843137,
      "grad_norm": 1.9516103267669678,
      "learning_rate": 1.7546694444635394e-06,
      "loss": 0.4644,
      "step": 390
    },
    {
      "epoch": 2.6143790849673203,
      "grad_norm": 1.826661229133606,
      "learning_rate": 1.6196533177936132e-06,
      "loss": 0.4674,
      "step": 400
    },
    {
      "epoch": 2.6797385620915035,
      "grad_norm": 1.6825226545333862,
      "learning_rate": 1.487508699018987e-06,
      "loss": 0.4614,
      "step": 410
    },
    {
      "epoch": 2.7450980392156863,
      "grad_norm": 1.7974435091018677,
      "learning_rate": 1.358666616419544e-06,
      "loss": 0.4676,
      "step": 420
    },
    {
      "epoch": 2.810457516339869,
      "grad_norm": 1.6989048719406128,
      "learning_rate": 1.2335473260886046e-06,
      "loss": 0.4496,
      "step": 430
    },
    {
      "epoch": 2.8758169934640523,
      "grad_norm": 1.7996526956558228,
      "learning_rate": 1.1125589411448996e-06,
      "loss": 0.4597,
      "step": 440
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 1.746968388557434,
      "learning_rate": 9.960961005524033e-07,
      "loss": 0.4532,
      "step": 450
    },
    {
      "epoch": 3.0065359477124183,
      "grad_norm": 6.076014995574951,
      "learning_rate": 8.845386818900647e-07,
      "loss": 0.4454,
      "step": 460
    },
    {
      "epoch": 3.0718954248366015,
      "grad_norm": 2.6995885372161865,
      "learning_rate": 7.782505622700964e-07,
      "loss": 0.3719,
      "step": 470
    },
    {
      "epoch": 3.1372549019607843,
      "grad_norm": 2.025956869125366,
      "learning_rate": 6.775784314464717e-07,
      "loss": 0.3699,
      "step": 480
    },
    {
      "epoch": 3.2026143790849675,
      "grad_norm": 1.8959600925445557,
      "learning_rate": 5.828506609850054e-07,
      "loss": 0.3585,
      "step": 490
    },
    {
      "epoch": 3.2679738562091503,
      "grad_norm": 1.9729666709899902,
      "learning_rate": 4.943762331835622e-07,
      "loss": 0.3579,
      "step": 500
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 1.8463507890701294,
      "learning_rate": 4.1244373323601874e-07,
      "loss": 0.3572,
      "step": 510
    },
    {
      "epoch": 3.3986928104575163,
      "grad_norm": 1.870890498161316,
      "learning_rate": 3.3732040792734734e-07,
      "loss": 0.3609,
      "step": 520
    },
    {
      "epoch": 3.4640522875816995,
      "grad_norm": 1.8788135051727295,
      "learning_rate": 2.6925129393015196e-07,
      "loss": 0.3621,
      "step": 530
    },
    {
      "epoch": 3.5294117647058822,
      "grad_norm": 1.8110865354537964,
      "learning_rate": 2.0845841854597092e-07,
      "loss": 0.3544,
      "step": 540
    },
    {
      "epoch": 3.5947712418300655,
      "grad_norm": 1.8861949443817139,
      "learning_rate": 1.5514007549836979e-07,
      "loss": 0.3617,
      "step": 550
    },
    {
      "epoch": 3.6601307189542482,
      "grad_norm": 1.814979910850525,
      "learning_rate": 1.0947017814003258e-07,
      "loss": 0.3664,
      "step": 560
    },
    {
      "epoch": 3.7254901960784315,
      "grad_norm": 1.8255079984664917,
      "learning_rate": 7.159769218354873e-08,
      "loss": 0.3603,
      "step": 570
    },
    {
      "epoch": 3.7908496732026142,
      "grad_norm": 1.795508623123169,
      "learning_rate": 4.164614980622678e-08,
      "loss": 0.3604,
      "step": 580
    },
    {
      "epoch": 3.8562091503267975,
      "grad_norm": 1.8044642210006714,
      "learning_rate": 1.9713246713805588e-08,
      "loss": 0.3551,
      "step": 590
    },
    {
      "epoch": 3.9215686274509802,
      "grad_norm": 1.8935906887054443,
      "learning_rate": 5.87052347736844e-09,
      "loss": 0.3599,
      "step": 600
    },
    {
      "epoch": 3.9869281045751634,
      "grad_norm": 1.862278938293457,
      "learning_rate": 1.6313218287128396e-10,
      "loss": 0.3593,
      "step": 610
    },
    {
      "epoch": 4.0,
      "step": 612,
      "total_flos": 1.3695396597968404e+19,
      "train_loss": 0.5748976978406407,
      "train_runtime": 13335.1785,
      "train_samples_per_second": 11.743,
      "train_steps_per_second": 0.046
    }
  ],
  "logging_steps": 10,
  "max_steps": 612,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3695396597968404e+19,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}