oldiday's picture
Training in progress, step 100, checkpoint
c516232 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.06521030322791001,
"eval_steps": 9,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006521030322791001,
"eval_loss": 1.4430840015411377,
"eval_runtime": 35.4265,
"eval_samples_per_second": 72.911,
"eval_steps_per_second": 9.117,
"step": 1
},
{
"epoch": 0.0019563090968373,
"grad_norm": 0.13245901465415955,
"learning_rate": 1.5e-05,
"loss": 1.3922,
"step": 3
},
{
"epoch": 0.0039126181936746,
"grad_norm": 0.13980551064014435,
"learning_rate": 3e-05,
"loss": 1.35,
"step": 6
},
{
"epoch": 0.0058689272905119005,
"grad_norm": 0.21757693588733673,
"learning_rate": 4.5e-05,
"loss": 1.4342,
"step": 9
},
{
"epoch": 0.0058689272905119005,
"eval_loss": 1.4398877620697021,
"eval_runtime": 35.4144,
"eval_samples_per_second": 72.936,
"eval_steps_per_second": 9.121,
"step": 9
},
{
"epoch": 0.0078252363873492,
"grad_norm": 0.16242149472236633,
"learning_rate": 4.993910125649561e-05,
"loss": 1.3931,
"step": 12
},
{
"epoch": 0.0097815454841865,
"grad_norm": 0.15830300748348236,
"learning_rate": 4.962019382530521e-05,
"loss": 1.367,
"step": 15
},
{
"epoch": 0.011737854581023801,
"grad_norm": 0.16229018568992615,
"learning_rate": 4.9031542398457974e-05,
"loss": 1.4666,
"step": 18
},
{
"epoch": 0.011737854581023801,
"eval_loss": 1.4245867729187012,
"eval_runtime": 35.2817,
"eval_samples_per_second": 73.211,
"eval_steps_per_second": 9.155,
"step": 18
},
{
"epoch": 0.013694163677861103,
"grad_norm": 0.14549140632152557,
"learning_rate": 4.817959636416969e-05,
"loss": 1.4435,
"step": 21
},
{
"epoch": 0.0156504727746984,
"grad_norm": 0.12280628830194473,
"learning_rate": 4.707368982147318e-05,
"loss": 1.4289,
"step": 24
},
{
"epoch": 0.017606781871535703,
"grad_norm": 0.1361985057592392,
"learning_rate": 4.572593931387604e-05,
"loss": 1.4174,
"step": 27
},
{
"epoch": 0.017606781871535703,
"eval_loss": 1.4081462621688843,
"eval_runtime": 35.2327,
"eval_samples_per_second": 73.313,
"eval_steps_per_second": 9.168,
"step": 27
},
{
"epoch": 0.019563090968373,
"grad_norm": 0.1305856853723526,
"learning_rate": 4.415111107797445e-05,
"loss": 1.3956,
"step": 30
},
{
"epoch": 0.021519400065210303,
"grad_norm": 0.13506704568862915,
"learning_rate": 4.2366459261474933e-05,
"loss": 1.4642,
"step": 33
},
{
"epoch": 0.023475709162047602,
"grad_norm": 0.1571059674024582,
"learning_rate": 4.039153688314145e-05,
"loss": 1.4487,
"step": 36
},
{
"epoch": 0.023475709162047602,
"eval_loss": 1.3942054510116577,
"eval_runtime": 35.4024,
"eval_samples_per_second": 72.961,
"eval_steps_per_second": 9.124,
"step": 36
},
{
"epoch": 0.025432018258884904,
"grad_norm": 0.12497388571500778,
"learning_rate": 3.824798160583012e-05,
"loss": 1.345,
"step": 39
},
{
"epoch": 0.027388327355722206,
"grad_norm": 0.1353573054075241,
"learning_rate": 3.5959278669726935e-05,
"loss": 1.3613,
"step": 42
},
{
"epoch": 0.029344636452559504,
"grad_norm": 0.11737760901451111,
"learning_rate": 3.355050358314172e-05,
"loss": 1.3878,
"step": 45
},
{
"epoch": 0.029344636452559504,
"eval_loss": 1.383650302886963,
"eval_runtime": 35.4483,
"eval_samples_per_second": 72.867,
"eval_steps_per_second": 9.112,
"step": 45
},
{
"epoch": 0.0313009455493968,
"grad_norm": 0.11926010251045227,
"learning_rate": 3.104804738999169e-05,
"loss": 1.4205,
"step": 48
},
{
"epoch": 0.03325725464623411,
"grad_norm": 0.1115192323923111,
"learning_rate": 2.8479327524001636e-05,
"loss": 1.3493,
"step": 51
},
{
"epoch": 0.035213563743071406,
"grad_norm": 0.11480339616537094,
"learning_rate": 2.587248741756253e-05,
"loss": 1.3904,
"step": 54
},
{
"epoch": 0.035213563743071406,
"eval_loss": 1.376178503036499,
"eval_runtime": 35.4933,
"eval_samples_per_second": 72.774,
"eval_steps_per_second": 9.1,
"step": 54
},
{
"epoch": 0.037169872839908705,
"grad_norm": 0.11822472512722015,
"learning_rate": 2.3256088156396868e-05,
"loss": 1.3313,
"step": 57
},
{
"epoch": 0.039126181936746,
"grad_norm": 0.1461370587348938,
"learning_rate": 2.0658795558326743e-05,
"loss": 1.3558,
"step": 60
},
{
"epoch": 0.04108249103358331,
"grad_norm": 0.13045533001422882,
"learning_rate": 1.8109066104575023e-05,
"loss": 1.2992,
"step": 63
},
{
"epoch": 0.04108249103358331,
"eval_loss": 1.3712804317474365,
"eval_runtime": 35.3891,
"eval_samples_per_second": 72.989,
"eval_steps_per_second": 9.127,
"step": 63
},
{
"epoch": 0.04303880013042061,
"grad_norm": 0.13522818684577942,
"learning_rate": 1.56348351646022e-05,
"loss": 1.244,
"step": 66
},
{
"epoch": 0.044995109227257905,
"grad_norm": 0.1401805877685547,
"learning_rate": 1.3263210930352737e-05,
"loss": 1.3158,
"step": 69
},
{
"epoch": 0.046951418324095204,
"grad_norm": 0.16507139801979065,
"learning_rate": 1.1020177413231334e-05,
"loss": 1.3923,
"step": 72
},
{
"epoch": 0.046951418324095204,
"eval_loss": 1.368349313735962,
"eval_runtime": 35.4333,
"eval_samples_per_second": 72.898,
"eval_steps_per_second": 9.116,
"step": 72
},
{
"epoch": 0.04890772742093251,
"grad_norm": 0.1617535650730133,
"learning_rate": 8.930309757836517e-06,
"loss": 1.4659,
"step": 75
},
{
"epoch": 0.05086403651776981,
"grad_norm": 0.15123188495635986,
"learning_rate": 7.016504991533726e-06,
"loss": 1.4194,
"step": 78
},
{
"epoch": 0.052820345614607106,
"grad_norm": 0.1255597174167633,
"learning_rate": 5.299731159831953e-06,
"loss": 1.3503,
"step": 81
},
{
"epoch": 0.052820345614607106,
"eval_loss": 1.3665692806243896,
"eval_runtime": 35.4134,
"eval_samples_per_second": 72.938,
"eval_steps_per_second": 9.121,
"step": 81
},
{
"epoch": 0.05477665471144441,
"grad_norm": 0.1431252807378769,
"learning_rate": 3.798797596089351e-06,
"loss": 1.3153,
"step": 84
},
{
"epoch": 0.05673296380828171,
"grad_norm": 0.1532163769006729,
"learning_rate": 2.5301488425208296e-06,
"loss": 1.3877,
"step": 87
},
{
"epoch": 0.05868927290511901,
"grad_norm": 0.15569448471069336,
"learning_rate": 1.5076844803522922e-06,
"loss": 1.4249,
"step": 90
},
{
"epoch": 0.05868927290511901,
"eval_loss": 1.3660001754760742,
"eval_runtime": 35.493,
"eval_samples_per_second": 72.775,
"eval_steps_per_second": 9.1,
"step": 90
},
{
"epoch": 0.06064558200195631,
"grad_norm": 0.1278517246246338,
"learning_rate": 7.426068431000882e-07,
"loss": 1.4409,
"step": 93
},
{
"epoch": 0.0626018910987936,
"grad_norm": 0.10912717878818512,
"learning_rate": 2.4329828146074095e-07,
"loss": 1.331,
"step": 96
},
{
"epoch": 0.06455820019563091,
"grad_norm": 0.143110990524292,
"learning_rate": 1.522932452260595e-08,
"loss": 1.3777,
"step": 99
},
{
"epoch": 0.06455820019563091,
"eval_loss": 1.3658305406570435,
"eval_runtime": 35.5981,
"eval_samples_per_second": 72.56,
"eval_steps_per_second": 9.074,
"step": 99
}
],
"logging_steps": 3,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 9,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3352559257387008.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}