|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 352, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005681818181818182, |
|
"grad_norm": 61.450797180202606, |
|
"learning_rate": 5.555555555555555e-07, |
|
"loss": 2.2212, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.028409090909090908, |
|
"grad_norm": 64.39059584640545, |
|
"learning_rate": 2.7777777777777783e-06, |
|
"loss": 2.1229, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.056818181818181816, |
|
"grad_norm": 6.91974733739602, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 1.6002, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08522727272727272, |
|
"grad_norm": 4.765316545239553, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.0834, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.11363636363636363, |
|
"grad_norm": 2.7902519705279327, |
|
"learning_rate": 1.1111111111111113e-05, |
|
"loss": 0.8354, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.14204545454545456, |
|
"grad_norm": 1.4800453018498552, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 0.763, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.17045454545454544, |
|
"grad_norm": 1.1251067723566994, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.7216, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.19886363636363635, |
|
"grad_norm": 1.0559075058285379, |
|
"learning_rate": 1.9444444444444445e-05, |
|
"loss": 0.7044, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"grad_norm": 1.2534613751009074, |
|
"learning_rate": 1.999209397227302e-05, |
|
"loss": 0.6903, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2556818181818182, |
|
"grad_norm": 1.6289307968117395, |
|
"learning_rate": 1.995999715857997e-05, |
|
"loss": 0.6679, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2840909090909091, |
|
"grad_norm": 1.2476804276032436, |
|
"learning_rate": 1.9903294664725023e-05, |
|
"loss": 0.6615, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.9819151769054784, |
|
"learning_rate": 1.9822126571413616e-05, |
|
"loss": 0.6576, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3409090909090909, |
|
"grad_norm": 0.7874872470633155, |
|
"learning_rate": 1.97166934004041e-05, |
|
"loss": 0.644, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3693181818181818, |
|
"grad_norm": 0.8685235909784279, |
|
"learning_rate": 1.9587255619128648e-05, |
|
"loss": 0.6417, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.3977272727272727, |
|
"grad_norm": 0.8355500376299562, |
|
"learning_rate": 1.9434132997221347e-05, |
|
"loss": 0.6415, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.42613636363636365, |
|
"grad_norm": 0.7019935692437175, |
|
"learning_rate": 1.9257703816543144e-05, |
|
"loss": 0.6351, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 0.7150838153313119, |
|
"learning_rate": 1.9058403936655235e-05, |
|
"loss": 0.6301, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.48295454545454547, |
|
"grad_norm": 0.777998409699531, |
|
"learning_rate": 1.8836725718049562e-05, |
|
"loss": 0.6323, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5113636363636364, |
|
"grad_norm": 0.9386535132438508, |
|
"learning_rate": 1.8593216805796612e-05, |
|
"loss": 0.6262, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5397727272727273, |
|
"grad_norm": 0.7833891503314497, |
|
"learning_rate": 1.8328478776615336e-05, |
|
"loss": 0.6226, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5681818181818182, |
|
"grad_norm": 0.6233679945069268, |
|
"learning_rate": 1.804316565270765e-05, |
|
"loss": 0.6215, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5965909090909091, |
|
"grad_norm": 0.6035859026208489, |
|
"learning_rate": 1.7737982286028938e-05, |
|
"loss": 0.6145, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.6777356473856782, |
|
"learning_rate": 1.7413682616986185e-05, |
|
"loss": 0.6131, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6534090909090909, |
|
"grad_norm": 0.6612579810251372, |
|
"learning_rate": 1.7071067811865477e-05, |
|
"loss": 0.6126, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"grad_norm": 0.5914172959469909, |
|
"learning_rate": 1.671098428359037e-05, |
|
"loss": 0.6109, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7102272727272727, |
|
"grad_norm": 0.6559176879020109, |
|
"learning_rate": 1.6334321600700612e-05, |
|
"loss": 0.6117, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7386363636363636, |
|
"grad_norm": 0.6684982270631625, |
|
"learning_rate": 1.5942010289717108e-05, |
|
"loss": 0.6076, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7670454545454546, |
|
"grad_norm": 0.6372604947073411, |
|
"learning_rate": 1.5535019536322158e-05, |
|
"loss": 0.6027, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.7954545454545454, |
|
"grad_norm": 0.6994385308149875, |
|
"learning_rate": 1.5114354791034225e-05, |
|
"loss": 0.6101, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8238636363636364, |
|
"grad_norm": 0.595747522856983, |
|
"learning_rate": 1.4681055285292138e-05, |
|
"loss": 0.6029, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8522727272727273, |
|
"grad_norm": 0.5480963244425426, |
|
"learning_rate": 1.4236191464085286e-05, |
|
"loss": 0.6015, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8806818181818182, |
|
"grad_norm": 0.5398374925046976, |
|
"learning_rate": 1.3780862341472183e-05, |
|
"loss": 0.601, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.5638894303413493, |
|
"learning_rate": 1.331619278552068e-05, |
|
"loss": 0.597, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.6681163362607668, |
|
"learning_rate": 1.2843330739377003e-05, |
|
"loss": 0.5955, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.9659090909090909, |
|
"grad_norm": 0.5783323357337038, |
|
"learning_rate": 1.2363444385329052e-05, |
|
"loss": 0.5938, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9943181818181818, |
|
"grad_norm": 0.5688860290477228, |
|
"learning_rate": 1.1877719258869827e-05, |
|
"loss": 0.5958, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.6028689742088318, |
|
"eval_runtime": 4.9572, |
|
"eval_samples_per_second": 70.806, |
|
"eval_steps_per_second": 1.21, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.0227272727272727, |
|
"grad_norm": 0.5776530983447593, |
|
"learning_rate": 1.1387355319890685e-05, |
|
"loss": 0.5715, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0511363636363635, |
|
"grad_norm": 0.5351985628101088, |
|
"learning_rate": 1.0893563988239773e-05, |
|
"loss": 0.5662, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.0795454545454546, |
|
"grad_norm": 0.555723989781456, |
|
"learning_rate": 1.039756515096926e-05, |
|
"loss": 0.5641, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1079545454545454, |
|
"grad_norm": 0.5820199823963064, |
|
"learning_rate": 9.900584148664705e-06, |
|
"loss": 0.5663, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.1363636363636362, |
|
"grad_norm": 0.4816152928348661, |
|
"learning_rate": 9.403848748301802e-06, |
|
"loss": 0.5604, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1647727272727273, |
|
"grad_norm": 0.5345678354575093, |
|
"learning_rate": 8.908586110108794e-06, |
|
"loss": 0.5643, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.1931818181818181, |
|
"grad_norm": 0.5415303447950298, |
|
"learning_rate": 8.416019755927851e-06, |
|
"loss": 0.5612, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.2215909090909092, |
|
"grad_norm": 0.5423360313378033, |
|
"learning_rate": 7.927366546564911e-06, |
|
"loss": 0.5615, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.4718754739107113, |
|
"learning_rate": 7.443833675595254e-06, |
|
"loss": 0.5606, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.2784090909090908, |
|
"grad_norm": 0.4930197840237037, |
|
"learning_rate": 6.966615687051517e-06, |
|
"loss": 0.5637, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.3068181818181819, |
|
"grad_norm": 0.48706485409767547, |
|
"learning_rate": 6.496891524361757e-06, |
|
"loss": 0.5628, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.3352272727272727, |
|
"grad_norm": 0.4651113637210652, |
|
"learning_rate": 6.03582161782806e-06, |
|
"loss": 0.5632, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 0.5580807947723885, |
|
"learning_rate": 5.584545017840886e-06, |
|
"loss": 0.5592, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.3920454545454546, |
|
"grad_norm": 0.5164154281834604, |
|
"learning_rate": 5.144176580911431e-06, |
|
"loss": 0.5569, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.4204545454545454, |
|
"grad_norm": 0.5214361592191502, |
|
"learning_rate": 4.7158042154738094e-06, |
|
"loss": 0.5568, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.4488636363636362, |
|
"grad_norm": 0.4690952469954382, |
|
"learning_rate": 4.3004861942610575e-06, |
|
"loss": 0.5555, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.4772727272727273, |
|
"grad_norm": 0.4614475658830548, |
|
"learning_rate": 3.899248539894756e-06, |
|
"loss": 0.5577, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.5056818181818183, |
|
"grad_norm": 0.4420892345248393, |
|
"learning_rate": 3.513082490146864e-06, |
|
"loss": 0.554, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.5340909090909092, |
|
"grad_norm": 0.4142734694806993, |
|
"learning_rate": 3.1429420491358696e-06, |
|
"loss": 0.5552, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.4431041177417911, |
|
"learning_rate": 2.7897416305068325e-06, |
|
"loss": 0.5533, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.5909090909090908, |
|
"grad_norm": 0.43134061646610855, |
|
"learning_rate": 2.454353798417698e-06, |
|
"loss": 0.5526, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.6193181818181817, |
|
"grad_norm": 0.4436282928131185, |
|
"learning_rate": 2.137607111912734e-06, |
|
"loss": 0.5516, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.6477272727272727, |
|
"grad_norm": 0.42887307203464214, |
|
"learning_rate": 1.840284078008393e-06, |
|
"loss": 0.5529, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.6761363636363638, |
|
"grad_norm": 0.4325445800220451, |
|
"learning_rate": 1.5631192185484557e-06, |
|
"loss": 0.5509, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.7045454545454546, |
|
"grad_norm": 0.42115802401305363, |
|
"learning_rate": 1.3067972556041753e-06, |
|
"loss": 0.5542, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.7329545454545454, |
|
"grad_norm": 0.41199771612659136, |
|
"learning_rate": 1.0719514199022473e-06, |
|
"loss": 0.5524, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.7613636363636362, |
|
"grad_norm": 0.3982652388798851, |
|
"learning_rate": 8.591618864596541e-07, |
|
"loss": 0.5516, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.7897727272727273, |
|
"grad_norm": 0.43027763841489375, |
|
"learning_rate": 6.689543412899913e-07, |
|
"loss": 0.5529, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.4194862616952164, |
|
"learning_rate": 5.017986827221733e-07, |
|
"loss": 0.5511, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.8465909090909092, |
|
"grad_norm": 0.3966942779222067, |
|
"learning_rate": 3.5810786053987025e-07, |
|
"loss": 0.5494, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.41819439445707, |
|
"learning_rate": 2.3823685580949273e-07, |
|
"loss": 0.5546, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.9034090909090908, |
|
"grad_norm": 0.40593144742056747, |
|
"learning_rate": 1.4248180391703614e-07, |
|
"loss": 0.547, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.9318181818181817, |
|
"grad_norm": 0.4100887183324941, |
|
"learning_rate": 7.10792629802659e-08, |
|
"loss": 0.5527, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.9602272727272727, |
|
"grad_norm": 0.4017736667114781, |
|
"learning_rate": 2.420562944358329e-08, |
|
"loss": 0.55, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.9886363636363638, |
|
"grad_norm": 0.3955471531873619, |
|
"learning_rate": 1.9767022993444353e-09, |
|
"loss": 0.5494, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.5863133072853088, |
|
"eval_runtime": 4.9616, |
|
"eval_samples_per_second": 70.744, |
|
"eval_steps_per_second": 1.209, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 352, |
|
"total_flos": 73701638799360.0, |
|
"train_loss": 0.6391877403313463, |
|
"train_runtime": 1025.7787, |
|
"train_samples_per_second": 21.89, |
|
"train_steps_per_second": 0.343 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 352, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 73701638799360.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|