|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.802407221664995, |
|
"eval_steps": 13, |
|
"global_step": 50, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0160481444332999, |
|
"grad_norm": 0.17083685100078583, |
|
"learning_rate": 1e-05, |
|
"loss": 2.3942, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0160481444332999, |
|
"eval_loss": 2.52895188331604, |
|
"eval_runtime": 22.9149, |
|
"eval_samples_per_second": 9.164, |
|
"eval_steps_per_second": 4.582, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0320962888665998, |
|
"grad_norm": 0.14391367137432098, |
|
"learning_rate": 2e-05, |
|
"loss": 2.4064, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.048144433299899696, |
|
"grad_norm": 0.16619427502155304, |
|
"learning_rate": 3e-05, |
|
"loss": 2.418, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0641925777331996, |
|
"grad_norm": 0.143926203250885, |
|
"learning_rate": 4e-05, |
|
"loss": 2.505, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0802407221664995, |
|
"grad_norm": 0.1302526593208313, |
|
"learning_rate": 5e-05, |
|
"loss": 2.4878, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09628886659979939, |
|
"grad_norm": 0.13934171199798584, |
|
"learning_rate": 6e-05, |
|
"loss": 2.5336, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1123370110330993, |
|
"grad_norm": 0.1536237895488739, |
|
"learning_rate": 7e-05, |
|
"loss": 2.5089, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.1283851554663992, |
|
"grad_norm": 0.1386164426803589, |
|
"learning_rate": 8e-05, |
|
"loss": 2.6297, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.1444332998996991, |
|
"grad_norm": 0.1524312049150467, |
|
"learning_rate": 9e-05, |
|
"loss": 2.3858, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.160481444332999, |
|
"grad_norm": 0.14153587818145752, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4744, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1765295887662989, |
|
"grad_norm": 0.15249013900756836, |
|
"learning_rate": 9.98458666866564e-05, |
|
"loss": 2.4318, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.19257773319959878, |
|
"grad_norm": 0.14801257848739624, |
|
"learning_rate": 9.938441702975689e-05, |
|
"loss": 2.4946, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.2086258776328987, |
|
"grad_norm": 0.1329232156276703, |
|
"learning_rate": 9.861849601988383e-05, |
|
"loss": 2.6347, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.2086258776328987, |
|
"eval_loss": 2.5210044384002686, |
|
"eval_runtime": 22.4203, |
|
"eval_samples_per_second": 9.367, |
|
"eval_steps_per_second": 4.683, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.2246740220661986, |
|
"grad_norm": 0.14492028951644897, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 2.5018, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.24072216649949849, |
|
"grad_norm": 0.13506127893924713, |
|
"learning_rate": 9.619397662556435e-05, |
|
"loss": 2.5642, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.2567703109327984, |
|
"grad_norm": 0.133815199136734, |
|
"learning_rate": 9.45503262094184e-05, |
|
"loss": 2.484, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.2728184553660983, |
|
"grad_norm": 0.1350664347410202, |
|
"learning_rate": 9.263200821770461e-05, |
|
"loss": 2.5023, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.2888665997993982, |
|
"grad_norm": 0.13471205532550812, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 2.5607, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.3049147442326981, |
|
"grad_norm": 0.12733304500579834, |
|
"learning_rate": 8.802029828000156e-05, |
|
"loss": 2.3489, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.320962888665998, |
|
"grad_norm": 0.13001815974712372, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 2.5144, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3370110330992979, |
|
"grad_norm": 0.13591095805168152, |
|
"learning_rate": 8.247240241650918e-05, |
|
"loss": 2.2412, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.3530591775325978, |
|
"grad_norm": 0.13400661945343018, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 2.4606, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.3691073219658977, |
|
"grad_norm": 0.13440661132335663, |
|
"learning_rate": 7.612492823579745e-05, |
|
"loss": 2.526, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.38515546639919757, |
|
"grad_norm": 0.13519461452960968, |
|
"learning_rate": 7.269952498697734e-05, |
|
"loss": 2.5468, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.4012036108324975, |
|
"grad_norm": 0.14065755903720856, |
|
"learning_rate": 6.91341716182545e-05, |
|
"loss": 2.4216, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.4172517552657974, |
|
"grad_norm": 0.12566477060317993, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 2.3784, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.4172517552657974, |
|
"eval_loss": 2.5140509605407715, |
|
"eval_runtime": 22.4277, |
|
"eval_samples_per_second": 9.363, |
|
"eval_steps_per_second": 4.682, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.43329989969909727, |
|
"grad_norm": 0.12099477648735046, |
|
"learning_rate": 6.167226819279528e-05, |
|
"loss": 2.4904, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.4493480441323972, |
|
"grad_norm": 0.13227578997612, |
|
"learning_rate": 5.782172325201155e-05, |
|
"loss": 2.5992, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.4653961885656971, |
|
"grad_norm": 0.11921201646327972, |
|
"learning_rate": 5.392295478639225e-05, |
|
"loss": 2.4221, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.48144433299899697, |
|
"grad_norm": 0.12202885746955872, |
|
"learning_rate": 5e-05, |
|
"loss": 2.624, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4974924774322969, |
|
"grad_norm": 0.11582452803850174, |
|
"learning_rate": 4.607704521360776e-05, |
|
"loss": 2.4619, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.5135406218655968, |
|
"grad_norm": 0.11963976919651031, |
|
"learning_rate": 4.2178276747988446e-05, |
|
"loss": 2.4558, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.5295887662988967, |
|
"grad_norm": 0.13530096411705017, |
|
"learning_rate": 3.832773180720475e-05, |
|
"loss": 2.3744, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.5456369107321966, |
|
"grad_norm": 0.1295008808374405, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 2.3498, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5616850551654965, |
|
"grad_norm": 0.11402874439954758, |
|
"learning_rate": 3.086582838174551e-05, |
|
"loss": 2.514, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5777331995987964, |
|
"grad_norm": 0.12555132806301117, |
|
"learning_rate": 2.7300475013022663e-05, |
|
"loss": 2.3917, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5937813440320963, |
|
"grad_norm": 0.11757726967334747, |
|
"learning_rate": 2.3875071764202563e-05, |
|
"loss": 2.5616, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.6098294884653962, |
|
"grad_norm": 0.12606336176395416, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 2.5237, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.6258776328986961, |
|
"grad_norm": 0.12942460179328918, |
|
"learning_rate": 1.7527597583490822e-05, |
|
"loss": 2.4834, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.6258776328986961, |
|
"eval_loss": 2.511258840560913, |
|
"eval_runtime": 22.4533, |
|
"eval_samples_per_second": 9.353, |
|
"eval_steps_per_second": 4.676, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.641925777331996, |
|
"grad_norm": 0.1265319287776947, |
|
"learning_rate": 1.4644660940672627e-05, |
|
"loss": 2.4436, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6579739217652959, |
|
"grad_norm": 0.11708834022283554, |
|
"learning_rate": 1.1979701719998453e-05, |
|
"loss": 2.507, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.6740220661985958, |
|
"grad_norm": 0.12580640614032745, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 2.5904, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6900702106318957, |
|
"grad_norm": 0.12004761397838593, |
|
"learning_rate": 7.367991782295391e-06, |
|
"loss": 2.5087, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.7061183550651956, |
|
"grad_norm": 0.11525596678256989, |
|
"learning_rate": 5.449673790581611e-06, |
|
"loss": 2.4277, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.7221664994984955, |
|
"grad_norm": 0.12065120041370392, |
|
"learning_rate": 3.8060233744356633e-06, |
|
"loss": 2.4564, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.7382146439317954, |
|
"grad_norm": 0.12320233881473541, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 2.4538, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.7542627883650953, |
|
"grad_norm": 0.11359357833862305, |
|
"learning_rate": 1.3815039801161721e-06, |
|
"loss": 2.5531, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.7703109327983951, |
|
"grad_norm": 0.11881112307310104, |
|
"learning_rate": 6.15582970243117e-07, |
|
"loss": 2.4245, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.7863590772316951, |
|
"grad_norm": 0.12177315354347229, |
|
"learning_rate": 1.5413331334360182e-07, |
|
"loss": 2.601, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.802407221664995, |
|
"grad_norm": 0.11372490227222443, |
|
"learning_rate": 0.0, |
|
"loss": 2.6062, |
|
"step": 50 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 50, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 13, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.498903514336461e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|