{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.802407221664995, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0160481444332999, "grad_norm": 0.17083685100078583, "learning_rate": 1e-05, "loss": 2.3942, "step": 1 }, { "epoch": 0.0160481444332999, "eval_loss": 2.52895188331604, "eval_runtime": 22.9149, "eval_samples_per_second": 9.164, "eval_steps_per_second": 4.582, "step": 1 }, { "epoch": 0.0320962888665998, "grad_norm": 0.14391367137432098, "learning_rate": 2e-05, "loss": 2.4064, "step": 2 }, { "epoch": 0.048144433299899696, "grad_norm": 0.16619427502155304, "learning_rate": 3e-05, "loss": 2.418, "step": 3 }, { "epoch": 0.0641925777331996, "grad_norm": 0.143926203250885, "learning_rate": 4e-05, "loss": 2.505, "step": 4 }, { "epoch": 0.0802407221664995, "grad_norm": 0.1302526593208313, "learning_rate": 5e-05, "loss": 2.4878, "step": 5 }, { "epoch": 0.09628886659979939, "grad_norm": 0.13934171199798584, "learning_rate": 6e-05, "loss": 2.5336, "step": 6 }, { "epoch": 0.1123370110330993, "grad_norm": 0.1536237895488739, "learning_rate": 7e-05, "loss": 2.5089, "step": 7 }, { "epoch": 0.1283851554663992, "grad_norm": 0.1386164426803589, "learning_rate": 8e-05, "loss": 2.6297, "step": 8 }, { "epoch": 0.1444332998996991, "grad_norm": 0.1524312049150467, "learning_rate": 9e-05, "loss": 2.3858, "step": 9 }, { "epoch": 0.160481444332999, "grad_norm": 0.14153587818145752, "learning_rate": 0.0001, "loss": 2.4744, "step": 10 }, { "epoch": 0.1765295887662989, "grad_norm": 0.15249013900756836, "learning_rate": 9.98458666866564e-05, "loss": 2.4318, "step": 11 }, { "epoch": 0.19257773319959878, "grad_norm": 0.14801257848739624, "learning_rate": 9.938441702975689e-05, "loss": 2.4946, "step": 12 }, { "epoch": 0.2086258776328987, "grad_norm": 0.1329232156276703, "learning_rate": 9.861849601988383e-05, "loss": 2.6347, "step": 13 }, { "epoch": 0.2086258776328987, "eval_loss": 2.5210044384002686, "eval_runtime": 22.4203, "eval_samples_per_second": 9.367, "eval_steps_per_second": 4.683, "step": 13 }, { "epoch": 0.2246740220661986, "grad_norm": 0.14492028951644897, "learning_rate": 9.755282581475769e-05, "loss": 2.5018, "step": 14 }, { "epoch": 0.24072216649949849, "grad_norm": 0.13506127893924713, "learning_rate": 9.619397662556435e-05, "loss": 2.5642, "step": 15 }, { "epoch": 0.2567703109327984, "grad_norm": 0.133815199136734, "learning_rate": 9.45503262094184e-05, "loss": 2.484, "step": 16 }, { "epoch": 0.2728184553660983, "grad_norm": 0.1350664347410202, "learning_rate": 9.263200821770461e-05, "loss": 2.5023, "step": 17 }, { "epoch": 0.2888665997993982, "grad_norm": 0.13471205532550812, "learning_rate": 9.045084971874738e-05, "loss": 2.5607, "step": 18 }, { "epoch": 0.3049147442326981, "grad_norm": 0.12733304500579834, "learning_rate": 8.802029828000156e-05, "loss": 2.3489, "step": 19 }, { "epoch": 0.320962888665998, "grad_norm": 0.13001815974712372, "learning_rate": 8.535533905932738e-05, "loss": 2.5144, "step": 20 }, { "epoch": 0.3370110330992979, "grad_norm": 0.13591095805168152, "learning_rate": 8.247240241650918e-05, "loss": 2.2412, "step": 21 }, { "epoch": 0.3530591775325978, "grad_norm": 0.13400661945343018, "learning_rate": 7.938926261462366e-05, "loss": 2.4606, "step": 22 }, { "epoch": 0.3691073219658977, "grad_norm": 0.13440661132335663, "learning_rate": 7.612492823579745e-05, "loss": 2.526, "step": 23 }, { "epoch": 0.38515546639919757, "grad_norm": 0.13519461452960968, 
"learning_rate": 7.269952498697734e-05, "loss": 2.5468, "step": 24 }, { "epoch": 0.4012036108324975, "grad_norm": 0.14065755903720856, "learning_rate": 6.91341716182545e-05, "loss": 2.4216, "step": 25 }, { "epoch": 0.4172517552657974, "grad_norm": 0.12566477060317993, "learning_rate": 6.545084971874738e-05, "loss": 2.3784, "step": 26 }, { "epoch": 0.4172517552657974, "eval_loss": 2.5140509605407715, "eval_runtime": 22.4277, "eval_samples_per_second": 9.363, "eval_steps_per_second": 4.682, "step": 26 }, { "epoch": 0.43329989969909727, "grad_norm": 0.12099477648735046, "learning_rate": 6.167226819279528e-05, "loss": 2.4904, "step": 27 }, { "epoch": 0.4493480441323972, "grad_norm": 0.13227578997612, "learning_rate": 5.782172325201155e-05, "loss": 2.5992, "step": 28 }, { "epoch": 0.4653961885656971, "grad_norm": 0.11921201646327972, "learning_rate": 5.392295478639225e-05, "loss": 2.4221, "step": 29 }, { "epoch": 0.48144433299899697, "grad_norm": 0.12202885746955872, "learning_rate": 5e-05, "loss": 2.624, "step": 30 }, { "epoch": 0.4974924774322969, "grad_norm": 0.11582452803850174, "learning_rate": 4.607704521360776e-05, "loss": 2.4619, "step": 31 }, { "epoch": 0.5135406218655968, "grad_norm": 0.11963976919651031, "learning_rate": 4.2178276747988446e-05, "loss": 2.4558, "step": 32 }, { "epoch": 0.5295887662988967, "grad_norm": 0.13530096411705017, "learning_rate": 3.832773180720475e-05, "loss": 2.3744, "step": 33 }, { "epoch": 0.5456369107321966, "grad_norm": 0.1295008808374405, "learning_rate": 3.4549150281252636e-05, "loss": 2.3498, "step": 34 }, { "epoch": 0.5616850551654965, "grad_norm": 0.11402874439954758, "learning_rate": 3.086582838174551e-05, "loss": 2.514, "step": 35 }, { "epoch": 0.5777331995987964, "grad_norm": 0.12555132806301117, "learning_rate": 2.7300475013022663e-05, "loss": 2.3917, "step": 36 }, { "epoch": 0.5937813440320963, "grad_norm": 0.11757726967334747, "learning_rate": 2.3875071764202563e-05, "loss": 2.5616, "step": 37 }, { "epoch": 0.6098294884653962, "grad_norm": 0.12606336176395416, "learning_rate": 2.061073738537635e-05, "loss": 2.5237, "step": 38 }, { "epoch": 0.6258776328986961, "grad_norm": 0.12942460179328918, "learning_rate": 1.7527597583490822e-05, "loss": 2.4834, "step": 39 }, { "epoch": 0.6258776328986961, "eval_loss": 2.511258840560913, "eval_runtime": 22.4533, "eval_samples_per_second": 9.353, "eval_steps_per_second": 4.676, "step": 39 }, { "epoch": 0.641925777331996, "grad_norm": 0.1265319287776947, "learning_rate": 1.4644660940672627e-05, "loss": 2.4436, "step": 40 }, { "epoch": 0.6579739217652959, "grad_norm": 0.11708834022283554, "learning_rate": 1.1979701719998453e-05, "loss": 2.507, "step": 41 }, { "epoch": 0.6740220661985958, "grad_norm": 0.12580640614032745, "learning_rate": 9.549150281252633e-06, "loss": 2.5904, "step": 42 }, { "epoch": 0.6900702106318957, "grad_norm": 0.12004761397838593, "learning_rate": 7.367991782295391e-06, "loss": 2.5087, "step": 43 }, { "epoch": 0.7061183550651956, "grad_norm": 0.11525596678256989, "learning_rate": 5.449673790581611e-06, "loss": 2.4277, "step": 44 }, { "epoch": 0.7221664994984955, "grad_norm": 0.12065120041370392, "learning_rate": 3.8060233744356633e-06, "loss": 2.4564, "step": 45 }, { "epoch": 0.7382146439317954, "grad_norm": 0.12320233881473541, "learning_rate": 2.4471741852423237e-06, "loss": 2.4538, "step": 46 }, { "epoch": 0.7542627883650953, "grad_norm": 0.11359357833862305, "learning_rate": 1.3815039801161721e-06, "loss": 2.5531, "step": 47 }, { "epoch": 0.7703109327983951, "grad_norm": 
0.11881112307310104, "learning_rate": 6.15582970243117e-07, "loss": 2.4245, "step": 48 }, { "epoch": 0.7863590772316951, "grad_norm": 0.12177315354347229, "learning_rate": 1.5413331334360182e-07, "loss": 2.601, "step": 49 }, { "epoch": 0.802407221664995, "grad_norm": 0.11372490227222443, "learning_rate": 0.0, "loss": 2.6062, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.498903514336461e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }