{ "best_metric": NaN, "best_model_checkpoint": "miner_id_24/checkpoint-25", "epoch": 3.128342245989305, "eval_steps": 25, "global_step": 36, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0855614973262032, "grad_norm": 0.3136516213417053, "learning_rate": 5e-05, "loss": 2.3294, "step": 1 }, { "epoch": 0.0855614973262032, "eval_loss": NaN, "eval_runtime": 0.5577, "eval_samples_per_second": 141.657, "eval_steps_per_second": 17.931, "step": 1 }, { "epoch": 0.1711229946524064, "grad_norm": 0.4243817627429962, "learning_rate": 0.0001, "loss": 2.4175, "step": 2 }, { "epoch": 0.25668449197860965, "grad_norm": 0.35335254669189453, "learning_rate": 9.978670881475172e-05, "loss": 2.5057, "step": 3 }, { "epoch": 0.3422459893048128, "grad_norm": 0.34849148988723755, "learning_rate": 9.91486549841951e-05, "loss": 2.3986, "step": 4 }, { "epoch": 0.42780748663101603, "grad_norm": 0.4511032700538635, "learning_rate": 9.809128215864097e-05, "loss": 2.3214, "step": 5 }, { "epoch": 0.5133689839572193, "grad_norm": 0.32617124915122986, "learning_rate": 9.662361147021779e-05, "loss": 2.2797, "step": 6 }, { "epoch": 0.5989304812834224, "grad_norm": 0.4399779438972473, "learning_rate": 9.475816456775313e-05, "loss": 2.2683, "step": 7 }, { "epoch": 0.6844919786096256, "grad_norm": 0.30387502908706665, "learning_rate": 9.251085678648072e-05, "loss": 1.3523, "step": 8 }, { "epoch": 0.7700534759358288, "grad_norm": 0.3294680118560791, "learning_rate": 8.9900861364012e-05, "loss": 2.2551, "step": 9 }, { "epoch": 0.8556149732620321, "grad_norm": 0.42735546827316284, "learning_rate": 8.695044586103296e-05, "loss": 2.0628, "step": 10 }, { "epoch": 0.9411764705882353, "grad_norm": 0.0, "learning_rate": 8.368478218232787e-05, "loss": 0.0, "step": 11 }, { "epoch": 1.0427807486631016, "grad_norm": 0.6370770931243896, "learning_rate": 8.013173181896283e-05, "loss": 3.2772, "step": 12 }, { "epoch": 1.1283422459893049, "grad_norm": 0.5347968935966492, "learning_rate": 7.63216081438678e-05, "loss": 2.3266, "step": 13 }, { "epoch": 1.213903743315508, "grad_norm": 0.061232052743434906, "learning_rate": 7.228691778882693e-05, "loss": 0.2754, "step": 14 }, { "epoch": 1.299465240641711, "grad_norm": 0.5249614715576172, "learning_rate": 6.806208330935766e-05, "loss": 3.6869, "step": 15 }, { "epoch": 1.3850267379679144, "grad_norm": 0.6129370331764221, "learning_rate": 6.368314950360415e-05, "loss": 2.2587, "step": 16 }, { "epoch": 1.4705882352941178, "grad_norm": 0.0, "learning_rate": 5.918747589082853e-05, "loss": 0.0, "step": 17 }, { "epoch": 1.5561497326203209, "grad_norm": 0.475963294506073, "learning_rate": 5.4613417973165106e-05, "loss": 2.8515, "step": 18 }, { "epoch": 1.641711229946524, "grad_norm": 0.17610086500644684, "learning_rate": 5e-05, "loss": 105.5296, "step": 19 }, { "epoch": 1.7272727272727273, "grad_norm": 0.14969512820243835, "learning_rate": 4.5386582026834906e-05, "loss": 0.7518, "step": 20 }, { "epoch": 1.8128342245989306, "grad_norm": 0.4916137456893921, "learning_rate": 4.0812524109171476e-05, "loss": 2.0669, "step": 21 }, { "epoch": 1.8983957219251337, "grad_norm": 0.9707525372505188, "learning_rate": 3.631685049639586e-05, "loss": 108.2932, "step": 22 }, { "epoch": 1.9839572192513368, "grad_norm": 0.31792283058166504, "learning_rate": 3.1937916690642356e-05, "loss": 1.6389, "step": 23 }, { "epoch": 2.085561497326203, "grad_norm": 0.2978770434856415, "learning_rate": 2.771308221117309e-05, "loss": 1.7533, "step": 24 }, { "epoch": 2.171122994652406, "grad_norm": 0.3738488554954529, "learning_rate": 2.3678391856132204e-05, "loss": 1.494, "step": 25 }, { "epoch": 2.171122994652406, "eval_loss": NaN, "eval_runtime": 0.5576, "eval_samples_per_second": 141.667, "eval_steps_per_second": 17.933, "step": 25 }, { "epoch": 2.2566844919786098, "grad_norm": 0.2852610945701599, "learning_rate": 1.9868268181037185e-05, "loss": 1.9419, "step": 26 }, { "epoch": 2.342245989304813, "grad_norm": 0.3041047751903534, "learning_rate": 1.631521781767214e-05, "loss": 1.6465, "step": 27 }, { "epoch": 2.427807486631016, "grad_norm": 0.4177848696708679, "learning_rate": 1.3049554138967051e-05, "loss": 1.4746, "step": 28 }, { "epoch": 2.5133689839572195, "grad_norm": 0.30657851696014404, "learning_rate": 1.0099138635988026e-05, "loss": 1.7037, "step": 29 }, { "epoch": 2.598930481283422, "grad_norm": 0.3365192115306854, "learning_rate": 7.489143213519301e-06, "loss": 1.484, "step": 30 }, { "epoch": 2.6844919786096257, "grad_norm": 0.2342955768108368, "learning_rate": 5.241835432246889e-06, "loss": 0.6956, "step": 31 }, { "epoch": 2.770053475935829, "grad_norm": 0.3052198588848114, "learning_rate": 3.376388529782215e-06, "loss": 1.6939, "step": 32 }, { "epoch": 2.855614973262032, "grad_norm": 0.34793445467948914, "learning_rate": 1.908717841359048e-06, "loss": 1.4545, "step": 33 }, { "epoch": 2.9411764705882355, "grad_norm": 0.0, "learning_rate": 8.513450158049108e-07, "loss": 0.0, "step": 34 }, { "epoch": 3.0427807486631018, "grad_norm": 0.5034242868423462, "learning_rate": 2.1329118524827662e-07, "loss": 2.7541, "step": 35 }, { "epoch": 3.128342245989305, "grad_norm": 0.4670789837837219, "learning_rate": 0.0, "loss": 1.8256, "step": 36 } ], "logging_steps": 1, "max_steps": 36, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.879899736375296e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }