{ "best_metric": 1.8007385730743408, "best_model_checkpoint": "miner_id_24/checkpoint-50", "epoch": 0.8547008547008547, "eval_steps": 25, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017094017094017096, "grad_norm": 3.938539743423462, "learning_rate": 2.9999999999999997e-05, "loss": 3.4641, "step": 1 }, { "epoch": 0.017094017094017096, "eval_loss": 4.367037296295166, "eval_runtime": 1.8323, "eval_samples_per_second": 27.289, "eval_steps_per_second": 3.82, "step": 1 }, { "epoch": 0.03418803418803419, "grad_norm": 4.934310436248779, "learning_rate": 5.9999999999999995e-05, "loss": 3.7733, "step": 2 }, { "epoch": 0.05128205128205128, "grad_norm": 4.855484962463379, "learning_rate": 8.999999999999999e-05, "loss": 3.6143, "step": 3 }, { "epoch": 0.06837606837606838, "grad_norm": 5.6892476081848145, "learning_rate": 0.00011999999999999999, "loss": 3.701, "step": 4 }, { "epoch": 0.08547008547008547, "grad_norm": 4.726133823394775, "learning_rate": 0.00015, "loss": 3.2567, "step": 5 }, { "epoch": 0.10256410256410256, "grad_norm": 3.7231974601745605, "learning_rate": 0.00017999999999999998, "loss": 2.7275, "step": 6 }, { "epoch": 0.11965811965811966, "grad_norm": 2.9245429039001465, "learning_rate": 0.00020999999999999998, "loss": 2.7143, "step": 7 }, { "epoch": 0.13675213675213677, "grad_norm": 3.1964733600616455, "learning_rate": 0.00023999999999999998, "loss": 2.3785, "step": 8 }, { "epoch": 0.15384615384615385, "grad_norm": 3.5964674949645996, "learning_rate": 0.00027, "loss": 2.6142, "step": 9 }, { "epoch": 0.17094017094017094, "grad_norm": 3.1771557331085205, "learning_rate": 0.0003, "loss": 2.5761, "step": 10 }, { "epoch": 0.18803418803418803, "grad_norm": 2.4278526306152344, "learning_rate": 0.00029993535093404976, "loss": 2.1675, "step": 11 }, { "epoch": 0.20512820512820512, "grad_norm": 3.4053242206573486, "learning_rate": 0.00029974145946288876, "loss": 2.1913, "step": 12 }, { "epoch": 0.2222222222222222, "grad_norm": 3.4192192554473877, "learning_rate": 0.00029941849271855034, "loss": 2.1841, "step": 13 }, { "epoch": 0.23931623931623933, "grad_norm": 3.6070780754089355, "learning_rate": 0.00029896672909434605, "loss": 1.6725, "step": 14 }, { "epoch": 0.2564102564102564, "grad_norm": 3.5193164348602295, "learning_rate": 0.0002983865580048935, "loss": 2.4415, "step": 15 }, { "epoch": 0.27350427350427353, "grad_norm": 2.824167490005493, "learning_rate": 0.00029767847955044656, "loss": 2.1966, "step": 16 }, { "epoch": 0.2905982905982906, "grad_norm": 2.186579465866089, "learning_rate": 0.0002968431040858144, "loss": 2.1251, "step": 17 }, { "epoch": 0.3076923076923077, "grad_norm": 2.0862064361572266, "learning_rate": 0.0002958811516942438, "loss": 2.1255, "step": 18 }, { "epoch": 0.3247863247863248, "grad_norm": 2.040565013885498, "learning_rate": 0.0002947934515667162, "loss": 2.1153, "step": 19 }, { "epoch": 0.3418803418803419, "grad_norm": 2.1491634845733643, "learning_rate": 0.0002935809412871952, "loss": 2.1219, "step": 20 }, { "epoch": 0.358974358974359, "grad_norm": 2.158245325088501, "learning_rate": 0.00029224466602444126, "loss": 2.045, "step": 21 }, { "epoch": 0.37606837606837606, "grad_norm": 2.090855121612549, "learning_rate": 0.00029078577763108886, "loss": 1.9201, "step": 22 }, { "epoch": 0.39316239316239315, "grad_norm": 2.34712553024292, "learning_rate": 0.0002892055336507641, "loss": 2.1339, "step": 23 }, { "epoch": 0.41025641025641024, "grad_norm": 2.3505215644836426, "learning_rate": 0.00028750529623409766, "loss": 2.167, "step": 24 }, { "epoch": 0.42735042735042733, "grad_norm": 2.1497771739959717, "learning_rate": 0.0002856865309645679, "loss": 1.9427, "step": 25 }, { "epoch": 0.42735042735042733, "eval_loss": 2.0743188858032227, "eval_runtime": 1.8525, "eval_samples_per_second": 26.991, "eval_steps_per_second": 3.779, "step": 25 }, { "epoch": 0.4444444444444444, "grad_norm": 2.295942544937134, "learning_rate": 0.0002837508055951863, "loss": 1.8171, "step": 26 }, { "epoch": 0.46153846153846156, "grad_norm": 2.5262718200683594, "learning_rate": 0.00028169978869711386, "loss": 1.6738, "step": 27 }, { "epoch": 0.47863247863247865, "grad_norm": 2.7707014083862305, "learning_rate": 0.00027953524822137317, "loss": 1.6301, "step": 28 }, { "epoch": 0.49572649572649574, "grad_norm": 2.6095831394195557, "learning_rate": 0.00027725904997489724, "loss": 2.1898, "step": 29 }, { "epoch": 0.5128205128205128, "grad_norm": 2.4387624263763428, "learning_rate": 0.0002748731560122267, "loss": 2.2275, "step": 30 }, { "epoch": 0.5299145299145299, "grad_norm": 2.2945480346679688, "learning_rate": 0.00027237962294424355, "loss": 1.9293, "step": 31 }, { "epoch": 0.5470085470085471, "grad_norm": 2.197488784790039, "learning_rate": 0.0002697806001653979, "loss": 2.0488, "step": 32 }, { "epoch": 0.5641025641025641, "grad_norm": 2.2360706329345703, "learning_rate": 0.0002670783280009569, "loss": 2.0198, "step": 33 }, { "epoch": 0.5811965811965812, "grad_norm": 2.0630950927734375, "learning_rate": 0.00026427513577587217, "loss": 1.6752, "step": 34 }, { "epoch": 0.5982905982905983, "grad_norm": 2.5239932537078857, "learning_rate": 0.0002613734398069308, "loss": 1.9719, "step": 35 }, { "epoch": 0.6153846153846154, "grad_norm": 2.581416130065918, "learning_rate": 0.0002583757413199203, "loss": 2.0279, "step": 36 }, { "epoch": 0.6324786324786325, "grad_norm": 2.4350247383117676, "learning_rate": 0.0002552846242936032, "loss": 2.0634, "step": 37 }, { "epoch": 0.6495726495726496, "grad_norm": 2.4420244693756104, "learning_rate": 0.00025210275323235944, "loss": 2.0166, "step": 38 }, { "epoch": 0.6666666666666666, "grad_norm": 2.505471706390381, "learning_rate": 0.00024883287086941667, "loss": 1.8165, "step": 39 }, { "epoch": 0.6837606837606838, "grad_norm": 2.3507983684539795, "learning_rate": 0.00024547779580264874, "loss": 1.6553, "step": 40 }, { "epoch": 0.7008547008547008, "grad_norm": 2.3374176025390625, "learning_rate": 0.00024204042006497906, "loss": 1.2892, "step": 41 }, { "epoch": 0.717948717948718, "grad_norm": 2.7650320529937744, "learning_rate": 0.0002385237066314845, "loss": 1.3018, "step": 42 }, { "epoch": 0.7350427350427351, "grad_norm": 1.938658356666565, "learning_rate": 0.00023493068686534756, "loss": 1.8023, "step": 43 }, { "epoch": 0.7521367521367521, "grad_norm": 2.1138954162597656, "learning_rate": 0.00023126445790485917, "loss": 2.0602, "step": 44 }, { "epoch": 0.7692307692307693, "grad_norm": 2.6716885566711426, "learning_rate": 0.0002275281799937241, "loss": 1.9736, "step": 45 }, { "epoch": 0.7863247863247863, "grad_norm": 2.3430912494659424, "learning_rate": 0.00022372507375697015, "loss": 1.8303, "step": 46 }, { "epoch": 0.8034188034188035, "grad_norm": 2.5371756553649902, "learning_rate": 0.00021985841742480953, "loss": 1.6751, "step": 47 }, { "epoch": 0.8205128205128205, "grad_norm": 2.3102569580078125, "learning_rate": 0.00021593154400684523, "loss": 1.7225, "step": 48 }, { "epoch": 0.8376068376068376, "grad_norm": 2.714444637298584, "learning_rate": 0.00021194783841905826, "loss": 1.7524, "step": 49 }, { "epoch": 0.8547008547008547, "grad_norm": 2.427034616470337, "learning_rate": 0.00020791073456605222, "loss": 1.5867, "step": 50 }, { "epoch": 0.8547008547008547, "eval_loss": 1.8007385730743408, "eval_runtime": 1.8575, "eval_samples_per_second": 26.918, "eval_steps_per_second": 3.768, "step": 50 } ], "logging_steps": 1, "max_steps": 117, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.09052260892672e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }