{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.012697337791509713, "eval_steps": 6, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002539467558301943, "grad_norm": 2.3970961570739746, "learning_rate": 1e-05, "loss": 66.6603, "step": 1 }, { "epoch": 0.0002539467558301943, "eval_loss": 11.107478141784668, "eval_runtime": 18.8123, "eval_samples_per_second": 264.454, "eval_steps_per_second": 66.127, "step": 1 }, { "epoch": 0.0005078935116603886, "grad_norm": 2.4752585887908936, "learning_rate": 2e-05, "loss": 66.6705, "step": 2 }, { "epoch": 0.0007618402674905828, "grad_norm": 2.5148186683654785, "learning_rate": 3e-05, "loss": 66.6682, "step": 3 }, { "epoch": 0.0010157870233207772, "grad_norm": 2.3080971240997314, "learning_rate": 4e-05, "loss": 66.5837, "step": 4 }, { "epoch": 0.0012697337791509713, "grad_norm": 2.354560613632202, "learning_rate": 5e-05, "loss": 66.6436, "step": 5 }, { "epoch": 0.0015236805349811656, "grad_norm": 2.5473434925079346, "learning_rate": 6e-05, "loss": 66.6068, "step": 6 }, { "epoch": 0.0015236805349811656, "eval_loss": 11.101515769958496, "eval_runtime": 18.1722, "eval_samples_per_second": 273.77, "eval_steps_per_second": 68.456, "step": 6 }, { "epoch": 0.00177762729081136, "grad_norm": 2.5556399822235107, "learning_rate": 7e-05, "loss": 66.6045, "step": 7 }, { "epoch": 0.0020315740466415543, "grad_norm": 2.6539461612701416, "learning_rate": 8e-05, "loss": 66.6228, "step": 8 }, { "epoch": 0.0022855208024717484, "grad_norm": 2.602025032043457, "learning_rate": 9e-05, "loss": 66.4995, "step": 9 }, { "epoch": 0.0025394675583019426, "grad_norm": 2.4404804706573486, "learning_rate": 0.0001, "loss": 66.5529, "step": 10 }, { "epoch": 0.002793414314132137, "grad_norm": 2.2321112155914307, "learning_rate": 9.98458666866564e-05, "loss": 66.4916, "step": 11 }, { "epoch": 0.0030473610699623312, "grad_norm": 2.796868085861206, "learning_rate": 9.938441702975689e-05, "loss": 66.4966, "step": 12 }, { "epoch": 0.0030473610699623312, "eval_loss": 11.079522132873535, "eval_runtime": 18.1003, "eval_samples_per_second": 274.858, "eval_steps_per_second": 68.728, "step": 12 }, { "epoch": 0.0033013078257925254, "grad_norm": 2.591143846511841, "learning_rate": 9.861849601988383e-05, "loss": 66.4451, "step": 13 }, { "epoch": 0.00355525458162272, "grad_norm": 2.323777675628662, "learning_rate": 9.755282581475769e-05, "loss": 66.4647, "step": 14 }, { "epoch": 0.003809201337452914, "grad_norm": 2.9256598949432373, "learning_rate": 9.619397662556435e-05, "loss": 66.3829, "step": 15 }, { "epoch": 0.004063148093283109, "grad_norm": 2.4250593185424805, "learning_rate": 9.45503262094184e-05, "loss": 66.4283, "step": 16 }, { "epoch": 0.004317094849113303, "grad_norm": 2.607290029525757, "learning_rate": 9.263200821770461e-05, "loss": 66.2636, "step": 17 }, { "epoch": 0.004571041604943497, "grad_norm": 2.8356802463531494, "learning_rate": 9.045084971874738e-05, "loss": 66.2697, "step": 18 }, { "epoch": 0.004571041604943497, "eval_loss": 11.051901817321777, "eval_runtime": 18.1634, "eval_samples_per_second": 273.902, "eval_steps_per_second": 68.489, "step": 18 }, { "epoch": 0.004824988360773691, "grad_norm": 2.4801783561706543, "learning_rate": 8.802029828000156e-05, "loss": 66.3638, "step": 19 }, { "epoch": 0.005078935116603885, "grad_norm": 2.5052285194396973, "learning_rate": 8.535533905932738e-05, "loss": 66.2984, "step": 20 }, { "epoch": 0.005332881872434079, "grad_norm": 
2.5191738605499268, "learning_rate": 8.247240241650918e-05, "loss": 66.3049, "step": 21 }, { "epoch": 0.005586828628264274, "grad_norm": 2.384291887283325, "learning_rate": 7.938926261462366e-05, "loss": 66.2243, "step": 22 }, { "epoch": 0.005840775384094468, "grad_norm": 2.3815808296203613, "learning_rate": 7.612492823579745e-05, "loss": 66.3124, "step": 23 }, { "epoch": 0.0060947221399246625, "grad_norm": 2.601182222366333, "learning_rate": 7.269952498697734e-05, "loss": 66.2157, "step": 24 }, { "epoch": 0.0060947221399246625, "eval_loss": 11.026150703430176, "eval_runtime": 18.1547, "eval_samples_per_second": 274.034, "eval_steps_per_second": 68.522, "step": 24 }, { "epoch": 0.006348668895754857, "grad_norm": 2.959272861480713, "learning_rate": 6.91341716182545e-05, "loss": 66.0543, "step": 25 }, { "epoch": 0.006602615651585051, "grad_norm": 2.5673084259033203, "learning_rate": 6.545084971874738e-05, "loss": 66.0834, "step": 26 }, { "epoch": 0.006856562407415245, "grad_norm": 2.866518259048462, "learning_rate": 6.167226819279528e-05, "loss": 66.044, "step": 27 }, { "epoch": 0.00711050916324544, "grad_norm": 2.417178153991699, "learning_rate": 5.782172325201155e-05, "loss": 66.1386, "step": 28 }, { "epoch": 0.007364455919075634, "grad_norm": 2.504648447036743, "learning_rate": 5.392295478639225e-05, "loss": 66.046, "step": 29 }, { "epoch": 0.007618402674905828, "grad_norm": 2.539602279663086, "learning_rate": 5e-05, "loss": 66.0429, "step": 30 }, { "epoch": 0.007618402674905828, "eval_loss": 11.006031036376953, "eval_runtime": 18.2105, "eval_samples_per_second": 273.194, "eval_steps_per_second": 68.312, "step": 30 }, { "epoch": 0.007872349430736022, "grad_norm": 2.809628963470459, "learning_rate": 4.607704521360776e-05, "loss": 65.9401, "step": 31 }, { "epoch": 0.008126296186566217, "grad_norm": 2.533236026763916, "learning_rate": 4.2178276747988446e-05, "loss": 66.0943, "step": 32 }, { "epoch": 0.00838024294239641, "grad_norm": 2.7511136531829834, "learning_rate": 3.832773180720475e-05, "loss": 65.9569, "step": 33 }, { "epoch": 0.008634189698226605, "grad_norm": 2.529318332672119, "learning_rate": 3.4549150281252636e-05, "loss": 65.9841, "step": 34 }, { "epoch": 0.008888136454056799, "grad_norm": 2.60366153717041, "learning_rate": 3.086582838174551e-05, "loss": 65.9559, "step": 35 }, { "epoch": 0.009142083209886994, "grad_norm": 2.6384119987487793, "learning_rate": 2.7300475013022663e-05, "loss": 65.9892, "step": 36 }, { "epoch": 0.009142083209886994, "eval_loss": 10.992709159851074, "eval_runtime": 18.2216, "eval_samples_per_second": 273.028, "eval_steps_per_second": 68.271, "step": 36 }, { "epoch": 0.009396029965717189, "grad_norm": 2.68752121925354, "learning_rate": 2.3875071764202563e-05, "loss": 65.8811, "step": 37 }, { "epoch": 0.009649976721547382, "grad_norm": 2.529193162918091, "learning_rate": 2.061073738537635e-05, "loss": 65.97, "step": 38 }, { "epoch": 0.009903923477377577, "grad_norm": 2.684138059616089, "learning_rate": 1.7527597583490822e-05, "loss": 65.8963, "step": 39 }, { "epoch": 0.01015787023320777, "grad_norm": 2.5320022106170654, "learning_rate": 1.4644660940672627e-05, "loss": 65.9401, "step": 40 }, { "epoch": 0.010411816989037965, "grad_norm": 2.516324281692505, "learning_rate": 1.1979701719998453e-05, "loss": 65.9055, "step": 41 }, { "epoch": 0.010665763744868158, "grad_norm": 2.543747901916504, "learning_rate": 9.549150281252633e-06, "loss": 65.8852, "step": 42 }, { "epoch": 0.010665763744868158, "eval_loss": 10.98641300201416, "eval_runtime": 18.1929, 
"eval_samples_per_second": 273.459, "eval_steps_per_second": 68.378, "step": 42 }, { "epoch": 0.010919710500698353, "grad_norm": 2.3728582859039307, "learning_rate": 7.367991782295391e-06, "loss": 66.1291, "step": 43 }, { "epoch": 0.011173657256528548, "grad_norm": 2.4906511306762695, "learning_rate": 5.449673790581611e-06, "loss": 65.9421, "step": 44 }, { "epoch": 0.011427604012358742, "grad_norm": 2.665292978286743, "learning_rate": 3.8060233744356633e-06, "loss": 65.9262, "step": 45 }, { "epoch": 0.011681550768188937, "grad_norm": 2.6886563301086426, "learning_rate": 2.4471741852423237e-06, "loss": 65.881, "step": 46 }, { "epoch": 0.01193549752401913, "grad_norm": 2.69474196434021, "learning_rate": 1.3815039801161721e-06, "loss": 65.8305, "step": 47 }, { "epoch": 0.012189444279849325, "grad_norm": 2.3172616958618164, "learning_rate": 6.15582970243117e-07, "loss": 65.9682, "step": 48 }, { "epoch": 0.012189444279849325, "eval_loss": 10.984763145446777, "eval_runtime": 18.4652, "eval_samples_per_second": 269.425, "eval_steps_per_second": 67.37, "step": 48 }, { "epoch": 0.01244339103567952, "grad_norm": 2.542539119720459, "learning_rate": 1.5413331334360182e-07, "loss": 66.0153, "step": 49 }, { "epoch": 0.012697337791509713, "grad_norm": 2.630923271179199, "learning_rate": 0.0, "loss": 65.7818, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7145835724800.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }