{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.030214903501151944, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006042980700230389, "grad_norm": 0.5589237809181213, "learning_rate": 1e-05, "loss": 3.1692, "step": 1 }, { "epoch": 0.0006042980700230389, "eval_loss": 3.1913259029388428, "eval_runtime": 229.6838, "eval_samples_per_second": 12.134, "eval_steps_per_second": 6.069, "step": 1 }, { "epoch": 0.0012085961400460777, "grad_norm": 0.4362795948982239, "learning_rate": 2e-05, "loss": 2.8003, "step": 2 }, { "epoch": 0.0018128942100691166, "grad_norm": 0.4628395438194275, "learning_rate": 3e-05, "loss": 3.3741, "step": 3 }, { "epoch": 0.0024171922800921555, "grad_norm": 0.4753565788269043, "learning_rate": 4e-05, "loss": 2.9939, "step": 4 }, { "epoch": 0.003021490350115194, "grad_norm": 0.5600993037223816, "learning_rate": 5e-05, "loss": 3.2337, "step": 5 }, { "epoch": 0.0036257884201382332, "grad_norm": 0.4630330801010132, "learning_rate": 6e-05, "loss": 2.8722, "step": 6 }, { "epoch": 0.004230086490161272, "grad_norm": 0.551735520362854, "learning_rate": 7e-05, "loss": 3.0785, "step": 7 }, { "epoch": 0.004834384560184311, "grad_norm": 0.5148279666900635, "learning_rate": 8e-05, "loss": 2.8164, "step": 8 }, { "epoch": 0.00543868263020735, "grad_norm": 0.7112084627151489, "learning_rate": 9e-05, "loss": 3.3676, "step": 9 }, { "epoch": 0.006042980700230388, "grad_norm": 0.5696298480033875, "learning_rate": 0.0001, "loss": 2.7856, "step": 10 }, { "epoch": 0.006647278770253427, "grad_norm": 0.6853129863739014, "learning_rate": 9.98458666866564e-05, "loss": 3.1526, "step": 11 }, { "epoch": 0.0072515768402764665, "grad_norm": 0.6386879086494446, "learning_rate": 9.938441702975689e-05, "loss": 2.706, "step": 12 }, { "epoch": 0.007855874910299505, "grad_norm": 0.7042403817176819, "learning_rate": 9.861849601988383e-05, "loss": 2.5668, "step": 13 }, { "epoch": 0.007855874910299505, "eval_loss": 2.8041810989379883, "eval_runtime": 78.0564, "eval_samples_per_second": 35.705, "eval_steps_per_second": 17.859, "step": 13 }, { "epoch": 0.008460172980322544, "grad_norm": 0.7668046355247498, "learning_rate": 9.755282581475769e-05, "loss": 2.9344, "step": 14 }, { "epoch": 0.009064471050345583, "grad_norm": 0.8007228374481201, "learning_rate": 9.619397662556435e-05, "loss": 2.8764, "step": 15 }, { "epoch": 0.009668769120368622, "grad_norm": 0.6990913152694702, "learning_rate": 9.45503262094184e-05, "loss": 2.4552, "step": 16 }, { "epoch": 0.010273067190391661, "grad_norm": 0.6952396035194397, "learning_rate": 9.263200821770461e-05, "loss": 2.5646, "step": 17 }, { "epoch": 0.0108773652604147, "grad_norm": 0.7194958925247192, "learning_rate": 9.045084971874738e-05, "loss": 2.2515, "step": 18 }, { "epoch": 0.01148166333043774, "grad_norm": 0.7544381618499756, "learning_rate": 8.802029828000156e-05, "loss": 2.5523, "step": 19 }, { "epoch": 0.012085961400460777, "grad_norm": 0.7936828136444092, "learning_rate": 8.535533905932738e-05, "loss": 2.4444, "step": 20 }, { "epoch": 0.012690259470483816, "grad_norm": 0.7764612436294556, "learning_rate": 8.247240241650918e-05, "loss": 2.3171, "step": 21 }, { "epoch": 0.013294557540506855, "grad_norm": 0.9252168536186218, "learning_rate": 7.938926261462366e-05, "loss": 2.5806, "step": 22 }, { "epoch": 0.013898855610529894, "grad_norm": 0.8082977533340454, "learning_rate": 7.612492823579745e-05, "loss": 2.2255, "step": 23 }, { "epoch": 0.014503153680552933, "grad_norm": 0.8068621158599854, "learning_rate": 7.269952498697734e-05, "loss": 2.1052, "step": 24 }, { "epoch": 0.015107451750575972, "grad_norm": 0.876987874507904, "learning_rate": 6.91341716182545e-05, "loss": 2.0828, "step": 25 }, { "epoch": 0.01571174982059901, "grad_norm": 0.9265044331550598, "learning_rate": 6.545084971874738e-05, "loss": 2.1888, "step": 26 }, { "epoch": 0.01571174982059901, "eval_loss": 2.1405892372131348, "eval_runtime": 77.9878, "eval_samples_per_second": 35.736, "eval_steps_per_second": 17.875, "step": 26 }, { "epoch": 0.01631604789062205, "grad_norm": 0.8348265290260315, "learning_rate": 6.167226819279528e-05, "loss": 2.0727, "step": 27 }, { "epoch": 0.016920345960645088, "grad_norm": 0.8782357573509216, "learning_rate": 5.782172325201155e-05, "loss": 2.1957, "step": 28 }, { "epoch": 0.017524644030668127, "grad_norm": 0.8670806884765625, "learning_rate": 5.392295478639225e-05, "loss": 1.9359, "step": 29 }, { "epoch": 0.018128942100691166, "grad_norm": 0.9028540849685669, "learning_rate": 5e-05, "loss": 1.9664, "step": 30 }, { "epoch": 0.018733240170714205, "grad_norm": 0.8907434940338135, "learning_rate": 4.607704521360776e-05, "loss": 1.969, "step": 31 }, { "epoch": 0.019337538240737244, "grad_norm": 0.8233531713485718, "learning_rate": 4.2178276747988446e-05, "loss": 1.974, "step": 32 }, { "epoch": 0.019941836310760283, "grad_norm": 0.8587433099746704, "learning_rate": 3.832773180720475e-05, "loss": 1.8521, "step": 33 }, { "epoch": 0.020546134380783322, "grad_norm": 0.897334098815918, "learning_rate": 3.4549150281252636e-05, "loss": 1.9035, "step": 34 }, { "epoch": 0.02115043245080636, "grad_norm": 0.9631185531616211, "learning_rate": 3.086582838174551e-05, "loss": 1.9148, "step": 35 }, { "epoch": 0.0217547305208294, "grad_norm": 1.0168321132659912, "learning_rate": 2.7300475013022663e-05, "loss": 1.9873, "step": 36 }, { "epoch": 0.02235902859085244, "grad_norm": 0.8644974827766418, "learning_rate": 2.3875071764202563e-05, "loss": 1.8347, "step": 37 }, { "epoch": 0.02296332666087548, "grad_norm": 0.8866729140281677, "learning_rate": 2.061073738537635e-05, "loss": 2.0322, "step": 38 }, { "epoch": 0.023567624730898514, "grad_norm": 0.9013758897781372, "learning_rate": 1.7527597583490822e-05, "loss": 1.8802, "step": 39 }, { "epoch": 0.023567624730898514, "eval_loss": 1.8680973052978516, "eval_runtime": 78.0813, "eval_samples_per_second": 35.694, "eval_steps_per_second": 17.853, "step": 39 }, { "epoch": 0.024171922800921553, "grad_norm": 0.877164363861084, "learning_rate": 1.4644660940672627e-05, "loss": 1.7625, "step": 40 }, { "epoch": 0.024776220870944592, "grad_norm": 0.9773980975151062, "learning_rate": 1.1979701719998453e-05, "loss": 1.9383, "step": 41 }, { "epoch": 0.02538051894096763, "grad_norm": 0.8631694316864014, "learning_rate": 9.549150281252633e-06, "loss": 1.8411, "step": 42 }, { "epoch": 0.02598481701099067, "grad_norm": 0.8837679624557495, "learning_rate": 7.367991782295391e-06, "loss": 1.8574, "step": 43 }, { "epoch": 0.02658911508101371, "grad_norm": 0.9769808650016785, "learning_rate": 5.449673790581611e-06, "loss": 1.9396, "step": 44 }, { "epoch": 0.02719341315103675, "grad_norm": 0.948658287525177, "learning_rate": 3.8060233744356633e-06, "loss": 2.0352, "step": 45 }, { "epoch": 0.027797711221059788, "grad_norm": 0.9376592040061951, "learning_rate": 2.4471741852423237e-06, "loss": 1.7759, "step": 46 }, { "epoch": 0.028402009291082827, "grad_norm": 0.81429523229599, "learning_rate": 1.3815039801161721e-06, "loss": 1.9055, "step": 47 }, { "epoch": 0.029006307361105866, "grad_norm": 0.9344580769538879, "learning_rate": 6.15582970243117e-07, "loss": 2.015, "step": 48 }, { "epoch": 0.029610605431128905, "grad_norm": 0.8902539014816284, "learning_rate": 1.5413331334360182e-07, "loss": 1.7384, "step": 49 }, { "epoch": 0.030214903501151944, "grad_norm": 0.9016924500465393, "learning_rate": 0.0, "loss": 1.6748, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.509070769258496e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }