{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.008510276158461342, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.510276158461342e-05, "eval_loss": 1.4265342950820923, "eval_runtime": 1422.7176, "eval_samples_per_second": 13.911, "eval_steps_per_second": 1.739, "step": 1 }, { "epoch": 0.0002553082847538403, "grad_norm": 7.908364295959473, "learning_rate": 1.5e-05, "loss": 5.6974, "step": 3 }, { "epoch": 0.0005106165695076806, "grad_norm": 6.230061054229736, "learning_rate": 3e-05, "loss": 5.4338, "step": 6 }, { "epoch": 0.0007659248542615208, "grad_norm": 3.978466272354126, "learning_rate": 4.5e-05, "loss": 5.2112, "step": 9 }, { "epoch": 0.0007659248542615208, "eval_loss": 1.2634788751602173, "eval_runtime": 1429.6296, "eval_samples_per_second": 13.843, "eval_steps_per_second": 1.731, "step": 9 }, { "epoch": 0.0010212331390153612, "grad_norm": 4.294009685516357, "learning_rate": 4.993910125649561e-05, "loss": 5.0675, "step": 12 }, { "epoch": 0.0012765414237692013, "grad_norm": 2.9980828762054443, "learning_rate": 4.962019382530521e-05, "loss": 4.8171, "step": 15 }, { "epoch": 0.0015318497085230415, "grad_norm": 3.0869834423065186, "learning_rate": 4.9031542398457974e-05, "loss": 4.7292, "step": 18 }, { "epoch": 0.0015318497085230415, "eval_loss": 1.1791282892227173, "eval_runtime": 1430.0647, "eval_samples_per_second": 13.839, "eval_steps_per_second": 1.73, "step": 18 }, { "epoch": 0.001787157993276882, "grad_norm": 2.754807949066162, "learning_rate": 4.817959636416969e-05, "loss": 4.6937, "step": 21 }, { "epoch": 0.0020424662780307223, "grad_norm": 2.638349771499634, "learning_rate": 4.707368982147318e-05, "loss": 4.5978, "step": 24 }, { "epoch": 0.0022977745627845623, "grad_norm": 2.762077569961548, "learning_rate": 4.572593931387604e-05, "loss": 4.6333, "step": 27 }, { "epoch": 0.0022977745627845623, "eval_loss": 1.1523500680923462, "eval_runtime": 1430.0767, "eval_samples_per_second": 13.839, "eval_steps_per_second": 1.73, "step": 27 }, { "epoch": 0.0025530828475384027, "grad_norm": 2.646472692489624, "learning_rate": 4.415111107797445e-05, "loss": 4.5757, "step": 30 }, { "epoch": 0.002808391132292243, "grad_norm": 2.671572685241699, "learning_rate": 4.2366459261474933e-05, "loss": 4.6269, "step": 33 }, { "epoch": 0.003063699417046083, "grad_norm": 2.880744218826294, "learning_rate": 4.039153688314145e-05, "loss": 4.5438, "step": 36 }, { "epoch": 0.003063699417046083, "eval_loss": 1.1377136707305908, "eval_runtime": 1429.7895, "eval_samples_per_second": 13.842, "eval_steps_per_second": 1.73, "step": 36 }, { "epoch": 0.0033190077017999234, "grad_norm": 2.7430524826049805, "learning_rate": 3.824798160583012e-05, "loss": 4.627, "step": 39 }, { "epoch": 0.003574315986553764, "grad_norm": 2.8961098194122314, "learning_rate": 3.5959278669726935e-05, "loss": 4.5318, "step": 42 }, { "epoch": 0.0038296242713076038, "grad_norm": 2.7629406452178955, "learning_rate": 3.355050358314172e-05, "loss": 4.4218, "step": 45 }, { "epoch": 0.0038296242713076038, "eval_loss": 1.129303216934204, "eval_runtime": 1430.1322, "eval_samples_per_second": 13.839, "eval_steps_per_second": 1.73, "step": 45 }, { "epoch": 0.004084932556061445, "grad_norm": 2.760894536972046, "learning_rate": 3.104804738999169e-05, "loss": 4.3762, "step": 48 }, { "epoch": 0.004340240840815284, "grad_norm": 2.6783533096313477, "learning_rate": 2.8479327524001636e-05, "loss": 4.559, "step": 51 }, { "epoch": 
0.0045955491255691245, "grad_norm": 2.5721302032470703, "learning_rate": 2.587248741756253e-05, "loss": 4.5641, "step": 54 }, { "epoch": 0.0045955491255691245, "eval_loss": 1.1213735342025757, "eval_runtime": 1430.3753, "eval_samples_per_second": 13.836, "eval_steps_per_second": 1.73, "step": 54 }, { "epoch": 0.004850857410322965, "grad_norm": 2.695680618286133, "learning_rate": 2.3256088156396868e-05, "loss": 4.4795, "step": 57 }, { "epoch": 0.005106165695076805, "grad_norm": 2.6988542079925537, "learning_rate": 2.0658795558326743e-05, "loss": 4.433, "step": 60 }, { "epoch": 0.005361473979830646, "grad_norm": 2.7334961891174316, "learning_rate": 1.8109066104575023e-05, "loss": 4.4115, "step": 63 }, { "epoch": 0.005361473979830646, "eval_loss": 1.1170847415924072, "eval_runtime": 1430.6721, "eval_samples_per_second": 13.833, "eval_steps_per_second": 1.729, "step": 63 }, { "epoch": 0.005616782264584486, "grad_norm": 2.718637704849243, "learning_rate": 1.56348351646022e-05, "loss": 4.4666, "step": 66 }, { "epoch": 0.005872090549338326, "grad_norm": 2.9299156665802, "learning_rate": 1.3263210930352737e-05, "loss": 4.4289, "step": 69 }, { "epoch": 0.006127398834092166, "grad_norm": 2.732020616531372, "learning_rate": 1.1020177413231334e-05, "loss": 4.3213, "step": 72 }, { "epoch": 0.006127398834092166, "eval_loss": 1.1137700080871582, "eval_runtime": 1430.87, "eval_samples_per_second": 13.831, "eval_steps_per_second": 1.729, "step": 72 }, { "epoch": 0.0063827071188460064, "grad_norm": 2.8953986167907715, "learning_rate": 8.930309757836517e-06, "loss": 4.4973, "step": 75 }, { "epoch": 0.006638015403599847, "grad_norm": 2.6640074253082275, "learning_rate": 7.016504991533726e-06, "loss": 4.3667, "step": 78 }, { "epoch": 0.006893323688353687, "grad_norm": 2.7051126956939697, "learning_rate": 5.299731159831953e-06, "loss": 4.3751, "step": 81 }, { "epoch": 0.006893323688353687, "eval_loss": 1.1114040613174438, "eval_runtime": 1430.9038, "eval_samples_per_second": 13.831, "eval_steps_per_second": 1.729, "step": 81 }, { "epoch": 0.007148631973107528, "grad_norm": 2.845485210418701, "learning_rate": 3.798797596089351e-06, "loss": 4.537, "step": 84 }, { "epoch": 0.007403940257861368, "grad_norm": 2.637657642364502, "learning_rate": 2.5301488425208296e-06, "loss": 4.4016, "step": 87 }, { "epoch": 0.0076592485426152076, "grad_norm": 2.7435178756713867, "learning_rate": 1.5076844803522922e-06, "loss": 4.399, "step": 90 }, { "epoch": 0.0076592485426152076, "eval_loss": 1.1104978322982788, "eval_runtime": 1430.9556, "eval_samples_per_second": 13.831, "eval_steps_per_second": 1.729, "step": 90 }, { "epoch": 0.007914556827369048, "grad_norm": 2.8266332149505615, "learning_rate": 7.426068431000882e-07, "loss": 4.5159, "step": 93 }, { "epoch": 0.00816986511212289, "grad_norm": 2.816528081893921, "learning_rate": 2.4329828146074095e-07, "loss": 4.5103, "step": 96 }, { "epoch": 0.008425173396876729, "grad_norm": 2.7482688426971436, "learning_rate": 1.522932452260595e-08, "loss": 4.4013, "step": 99 }, { "epoch": 0.008425173396876729, "eval_loss": 1.1102831363677979, "eval_runtime": 1430.7905, "eval_samples_per_second": 13.832, "eval_steps_per_second": 1.729, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.406260607975424e+17, 
"train_batch_size": 8, "trial_name": null, "trial_params": null }