{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 50, "global_step": 760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005263157894736842, "eval_loss": 0.32585662603378296, "eval_runtime": 13.4514, "eval_samples_per_second": 148.684, "eval_steps_per_second": 0.149, "step": 1 }, { "epoch": 0.10526315789473684, "grad_norm": 0.171875, "learning_rate": 0.00019978517722878627, "loss": 0.2249, "step": 20 }, { "epoch": 0.21052631578947367, "grad_norm": 0.13671875, "learning_rate": 0.00019548872180451127, "loss": 0.1193, "step": 40 }, { "epoch": 0.2631578947368421, "eval_loss": 0.07482124119997025, "eval_runtime": 12.7644, "eval_samples_per_second": 156.686, "eval_steps_per_second": 0.157, "step": 50 }, { "epoch": 0.3157894736842105, "grad_norm": 0.1201171875, "learning_rate": 0.0001911922663802363, "loss": 0.1033, "step": 60 }, { "epoch": 0.42105263157894735, "grad_norm": 0.138671875, "learning_rate": 0.00018689581095596134, "loss": 0.0973, "step": 80 }, { "epoch": 0.5263157894736842, "grad_norm": 0.1279296875, "learning_rate": 0.00018259935553168637, "loss": 0.0932, "step": 100 }, { "epoch": 0.5263157894736842, "eval_loss": 0.06730526685714722, "eval_runtime": 12.8198, "eval_samples_per_second": 156.009, "eval_steps_per_second": 0.156, "step": 100 }, { "epoch": 0.631578947368421, "grad_norm": 0.10205078125, "learning_rate": 0.0001783029001074114, "loss": 0.0894, "step": 120 }, { "epoch": 0.7368421052631579, "grad_norm": 0.0947265625, "learning_rate": 0.00017400644468313644, "loss": 0.0881, "step": 140 }, { "epoch": 0.7894736842105263, "eval_loss": 0.06297393888235092, "eval_runtime": 12.7629, "eval_samples_per_second": 156.704, "eval_steps_per_second": 0.157, "step": 150 }, { "epoch": 0.8421052631578947, "grad_norm": 0.11767578125, "learning_rate": 0.00016970998925886144, "loss": 0.0862, "step": 160 }, { "epoch": 0.9473684210526315, "grad_norm": 0.12060546875, "learning_rate": 0.00016541353383458648, "loss": 0.084, "step": 180 }, { "epoch": 1.0526315789473684, "grad_norm": 0.10888671875, "learning_rate": 0.00016111707841031148, "loss": 0.08, "step": 200 }, { "epoch": 1.0526315789473684, "eval_loss": 0.05776415765285492, "eval_runtime": 12.8264, "eval_samples_per_second": 155.929, "eval_steps_per_second": 0.156, "step": 200 }, { "epoch": 1.1578947368421053, "grad_norm": 0.091796875, "learning_rate": 0.00015682062298603652, "loss": 0.0759, "step": 220 }, { "epoch": 1.263157894736842, "grad_norm": 0.1044921875, "learning_rate": 0.00015252416756176155, "loss": 0.0757, "step": 240 }, { "epoch": 1.3157894736842106, "eval_loss": 0.05684072896838188, "eval_runtime": 12.8684, "eval_samples_per_second": 155.42, "eval_steps_per_second": 0.155, "step": 250 }, { "epoch": 1.368421052631579, "grad_norm": 0.09326171875, "learning_rate": 0.00014822771213748658, "loss": 0.0745, "step": 260 }, { "epoch": 1.4736842105263157, "grad_norm": 0.1005859375, "learning_rate": 0.00014393125671321162, "loss": 0.0732, "step": 280 }, { "epoch": 1.5789473684210527, "grad_norm": 0.095703125, "learning_rate": 0.00013963480128893662, "loss": 0.0732, "step": 300 }, { "epoch": 1.5789473684210527, "eval_loss": 0.057122766971588135, "eval_runtime": 12.9739, "eval_samples_per_second": 154.156, "eval_steps_per_second": 0.154, "step": 300 }, { "epoch": 1.6842105263157894, "grad_norm": 0.1142578125, "learning_rate": 0.00013533834586466166, "loss": 0.0749, "step": 320 }, { "epoch": 1.7894736842105263, "grad_norm": 0.087890625, "learning_rate": 0.0001310418904403867, "loss": 0.0722, "step": 340 }, { "epoch": 1.8421052631578947, "eval_loss": 0.055894188582897186, "eval_runtime": 13.0028, "eval_samples_per_second": 153.813, "eval_steps_per_second": 0.154, "step": 350 }, { "epoch": 1.8947368421052633, "grad_norm": 0.076171875, "learning_rate": 0.00012674543501611172, "loss": 0.0705, "step": 360 }, { "epoch": 2.0, "grad_norm": 0.072265625, "learning_rate": 0.00012244897959183676, "loss": 0.069, "step": 380 }, { "epoch": 2.1052631578947367, "grad_norm": 0.08251953125, "learning_rate": 0.00011815252416756178, "loss": 0.065, "step": 400 }, { "epoch": 2.1052631578947367, "eval_loss": 0.05468890815973282, "eval_runtime": 12.8106, "eval_samples_per_second": 156.121, "eval_steps_per_second": 0.156, "step": 400 }, { "epoch": 2.2105263157894735, "grad_norm": 0.08642578125, "learning_rate": 0.00011385606874328681, "loss": 0.0645, "step": 420 }, { "epoch": 2.3157894736842106, "grad_norm": 0.09130859375, "learning_rate": 0.00010955961331901181, "loss": 0.0637, "step": 440 }, { "epoch": 2.3684210526315788, "eval_loss": 0.05291734263300896, "eval_runtime": 12.8493, "eval_samples_per_second": 155.651, "eval_steps_per_second": 0.156, "step": 450 }, { "epoch": 2.4210526315789473, "grad_norm": 0.1005859375, "learning_rate": 0.00010526315789473685, "loss": 0.0652, "step": 460 }, { "epoch": 2.526315789473684, "grad_norm": 0.08203125, "learning_rate": 0.00010096670247046187, "loss": 0.0642, "step": 480 }, { "epoch": 2.6315789473684212, "grad_norm": 0.09130859375, "learning_rate": 9.66702470461869e-05, "loss": 0.063, "step": 500 }, { "epoch": 2.6315789473684212, "eval_loss": 0.05152459070086479, "eval_runtime": 12.8186, "eval_samples_per_second": 156.024, "eval_steps_per_second": 0.156, "step": 500 }, { "epoch": 2.736842105263158, "grad_norm": 0.19140625, "learning_rate": 9.237379162191193e-05, "loss": 0.0635, "step": 520 }, { "epoch": 2.8421052631578947, "grad_norm": 0.09716796875, "learning_rate": 8.807733619763695e-05, "loss": 0.064, "step": 540 }, { "epoch": 2.8947368421052633, "eval_loss": 0.05137969180941582, "eval_runtime": 12.8105, "eval_samples_per_second": 156.122, "eval_steps_per_second": 0.156, "step": 550 }, { "epoch": 2.9473684210526314, "grad_norm": 0.0849609375, "learning_rate": 8.378088077336199e-05, "loss": 0.0634, "step": 560 }, { "epoch": 3.0526315789473686, "grad_norm": 0.1552734375, "learning_rate": 7.9484425349087e-05, "loss": 0.0601, "step": 580 }, { "epoch": 3.1578947368421053, "grad_norm": 0.1015625, "learning_rate": 7.518796992481203e-05, "loss": 0.0612, "step": 600 }, { "epoch": 3.1578947368421053, "eval_loss": 0.0508638396859169, "eval_runtime": 13.0917, "eval_samples_per_second": 152.768, "eval_steps_per_second": 0.153, "step": 600 }, { "epoch": 3.263157894736842, "grad_norm": 0.09326171875, "learning_rate": 7.089151450053706e-05, "loss": 0.0579, "step": 620 }, { "epoch": 3.3684210526315788, "grad_norm": 0.10009765625, "learning_rate": 6.659505907626209e-05, "loss": 0.0593, "step": 640 }, { "epoch": 3.4210526315789473, "eval_loss": 0.05003859102725983, "eval_runtime": 12.8994, "eval_samples_per_second": 155.045, "eval_steps_per_second": 0.155, "step": 650 }, { "epoch": 3.473684210526316, "grad_norm": 0.087890625, "learning_rate": 6.229860365198711e-05, "loss": 0.0573, "step": 660 }, { "epoch": 3.5789473684210527, "grad_norm": 0.087890625, "learning_rate": 5.800214822771214e-05, "loss": 0.0601, "step": 680 }, { "epoch": 3.6842105263157894, "grad_norm": 0.08544921875, "learning_rate": 5.3705692803437166e-05, "loss": 0.057, "step": 700 }, { "epoch": 3.6842105263157894, "eval_loss": 0.0502210296690464, "eval_runtime": 12.869, "eval_samples_per_second": 155.412, "eval_steps_per_second": 0.155, "step": 700 }, { "epoch": 3.7894736842105265, "grad_norm": 0.09716796875, "learning_rate": 4.940923737916219e-05, "loss": 0.0584, "step": 720 }, { "epoch": 3.8947368421052633, "grad_norm": 0.083984375, "learning_rate": 4.511278195488722e-05, "loss": 0.0589, "step": 740 }, { "epoch": 3.9473684210526314, "eval_loss": 0.049859099090099335, "eval_runtime": 12.8363, "eval_samples_per_second": 155.808, "eval_steps_per_second": 0.156, "step": 750 }, { "epoch": 4.0, "grad_norm": 0.09033203125, "learning_rate": 4.0816326530612245e-05, "loss": 0.0574, "step": 760 } ], "logging_steps": 20, "max_steps": 950, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.580995873653182e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }