|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 50, |
|
"global_step": 760, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005263157894736842, |
|
"eval_loss": 0.32585662603378296, |
|
"eval_runtime": 13.4514, |
|
"eval_samples_per_second": 148.684, |
|
"eval_steps_per_second": 0.149, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10526315789473684, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 0.00019978517722878627, |
|
"loss": 0.2249, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21052631578947367, |
|
"grad_norm": 0.13671875, |
|
"learning_rate": 0.00019548872180451127, |
|
"loss": 0.1193, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2631578947368421, |
|
"eval_loss": 0.07482124119997025, |
|
"eval_runtime": 12.7644, |
|
"eval_samples_per_second": 156.686, |
|
"eval_steps_per_second": 0.157, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3157894736842105, |
|
"grad_norm": 0.1201171875, |
|
"learning_rate": 0.0001911922663802363, |
|
"loss": 0.1033, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 0.00018689581095596134, |
|
"loss": 0.0973, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 0.1279296875, |
|
"learning_rate": 0.00018259935553168637, |
|
"loss": 0.0932, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"eval_loss": 0.06730526685714722, |
|
"eval_runtime": 12.8198, |
|
"eval_samples_per_second": 156.009, |
|
"eval_steps_per_second": 0.156, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.631578947368421, |
|
"grad_norm": 0.10205078125, |
|
"learning_rate": 0.0001783029001074114, |
|
"loss": 0.0894, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7368421052631579, |
|
"grad_norm": 0.0947265625, |
|
"learning_rate": 0.00017400644468313644, |
|
"loss": 0.0881, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7894736842105263, |
|
"eval_loss": 0.06297393888235092, |
|
"eval_runtime": 12.7629, |
|
"eval_samples_per_second": 156.704, |
|
"eval_steps_per_second": 0.157, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8421052631578947, |
|
"grad_norm": 0.11767578125, |
|
"learning_rate": 0.00016970998925886144, |
|
"loss": 0.0862, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9473684210526315, |
|
"grad_norm": 0.12060546875, |
|
"learning_rate": 0.00016541353383458648, |
|
"loss": 0.084, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0526315789473684, |
|
"grad_norm": 0.10888671875, |
|
"learning_rate": 0.00016111707841031148, |
|
"loss": 0.08, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0526315789473684, |
|
"eval_loss": 0.05776415765285492, |
|
"eval_runtime": 12.8264, |
|
"eval_samples_per_second": 155.929, |
|
"eval_steps_per_second": 0.156, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1578947368421053, |
|
"grad_norm": 0.091796875, |
|
"learning_rate": 0.00015682062298603652, |
|
"loss": 0.0759, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.263157894736842, |
|
"grad_norm": 0.1044921875, |
|
"learning_rate": 0.00015252416756176155, |
|
"loss": 0.0757, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.3157894736842106, |
|
"eval_loss": 0.05684072896838188, |
|
"eval_runtime": 12.8684, |
|
"eval_samples_per_second": 155.42, |
|
"eval_steps_per_second": 0.155, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.368421052631579, |
|
"grad_norm": 0.09326171875, |
|
"learning_rate": 0.00014822771213748658, |
|
"loss": 0.0745, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.4736842105263157, |
|
"grad_norm": 0.1005859375, |
|
"learning_rate": 0.00014393125671321162, |
|
"loss": 0.0732, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.5789473684210527, |
|
"grad_norm": 0.095703125, |
|
"learning_rate": 0.00013963480128893662, |
|
"loss": 0.0732, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5789473684210527, |
|
"eval_loss": 0.057122766971588135, |
|
"eval_runtime": 12.9739, |
|
"eval_samples_per_second": 154.156, |
|
"eval_steps_per_second": 0.154, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.6842105263157894, |
|
"grad_norm": 0.1142578125, |
|
"learning_rate": 0.00013533834586466166, |
|
"loss": 0.0749, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.7894736842105263, |
|
"grad_norm": 0.087890625, |
|
"learning_rate": 0.0001310418904403867, |
|
"loss": 0.0722, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.8421052631578947, |
|
"eval_loss": 0.055894188582897186, |
|
"eval_runtime": 13.0028, |
|
"eval_samples_per_second": 153.813, |
|
"eval_steps_per_second": 0.154, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.8947368421052633, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 0.00012674543501611172, |
|
"loss": 0.0705, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.072265625, |
|
"learning_rate": 0.00012244897959183676, |
|
"loss": 0.069, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.1052631578947367, |
|
"grad_norm": 0.08251953125, |
|
"learning_rate": 0.00011815252416756178, |
|
"loss": 0.065, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.1052631578947367, |
|
"eval_loss": 0.05468890815973282, |
|
"eval_runtime": 12.8106, |
|
"eval_samples_per_second": 156.121, |
|
"eval_steps_per_second": 0.156, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.2105263157894735, |
|
"grad_norm": 0.08642578125, |
|
"learning_rate": 0.00011385606874328681, |
|
"loss": 0.0645, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.3157894736842106, |
|
"grad_norm": 0.09130859375, |
|
"learning_rate": 0.00010955961331901181, |
|
"loss": 0.0637, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.3684210526315788, |
|
"eval_loss": 0.05291734263300896, |
|
"eval_runtime": 12.8493, |
|
"eval_samples_per_second": 155.651, |
|
"eval_steps_per_second": 0.156, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.4210526315789473, |
|
"grad_norm": 0.1005859375, |
|
"learning_rate": 0.00010526315789473685, |
|
"loss": 0.0652, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.526315789473684, |
|
"grad_norm": 0.08203125, |
|
"learning_rate": 0.00010096670247046187, |
|
"loss": 0.0642, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.6315789473684212, |
|
"grad_norm": 0.09130859375, |
|
"learning_rate": 9.66702470461869e-05, |
|
"loss": 0.063, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.6315789473684212, |
|
"eval_loss": 0.05152459070086479, |
|
"eval_runtime": 12.8186, |
|
"eval_samples_per_second": 156.024, |
|
"eval_steps_per_second": 0.156, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.736842105263158, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 9.237379162191193e-05, |
|
"loss": 0.0635, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.8421052631578947, |
|
"grad_norm": 0.09716796875, |
|
"learning_rate": 8.807733619763695e-05, |
|
"loss": 0.064, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.8947368421052633, |
|
"eval_loss": 0.05137969180941582, |
|
"eval_runtime": 12.8105, |
|
"eval_samples_per_second": 156.122, |
|
"eval_steps_per_second": 0.156, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.9473684210526314, |
|
"grad_norm": 0.0849609375, |
|
"learning_rate": 8.378088077336199e-05, |
|
"loss": 0.0634, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.0526315789473686, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 7.9484425349087e-05, |
|
"loss": 0.0601, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.1578947368421053, |
|
"grad_norm": 0.1015625, |
|
"learning_rate": 7.518796992481203e-05, |
|
"loss": 0.0612, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.1578947368421053, |
|
"eval_loss": 0.0508638396859169, |
|
"eval_runtime": 13.0917, |
|
"eval_samples_per_second": 152.768, |
|
"eval_steps_per_second": 0.153, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.263157894736842, |
|
"grad_norm": 0.09326171875, |
|
"learning_rate": 7.089151450053706e-05, |
|
"loss": 0.0579, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.3684210526315788, |
|
"grad_norm": 0.10009765625, |
|
"learning_rate": 6.659505907626209e-05, |
|
"loss": 0.0593, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.4210526315789473, |
|
"eval_loss": 0.05003859102725983, |
|
"eval_runtime": 12.8994, |
|
"eval_samples_per_second": 155.045, |
|
"eval_steps_per_second": 0.155, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.473684210526316, |
|
"grad_norm": 0.087890625, |
|
"learning_rate": 6.229860365198711e-05, |
|
"loss": 0.0573, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.5789473684210527, |
|
"grad_norm": 0.087890625, |
|
"learning_rate": 5.800214822771214e-05, |
|
"loss": 0.0601, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.6842105263157894, |
|
"grad_norm": 0.08544921875, |
|
"learning_rate": 5.3705692803437166e-05, |
|
"loss": 0.057, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.6842105263157894, |
|
"eval_loss": 0.0502210296690464, |
|
"eval_runtime": 12.869, |
|
"eval_samples_per_second": 155.412, |
|
"eval_steps_per_second": 0.155, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.7894736842105265, |
|
"grad_norm": 0.09716796875, |
|
"learning_rate": 4.940923737916219e-05, |
|
"loss": 0.0584, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.8947368421052633, |
|
"grad_norm": 0.083984375, |
|
"learning_rate": 4.511278195488722e-05, |
|
"loss": 0.0589, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.9473684210526314, |
|
"eval_loss": 0.049859099090099335, |
|
"eval_runtime": 12.8363, |
|
"eval_samples_per_second": 155.808, |
|
"eval_steps_per_second": 0.156, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.09033203125, |
|
"learning_rate": 4.0816326530612245e-05, |
|
"loss": 0.0574, |
|
"step": 760 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 950, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.580995873653182e+19, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|