|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.853932584269663, |
|
"eval_steps": 500, |
|
"global_step": 88, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0898876404494382, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019772727272727273, |
|
"loss": 9.4365, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.1797752808988764, |
|
"grad_norm": 10.061033248901367, |
|
"learning_rate": 0.0001931818181818182, |
|
"loss": 9.2158, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.2696629213483146, |
|
"grad_norm": 15.205401420593262, |
|
"learning_rate": 0.00018863636363636364, |
|
"loss": 8.8062, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.3595505617977528, |
|
"grad_norm": 20.087194442749023, |
|
"learning_rate": 0.00018409090909090909, |
|
"loss": 8.4059, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.449438202247191, |
|
"grad_norm": 12.80905818939209, |
|
"learning_rate": 0.00017954545454545456, |
|
"loss": 7.8943, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.5393258426966292, |
|
"grad_norm": 13.147987365722656, |
|
"learning_rate": 0.000175, |
|
"loss": 7.5915, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.6292134831460674, |
|
"grad_norm": 14.075699806213379, |
|
"learning_rate": 0.00017045454545454547, |
|
"loss": 7.3797, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.7191011235955056, |
|
"grad_norm": 6.000680923461914, |
|
"learning_rate": 0.00016590909090909094, |
|
"loss": 7.3359, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.8089887640449438, |
|
"grad_norm": 7.718422889709473, |
|
"learning_rate": 0.00016136363636363635, |
|
"loss": 7.3118, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.898876404494382, |
|
"grad_norm": 6.30356502532959, |
|
"learning_rate": 0.00015681818181818182, |
|
"loss": 7.2512, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.9887640449438202, |
|
"grad_norm": 11.91089153289795, |
|
"learning_rate": 0.00015227272727272727, |
|
"loss": 7.1725, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.0449438202247192, |
|
"grad_norm": 10.328221321105957, |
|
"learning_rate": 0.00014772727272727274, |
|
"loss": 4.4411, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.1348314606741572, |
|
"grad_norm": 11.233499526977539, |
|
"learning_rate": 0.0001431818181818182, |
|
"loss": 7.1546, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.2247191011235956, |
|
"grad_norm": 12.115806579589844, |
|
"learning_rate": 0.00013863636363636365, |
|
"loss": 7.146, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.3146067415730336, |
|
"grad_norm": 8.608111381530762, |
|
"learning_rate": 0.0001340909090909091, |
|
"loss": 7.1384, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.404494382022472, |
|
"grad_norm": 6.332569599151611, |
|
"learning_rate": 0.00012954545454545456, |
|
"loss": 7.0654, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.49438202247191, |
|
"grad_norm": 8.418745994567871, |
|
"learning_rate": 0.000125, |
|
"loss": 7.025, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.5842696629213484, |
|
"grad_norm": 3.712430000305176, |
|
"learning_rate": 0.00012045454545454546, |
|
"loss": 7.0801, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.6741573033707864, |
|
"grad_norm": 10.121654510498047, |
|
"learning_rate": 0.00011590909090909093, |
|
"loss": 7.1843, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.7640449438202248, |
|
"grad_norm": 4.086280345916748, |
|
"learning_rate": 0.00011136363636363636, |
|
"loss": 7.085, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.8539325842696628, |
|
"grad_norm": 6.2060933113098145, |
|
"learning_rate": 0.00010681818181818181, |
|
"loss": 7.0862, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.9438202247191012, |
|
"grad_norm": 3.9082467555999756, |
|
"learning_rate": 0.00010227272727272727, |
|
"loss": 7.034, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 3.6770272254943848, |
|
"learning_rate": 9.772727272727274e-05, |
|
"loss": 4.2733, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.0898876404494384, |
|
"grad_norm": 3.2819325923919678, |
|
"learning_rate": 9.318181818181818e-05, |
|
"loss": 7.0196, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 2.1797752808988764, |
|
"grad_norm": 6.796480655670166, |
|
"learning_rate": 8.863636363636364e-05, |
|
"loss": 7.0643, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.2696629213483144, |
|
"grad_norm": 5.090155124664307, |
|
"learning_rate": 8.40909090909091e-05, |
|
"loss": 6.9201, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 2.359550561797753, |
|
"grad_norm": 9.731218338012695, |
|
"learning_rate": 7.954545454545455e-05, |
|
"loss": 7.0715, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 2.449438202247191, |
|
"grad_norm": 5.852891445159912, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 7.1071, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 2.539325842696629, |
|
"grad_norm": 9.918388366699219, |
|
"learning_rate": 7.045454545454546e-05, |
|
"loss": 7.0041, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 2.629213483146067, |
|
"grad_norm": 5.5014190673828125, |
|
"learning_rate": 6.59090909090909e-05, |
|
"loss": 6.982, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.7191011235955056, |
|
"grad_norm": 7.316319465637207, |
|
"learning_rate": 6.136363636363636e-05, |
|
"loss": 6.9352, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.808988764044944, |
|
"grad_norm": 13.674769401550293, |
|
"learning_rate": 5.6818181818181825e-05, |
|
"loss": 6.9549, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.898876404494382, |
|
"grad_norm": 6.491578102111816, |
|
"learning_rate": 5.2272727272727274e-05, |
|
"loss": 7.0187, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.98876404494382, |
|
"grad_norm": 5.663166522979736, |
|
"learning_rate": 4.772727272727273e-05, |
|
"loss": 6.9697, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 3.044943820224719, |
|
"grad_norm": 5.13517427444458, |
|
"learning_rate": 4.318181818181819e-05, |
|
"loss": 4.3462, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.134831460674157, |
|
"grad_norm": 4.548388957977295, |
|
"learning_rate": 3.8636363636363636e-05, |
|
"loss": 6.8757, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 3.2247191011235956, |
|
"grad_norm": 7.8548431396484375, |
|
"learning_rate": 3.409090909090909e-05, |
|
"loss": 6.9019, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 3.3146067415730336, |
|
"grad_norm": 18.156105041503906, |
|
"learning_rate": 2.954545454545455e-05, |
|
"loss": 7.076, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 3.404494382022472, |
|
"grad_norm": 6.122278690338135, |
|
"learning_rate": 2.5e-05, |
|
"loss": 6.9759, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 3.49438202247191, |
|
"grad_norm": 8.052112579345703, |
|
"learning_rate": 2.0454545454545457e-05, |
|
"loss": 7.1131, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.5842696629213484, |
|
"grad_norm": 18.36809730529785, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 6.8889, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 3.6741573033707864, |
|
"grad_norm": 3.2839643955230713, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 6.9873, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 3.764044943820225, |
|
"grad_norm": 7.564036846160889, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 6.9841, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 3.853932584269663, |
|
"grad_norm": 3.23893404006958, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 6.975, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 3.853932584269663, |
|
"step": 88, |
|
"total_flos": 343326692124408.0, |
|
"train_loss": 7.0837731144645, |
|
"train_runtime": 380.6612, |
|
"train_samples_per_second": 3.73, |
|
"train_steps_per_second": 0.231 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 88, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 343326692124408.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|