max_seq_len: 8192
global_seed: 17
# Run Name
run_name: mpt-30b-4ep # If left blank, will be read from env var $RUN_NAME
model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: manojpreveen/mpt-30b-v4
  init_device: mixed
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: triton
      # Set this to `true` if using `train_loader.dataset.packing_ratio` below
      attn_uses_sequence_id: false
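# NOTE: `attn_impl: triton` relies on llm-foundry's Triton-based FlashAttention
# kernels, which typically assumes a GPU install of llm-foundry
# (e.g. `pip install -e ".[gpu]"`); `attn_impl: torch` is the usual fallback if
# Triton is unavailable.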
# Tokenizer
tokenizer:
  name: manojpreveen/mpt-30b-v4
  kwargs:
    model_max_length: ${max_seq_len}
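# The tokenizer is loaded from the same HF repo as the model, so the vocab and
# special tokens match; `model_max_length` mirrors `max_seq_len` above.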
# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: csv
    hf_kwargs:
      data_dir: ~/mpt/llm-foundry/data/orca_1m_gpt4
    preprocessing_fn:
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
    # to profile this run's optimal packing_ratio, as it depends on GPU count,
    # batch size, and sequence length
    packing_ratio: 19.0
    shuffle: true
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0
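# Assumption about the CSV dataset above: with `preprocessing_fn` left empty, the
# finetuning loader expects each row to already provide `prompt` and `response`
# columns; otherwise, point `preprocessing_fn` at a `module.path:function` that
# produces them.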
# Optimization
scheduler:
  name: linear_decay_with_warmup  # linear decay with no warmup is the HF default that Dolly used
  t_warmup: 100ba  # add some warmup though; it seems to help with MPT
  alpha_f: 0
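# Scheduler semantics (Composer): `100ba` warms up over the first 100 batches,
# then the LR decays linearly; `alpha_f: 0` means it reaches 0x the peak LR at
# `max_duration`.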
optimizer:
  # Based on Dolly
  name: decoupled_lionw
  lr: 1.0e-6
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-8
  weight_decay: 0
algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0
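# Clips the global gradient norm to 1.0 each step before the optimizer update.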
max_duration: 4ep # 2-3 epochs seems like the sweet spot
eval_interval: 1
# eval_subset_num_batches: -1
# eval_first: true
global_train_batch_size: 8 # somewhere in the 6-8 * numgpus range seems good
# System
seed: ${global_seed}
# device_eval_batch_size: 8
device_train_microbatch_size: 2
# device_train_microbatch_size: auto
precision: amp_bf16
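# Batch-size arithmetic (illustrative, assuming 4 GPUs): each device sees
# 8 / 4 = 2 samples per step, which equals `device_train_microbatch_size`, so no
# gradient accumulation is needed; with fewer GPUs, Composer accumulates
# 8 / (num_gpus * 2) microbatches per step.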
# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false
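# FULL_SHARD shards parameters, gradients, and optimizer state across ranks;
# `mixed_precision: PURE` keeps params, grads, and communication in bf16 to match
# `precision: amp_bf16`; activation checkpointing trades recompute for memory,
# which is usually needed at 30B scale.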
# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba
callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
# loggers:
#   wandb: {}
# Checkpoint to local filesystem or remote object store
save_interval: 1ep
save_num_checkpoints_to_keep: 4  # Important: this cleans up checkpoints saved to DISK
save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints
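# A minimal launch sketch (assumed workflow, not taken from this file): save this
# config under llm-foundry/scripts/train/yamls/ and run it with the Composer
# launcher, e.g.
#   cd llm-foundry/scripts/train
#   composer train.py yamls/finetune/mpt-30b-orca.yaml  # hypothetical filename
# Checkpoints are written to ./{run_name}/checkpoints as configured above.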