# mpt-30b-v3 / mpt-30b_orca.yaml
# Committed by manojpreveen: "Create mpt-30b_orca.yaml" (430185e)
# Shared values, referenced elsewhere in this file via ${...} interpolation
max_seq_len: 8192
global_seed: 17

# Run Name
# If left blank, will be read from env var $RUN_NAME
run_name: mpt-30b-orca-1ep
# Model
model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: manojpreveen/mpt-30b-orca-v2
  init_device: mixed
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: triton
      # Must be `true` whenever `train_loader.dataset.packing_ratio` is set
      # (this file sets it to 19.0). With `false`, the sequence ids that
      # mark packed examples are presumably ignored by attention, so packed
      # examples are not kept separate — verify against the model code.
      attn_uses_sequence_id: true
# Tokenizer
# Uses the same repository id as `model.pretrained_model_name_or_path`.
tokenizer:
  name: manojpreveen/mpt-30b-orca-v2
  kwargs:
    model_max_length: ${max_seq_len}
# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: csv
    hf_kwargs:
      data_dir: ~/mpt/llm-foundry/data/orca_1m_gpt4
    # No preprocessing function is configured; written as an explicit null
    # instead of a bare empty value.
    preprocessing_fn: null
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
    # to profile this run's optimal packing_ratio as it depends on GPU count,
    # batch size, sequence length.
    # When packing is enabled,
    # `model.config_overrides.attn_config.attn_uses_sequence_id` should be `true`.
    packing_ratio: 19.0
    shuffle: true
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0
# Optimization
scheduler:
  # linear no warmup is HF default which dolly used
  name: linear_decay_with_warmup
  # add some warmup though, seems to help with MPT
  t_warmup: 100ba
  alpha_f: 0
optimizer:
  # Based on Dolly
  name: decoupled_lionw
  lr: 2.0e-6
  # NOTE(review): 0.999 looks carried over from an AdamW template — confirm
  # it is the intended second beta for Lion.
  betas:
    - 0.9
    - 0.999
  # No `eps` here: Lion's sign-based update has no epsilon term, and
  # DecoupledLionW is expected to accept only lr / betas / weight_decay
  # (TODO confirm against the installed llm-foundry version).
  weight_decay: 0
# Algorithms
# Gradient clipping by norm, with a threshold of 1.0.
algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0
# Training duration and batch sizes
# 2-3 epochs seems like the sweet spot
max_duration: 1ep
eval_interval: 1
# eval_subset_num_batches: -1
# eval_first: true
# somewhere in the 6-8 * numgpus range seems good
global_train_batch_size: 8

# System
seed: ${global_seed}
# device_eval_batch_size: 8
device_train_microbatch_size: 2
# device_train_microbatch_size: auto
precision: amp_bf16
# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false
# Logging
# Progress bar is off; console logging is on, with an interval of `1ba`.
progress_bar: false
log_to_console: true
console_log_interval: 1ba
# Callbacks
# Only `speed_monitor` sets an option; the others are enabled with empty
# option mappings.
callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
# loggers:
#   wandb: {}

# Checkpoint to local filesystem or remote object store
# save_interval: 5000ba
# Important, this cleans up checkpoints saved to DISK
save_num_checkpoints_to_keep: 1
save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints