max_seq_len: 8192
global_seed: 17

run_name: mpt-30b-orca-1ep
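
# Launch sketch (assumes llm-foundry's standard entry point, run from the
# repo's scripts/ directory; the YAML path below is hypothetical and should
# be whatever this file is saved as):
#   composer train/train.py yamls/finetune/mpt-30b-orca-1ep.yaml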
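
# Model: warm-start from the MPT-30B Orca checkpoint on the Hugging Face Hub.
# config_overrides patches the pretrained config to use triton flash
# attention (GPU-only), and init_device: mixed lets rank 0 materialize the
# pretrained weights while other ranks initialize on the meta device (weights
# are synced when FSDP shards the model). attn_uses_sequence_id: false means
# packed examples can attend across example boundaries, a common trade-off
# when sequence packing is enabled (see train_loader below).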
model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: manojpreveen/mpt-30b-orca-v2
  init_device: mixed
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: triton
      attn_uses_sequence_id: false
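
# Tokenizer: loaded from the same Hub repo as the model so the vocab and
# special tokens stay consistent; model_max_length mirrors max_seq_len.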
tokenizer:
  name: manojpreveen/mpt-30b-orca-v2
  kwargs:
    model_max_length: ${max_seq_len}
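
# Data: the `finetuning` loader reads prompt/response pairs, here from local
# CSV files via the Hugging Face `csv` loading script. preprocessing_fn is
# left blank, so the data is presumably already in the loader's expected
# prompt/response format. packing_ratio: 19.0 packs roughly 19 short
# examples into each 8192-token sequence; this value should be profiled per
# dataset (llm-foundry ships a packing-profiling script), since too high a
# ratio can silently discard examples.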
train_loader:
  name: finetuning
  dataset:
    hf_name: csv
    hf_kwargs:
      data_dir: ~/mpt/llm-foundry/data/orca_1m_gpt4
    preprocessing_fn:
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    packing_ratio: 19.0
    shuffle: true
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0
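
# LR schedule: 100 batches of linear warmup, then linear decay to a final
# multiplier of 0 (alpha_f) over the rest of the run.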
scheduler:
  name: linear_decay_with_warmup
  t_warmup: 100ba
  alpha_f: 0
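
# Optimizer: decoupled LionW. The small LR is in line with the Lion paper's
# suggestion of roughly 3-10x lower learning rates than AdamW; Lion has no
# epsilon term (see note below).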
optimizer:
  name: decoupled_lionw
  lr: 2.0e-6
  betas:
  - 0.9
  - 0.999
  # eps: 1.0e-8  # not a decoupled_lionw argument; Lion has no epsilon term
  weight_decay: 0
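
# Clip the global gradient norm at 1.0 via Composer's gradient_clipping
# algorithm.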
algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0
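
# One pass over the packed data. eval_interval is effectively inert because
# no eval_loader is configured. With global_train_batch_size: 8 and
# device_train_microbatch_size: 2, Composer derives gradient accumulation
# automatically, e.g. 8 / 2 = 4 microbatches per optimizer step on one GPU,
# split across devices on multi-GPU runs.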
max_duration: 1ep
eval_interval: 1

global_train_batch_size: 8
seed: ${global_seed}
device_train_microbatch_size: 2
precision: amp_bf16
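
# FSDP: FULL_SHARD shards parameters, gradients, and optimizer state across
# ranks (ZeRO-3 style); PURE mixed precision keeps computation and gradient
# reduction in bf16, matching amp_bf16 above. Activation checkpointing
# trades recompute for memory, which a 30B model at 8k context generally
# needs even when fully sharded.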
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false
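
# Logging: console only, one log line per batch; the progress bar is
# disabled, which keeps captured stdout readable.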
progress_bar: false
log_to_console: true
console_log_interval: 1ba
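
# Callbacks: throughput averaged over a 10-batch window, plus LR, GPU
# memory, and remaining-time monitoring.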
callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
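
# Checkpointing: keep only the latest checkpoint; {run_name} is interpolated
# by Composer at save time. save_interval is not set, so the default of once
# per epoch applies, i.e. one checkpoint at the end of this 1-epoch run.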
save_num_checkpoints_to_keep: 1
save_folder: ./{run_name}/checkpoints