# Fine-tuning run configuration: run identity, model, tokenizer, training
# dataloader, optimization schedule, and FSDP settings.

# Shared values referenced below through ${...} interpolation.
max_seq_len: 8192
global_seed: 17

# Run Name
# If left blank, will be read from env var $RUN_NAME
run_name: mpt-30b-4ep

# Model
model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: manojpreveen/mpt-30b-v4
  init_device: mixed
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: triton
      # Must be `true` whenever `train_loader.dataset.packing_ratio` is set.
      # It is set below (19.0), so sequence-id-aware attention is enabled here.
      attn_uses_sequence_id: true

# Tokenizer
tokenizer:
  name: manojpreveen/mpt-30b-v4
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: csv
    hf_kwargs:
      data_dir: ~/mpt/llm-foundry/data/orca_1m_gpt4
    # No custom preprocessing function is configured.
    preprocessing_fn: null
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
    # to profile this run's optimal packing_ratio, as it depends on GPU count,
    # batch size, and sequence length.
    packing_ratio: 19.0
    shuffle: true
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

# Optimization
scheduler:
  # Linear decay without warmup is the HF default, which Dolly used.
  name: linear_decay_with_warmup
  # Some warmup is added anyway; it seems to help with MPT.
  t_warmup: 100ba
  alpha_f: 0

optimizer:
  # Based on Dolly
  name: decoupled_lionw
  lr: 1.0e-6
  betas:
    - 0.9
    - 0.999
  # NOTE(review): `eps` was carried over unchanged; confirm that the
  # `decoupled_lionw` optimizer accepts this argument.
  eps: 1.0e-8
  weight_decay: 0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 4ep  # 2-3 epochs seems like the sweet spot
eval_interval: 1
# eval_subset_num_batches: -1
# eval_first: true
global_train_batch_size: 8  # somewhere in the 6-8 * numgpus range seems good

# System
seed: ${global_seed}
# device_eval_batch_size: 8
device_train_microbatch_size: 2
# device_train_microbatch_size: auto
precision: amp_bf16

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false
# Logging
# Console output is enabled, the progress bar is disabled, and the console
# log interval is one batch (`1ba`).
progress_bar: false
log_to_console: true
console_log_interval: 1ba

# Callbacks attached to the run; an empty mapping means no extra options.
callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

# Optional experiment loggers (disabled).
# loggers:
#   wandb: {}

# Checkpoint to local filesystem or remote object store
save_interval: 1ep
# Important: this setting cleans up checkpoints saved to DISK.
save_num_checkpoints_to_keep: 4
save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints