import sys
import logging

import datasets
from datasets import load_dataset
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

"""
A simple example of using SFTTrainer and Accelerate to finetune Phi-3 models. For
a more advanced example, please follow HF alignment-handbook/scripts/run_sft.py.
This example uses DeepSpeed ZeRO-3 offload to reduce memory usage. The script can
be run on V100 or later generation GPUs. Here are some suggestions on further
reducing memory consumption:
    - reduce batch size
    - decrease lora dimension
    - restrict lora target modules
Please follow these steps to run the script:
1. Install dependencies:
    conda install -c conda-forge accelerate
    pip3 install -i https://pypi.org/simple/ bitsandbytes
    pip3 install peft transformers trl datasets
    pip3 install deepspeed
2. Setup accelerate and deepspeed config based on the machine used:
    accelerate config
Here is a sample config for deepspeed zero3:
    compute_environment: LOCAL_MACHINE
    debug: false
    deepspeed_config:
        gradient_accumulation_steps: 1
        offload_optimizer_device: none
        offload_param_device: none
        zero3_init_flag: true
        zero3_save_16bit_model: true
        zero_stage: 3
    distributed_type: DEEPSPEED
    downcast_bf16: 'no'
    enable_cpu_affinity: false
    machine_rank: 0
    main_training_function: main
    mixed_precision: bf16
    num_machines: 1
    num_processes: 4
    rdzv_backend: static
    same_network: true
    tpu_env: []
    tpu_use_cluster: false
    tpu_use_sudo: false
    use_cpu: false
3. Check the accelerate config:
    accelerate env
4. Run the code:
    accelerate launch sample_finetune.py
"""

logger = logging.getLogger(__name__)


###################
# Hyper-parameters
###################
training_config = {
    "bf16": True,
    "do_eval": False,
    "learning_rate": 5.0e-06,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "max_steps": -1,
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 4,
    "per_device_train_batch_size": 4,
    "remove_unused_columns": True,
    "save_steps": 100,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.2,
}

peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}

train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)
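# The docstring above suggests decreasing the LoRA dimension and restricting the
# LoRA target modules to reduce memory further. The commented-out config below is
# a minimal sketch of that, not part of the recipe above; the module names
# "qkv_proj" and "o_proj" are assumptions about the Phi-3 implementation in
# transformers, so verify them against model.named_modules() before enabling it.
#
# low_memory_peft_config = {
#     "r": 8,                                     # smaller LoRA rank
#     "lora_alpha": 16,
#     "lora_dropout": 0.05,
#     "bias": "none",
#     "task_type": "CAUSAL_LM",
#     "target_modules": ["qkv_proj", "o_proj"],   # attention projections only
#     "modules_to_save": None,
# }
# peft_conf = LoraConfig(**low_memory_peft_config)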
"microsoft/Phi-3-mini-4k-instruct" checkpoint_path = "microsoft/Phi-3-mini-128k-instruct" model_kwargs = dict( use_cache=False, trust_remote_code=True, attn_implementation="flash_attention_2", # loading the model with flash-attenstion support torch_dtype=torch.bfloat16, device_map=None ) model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(checkpoint_path) tokenizer.model_max_length = 2048 tokenizer.pad_token = tokenizer.unk_token # use unk rather than eos token to prevent endless generation tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token) tokenizer.padding_side = 'right' ################## # Data Processing ################## def apply_chat_template( example, tokenizer, ): messages = example["messages"] example["text"] = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=False) return example raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k") train_dataset = raw_dataset["train_sft"] test_dataset = raw_dataset["test_sft"] column_names = list(train_dataset.features) processed_train_dataset = train_dataset.map( apply_chat_template, fn_kwargs={"tokenizer": tokenizer}, num_proc=10, remove_columns=column_names, desc="Applying chat template to train_sft", ) processed_test_dataset = test_dataset.map( apply_chat_template, fn_kwargs={"tokenizer": tokenizer}, num_proc=10, remove_columns=column_names, desc="Applying chat template to test_sft", ) ########### # Training ########### trainer = SFTTrainer( model=model, args=train_conf, peft_config=peft_conf, train_dataset=processed_train_dataset, eval_dataset=processed_test_dataset, max_seq_length=2048, dataset_text_field="text", tokenizer=tokenizer, packing=True ) train_result = trainer.train() metrics = train_result.metrics trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() ############# # Evaluation ############# tokenizer.padding_side = 'left' metrics = trainer.evaluate() metrics["eval_samples"] = len(processed_test_dataset) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # ############ # # Save model # ############ trainer.save_model(train_conf.output_dir)