Spaces:

Hunzla
/

fine-tuning

Runtime error

App Files Files Community

fine-tuning / app.py

Hunzla

Create app.py

8fd601d about 1 year ago

raw

history blame

6.78 kB

	import gc
	import os

	import gradio as gr
	import torch
	from datasets import load_dataset
	from peft import LoraConfig, PeftModel
	from transformers import (
	    AutoModelForCausalLM,
	    AutoTokenizer,
	    BitsAndBytesConfig,
	    HfArgumentParser,
	    TrainingArguments,
	    pipeline,
	    logging,
	)
	from trl import SFTTrainer
	# Hugging Face Hub id of the base model; used for every from_pretrained()
	# call in this script (model and tokenizer).
	model_name = "meta-llama/Llama-2-7b-chat-hf"

	# Hub id of the instruction dataset handed to load_dataset()
	dataset_name = "mlabonne/guanaco-llama2-1k"

	# Name of the fine-tuned model: the trained weights are saved under it,
	# reloaded from it for the merge, and the tokenizer is pushed under it.
	new_model = "llama-2-7b-miniguanaco"

	################################################################################
	# QLoRA parameters (consumed by the LoraConfig built later in this script)
	################################################################################

	# LoRA attention dimension (passed as `r`)
	lora_r = 64

	# Alpha parameter for LoRA scaling
	lora_alpha = 16

	# Dropout probability for LoRA layers
	lora_dropout = 0.1

	################################################################################
	# bitsandbytes parameters (consumed by the BitsAndBytesConfig built later)
	################################################################################

	# Activate 4-bit precision base model loading
	use_4bit = True

	# Compute dtype for 4-bit base models, given as the NAME of a torch dtype;
	# it is resolved to the real dtype object with getattr(torch, ...).
	bnb_4bit_compute_dtype = "float16"

	# Quantization type (fp4 or nf4)
	bnb_4bit_quant_type = "nf4"

	# Activate nested quantization for 4-bit base models (double quantization)
	use_nested_quant = False

	################################################################################
	# TrainingArguments parameters
	################################################################################

	# Output directory where the model predictions and checkpoints will be stored
	output_dir = "./results"

	# Number of training epochs
	num_train_epochs = 1

	# Enable fp16/bf16 training (set bf16 to True with an A100)
	fp16 = False
	bf16 = False

	# Batch size per GPU for training
	per_device_train_batch_size = 4

	# Batch size per GPU for evaluation
	per_device_eval_batch_size = 4

	# Number of update steps to accumulate the gradients for
	gradient_accumulation_steps = 1

	# Enable gradient checkpointing
	gradient_checkpointing = True

	# Maximum gradient norm (gradient clipping)
	max_grad_norm = 0.3

	# Initial learning rate (AdamW optimizer)
	learning_rate = 2e-4

	# Weight decay to apply to all layers except bias/LayerNorm weights
	weight_decay = 0.001

	# Optimizer to use
	optim = "paged_adamw_32bit"

	# Learning rate schedule (author's note: constant is a bit better than cosine)
	lr_scheduler_type = "constant"

	# Number of training steps (overrides num_train_epochs)
	max_steps = -1

	# Ratio of steps for a linear warmup (from 0 to learning rate)
	warmup_ratio = 0.03

	# Group sequences into batches with same length
	# Saves memory and speeds up training considerably
	group_by_length = True

	# Save a checkpoint every X update steps
	save_steps = 25

	# Log every X update steps
	logging_steps = 25

	################################################################################
	# SFT parameters (consumed by the SFTTrainer built later in this script)
	################################################################################

	# Maximum sequence length to use (None is handed to SFTTrainer unchanged)
	max_seq_length = None

	# Pack multiple short examples in the same input sequence to increase efficiency
	packing = False

	# Load the entire model on GPU 0: the empty-string key maps the whole
	# module tree to device index 0.
	device_map = {"": 0}
	# Fetch the training split of the instruction dataset; any preprocessing of
	# the examples would go right after this call.
	dataset = load_dataset(dataset_name, split="train")

	# Turn the configured dtype name into the actual torch dtype object.
	compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

	# Quantization settings applied when the base model is loaded.
	bnb_config = BitsAndBytesConfig(
	    bnb_4bit_compute_dtype=compute_dtype,
	    bnb_4bit_quant_type=bnb_4bit_quant_type,
	    bnb_4bit_use_double_quant=use_nested_quant,
	    load_in_4bit=use_4bit,
	)

	# bf16 hint: when the 4-bit model computes in float16, tell the user if the
	# GPU could use bfloat16 instead (CUDA compute capability major >= 8).
	# The capability query is only made when a CUDA device is available,
	# because torch.cuda.get_device_capability() raises without one and this
	# purely informational message should never abort the script.
	if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
	    major, _ = torch.cuda.get_device_capability()
	    if major >= 8:
	        print("=" * 80)
	        print("Your GPU supports bfloat16: accelerate training with bf16=True")
	        print("=" * 80)

	# Instantiate the base checkpoint with the quantization settings and the
	# device placement configured earlier in the script.
	model = AutoModelForCausalLM.from_pretrained(
	    model_name,
	    device_map=device_map,
	    quantization_config=bnb_config,
	)
	# Two config fields are overridden on the loaded model before training.
	model.config.pretraining_tp = 1
	model.config.use_cache = False

	# Tokenizer belonging to the base model.
	tokenizer = AutoTokenizer.from_pretrained(
	    model_name,
	    trust_remote_code=True,
	)
	# Pad on the right (author's note: fixes a weird overflow issue with fp16
	# training) and reuse the end-of-sequence token as the padding token.
	tokenizer.padding_side = "right"
	tokenizer.pad_token = tokenizer.eos_token

	# LoRA adapter settings for causal language modelling, built from the
	# QLoRA values configured earlier; no bias terms are trained.
	peft_config = LoraConfig(
	    r=lora_r,
	    lora_alpha=lora_alpha,
	    lora_dropout=lora_dropout,
	    task_type="CAUSAL_LM",
	    bias="none",
	)

	# Set training parameters.
	# Every value comes from the settings section at the top of the script.
	# `per_device_eval_batch_size` and `gradient_checkpointing` are declared
	# there as well, so they are forwarded here too; otherwise the configured
	# values would be silently ignored and the library defaults used instead.
	# Metrics are reported to TensorBoard.
	training_arguments = TrainingArguments(
	    output_dir=output_dir,
	    num_train_epochs=num_train_epochs,
	    per_device_train_batch_size=per_device_train_batch_size,
	    per_device_eval_batch_size=per_device_eval_batch_size,
	    gradient_accumulation_steps=gradient_accumulation_steps,
	    gradient_checkpointing=gradient_checkpointing,
	    optim=optim,
	    save_steps=save_steps,
	    logging_steps=logging_steps,
	    learning_rate=learning_rate,
	    weight_decay=weight_decay,
	    fp16=fp16,
	    bf16=bf16,
	    max_grad_norm=max_grad_norm,
	    max_steps=max_steps,
	    warmup_ratio=warmup_ratio,
	    group_by_length=group_by_length,
	    lr_scheduler_type=lr_scheduler_type,
	    report_to="tensorboard",
	)

	# Assemble the supervised fine-tuning trainer: quantized base model, LoRA
	# settings, tokenizer and training arguments, reading the training text
	# from the dataset's "text" field.
	trainer = SFTTrainer(
	    model=model,
	    tokenizer=tokenizer,
	    args=training_arguments,
	    peft_config=peft_config,
	    train_dataset=dataset,
	    dataset_text_field="text",
	    max_seq_length=max_seq_length,
	    packing=packing,
	)

	# Run the fine-tuning loop.
	trainer.train()

	# Persist the trained model under the configured output name.
	trainer.model.save_pretrained(new_model)
	# Silence everything below CRITICAL in the transformers logger.
	logging.set_verbosity(logging.CRITICAL)

	# Smoke-test the freshly trained model through a text-generation pipeline,
	# wrapping the question in the [INST] ... [/INST] prompt format.
	prompt = "What is a large language model?"
	pipe = pipeline(
	    task="text-generation",
	    model=model,
	    tokenizer=tokenizer,
	    max_length=200,
	)
	result = pipe(f"<s>[INST] {prompt} [/INST]")
	print(result[0]["generated_text"])
	# Reload model in FP16 and merge it with LoRA weights.
	#
	# The quantized training model is still referenced by `model`, `trainer`
	# and `pipe`, and the FP16 copy below is placed on the same device through
	# `device_map`. Drop those references and release cached CUDA memory first
	# so both copies do not have to fit on the GPU at once.
	del pipe, trainer, model
	gc.collect()
	torch.cuda.empty_cache()

	base_model = AutoModelForCausalLM.from_pretrained(
	    model_name,
	    low_cpu_mem_usage=True,
	    return_dict=True,
	    torch_dtype=torch.float16,
	    device_map=device_map,
	)
	# Attach the saved adapter, then fold its weights into the base model.
	model = PeftModel.from_pretrained(base_model, new_model)
	model = model.merge_and_unload()

	# Reload tokenizer to save it alongside the merged model.
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	tokenizer.pad_token = tokenizer.eos_token
	tokenizer.padding_side = "right"

	# Publish the merged model and its tokenizer to the same Hub repository.
	# push_to_hub() needs the target repository id as its first argument;
	# calling it with no arguments raises a TypeError, so the model push now
	# uses the same arguments as the tokenizer push.
	model.push_to_hub(new_model, use_temp_dir=False)
	tokenizer.push_to_hub(new_model, use_temp_dir=False)
	def do_nothing(text):
	    """Return ``text`` exactly as received (identity callback)."""
	    unchanged = text
	    return unchanged
	# Create Gradio interface: one text input, one text output, wired to the
	# do_nothing callback, with a single example prompt.
	# The former `layout="vertical"` keyword is intentionally not passed: it is
	# deprecated in Gradio 3.x and no longer accepted by newer Gradio releases,
	# where it makes the constructor fail.
	interface = gr.Interface(
	    fn=do_nothing,
	    inputs="text",
	    outputs="text",
	    title="LLAMA-2-7B Chatbot",
	    description="Enter a prompt and get a chatbot response.",
	    examples=[["Tell me a joke."]],
	)

	# Start the web UI only when the file is executed as a script.
	if __name__ == "__main__":
	    interface.launch()