"""LLAMA_Fine-Tuning.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1C-kNPOgPiCC9ybxVKhOkWB9ts53APbOb

# Fine-tune Llama 3 in Google Colab
"""

"""
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
!pip install datasets
!pip install --upgrade accelerate peft bitsandbytes transformers trl
"""
|
|
|
|
|
import os |
|
import torch |
|
from datasets import load_dataset |
|
from transformers import ( |
|
AutoModelForCausalLM, |
|
AutoTokenizer, |
|
BitsAndBytesConfig, |
|
HfArgumentParser, |
|
TrainingArguments, |
|
pipeline, |
|
logging, |
|
) |
|
from peft import LoraConfig, PeftModel |
|
from trl import SFTTrainer |
|
|
|
|
|
# Hugging Face Hub id of the checkpoint that is loaded for fine-tuning; it is
# passed to both AutoTokenizer.from_pretrained and
# AutoModelForCausalLM.from_pretrained further down.
model_name = "unsloth/llama-3-8b-bnb-4bit"

# Hub access token handed to the from_pretrained calls. It is read from the
# HF_TOKEN environment variable when that is set, so the secret does not have
# to be written into the script; "XXXX" remains the placeholder fallback.
token_name = os.environ.get("HF_TOKEN", "XXXX")

# Local name the trained model is written to by model.save_pretrained.
new_model = "llama3_python_TFG"
|
|
|
|
|
|
|
|
|
|
|
|
|
# LoRA hyperparameters.
# NOTE(review): presumably the rank, scaling factor and dropout passed to
# peft.LoraConfig -- confirm against the place where they are consumed.
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
|
|
|
|
|
|
|
|
|
|
|
|
|
# 4-bit quantization settings.
# NOTE(review): the names suggest they are meant for a BitsAndBytesConfig
# (load_in_4bit, compute dtype, quant type, double quantization) -- confirm
# against the place where they are consumed.
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
|
|
|
|
|
|
|
|
|
|
|
|
|
# Training hyperparameters. They are plain module-level constants; which of
# them actually reach TrainingArguments is decided where that object is built.

# Directory handed to TrainingArguments as output_dir.
output_dir = "./results"

num_train_epochs = 1

# Mixed-precision switches (both disabled).
fp16 = False
bf16 = False

per_device_train_batch_size = 4
per_device_eval_batch_size = 4

gradient_accumulation_steps = 2
gradient_checkpointing = True
max_grad_norm = 0.3

learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"

# NOTE(review): -1 presumably means "no step cap, use num_train_epochs", as in
# TrainingArguments.max_steps -- confirm.
max_steps = -1
warmup_ratio = 0.03

group_by_length = True

save_steps = 0
logging_steps = 25
|
|
|
|
|
|
|
|
|
|
|
|
|
# Supervised fine-tuning settings.
# NOTE(review): max_seq_length and packing look like SFTTrainer arguments --
# confirm against the place where they are consumed.
max_seq_length = None
packing = False

# Device placement map; the empty-string key maps to device index 0.
device_map = {"": 0}
|
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding |
|
from datasets import Dataset |
|
|
|
def load_text_file(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line, each stripped of leading and trailing
        whitespace. Lines that are empty after stripping are left out.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Strip each line once, then drop the ones that end up empty.
        stripped_lines = (line.strip() for line in f)
        return [text for text in stripped_lines if text]
|
|
|
# One training / validation example per non-blank line of each file.
train_texts = load_text_file('LLAMA_DatosEntrenamiento.txt')
val_texts = load_text_file('LLAMA_DatosValidacion.txt')

# Tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token_name)

# Batches are padded later (padding="longest"), which raises if the tokenizer
# has no pad token. Fall back to the EOS token only when none is configured,
# so tokenizers that already define one are left untouched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
|
|
|
def tokenize_and_encode(texts):
    """Tokenize a list of strings and attach causal-LM labels.

    Uses the module-level ``tokenizer``. Sequences are truncated to 512 tokens
    and padded to the longest sequence in ``texts``.

    Args:
        texts: List of strings to tokenize.

    Returns:
        The tokenizer output (PyTorch tensors) with an added ``labels`` entry:
        a copy of ``input_ids`` in which padding positions are set to -100.
    """
    encodings = tokenizer(texts, truncation=True, padding="longest", max_length=512, return_tensors="pt")

    labels = encodings['input_ids'].clone()
    # Padding carries no training signal. -100 is the index the Hugging Face
    # causal-LM loss ignores, so padded positions no longer count as targets.
    labels[encodings['attention_mask'] == 0] = -100
    encodings['labels'] = labels

    return encodings
|
|
|
# Tokenize each split separately; each split is padded to its own longest
# sequence by tokenize_and_encode.
train_encodings = tokenize_and_encode(train_texts)
val_encodings = tokenize_and_encode(val_texts)

# Convert every tensor in the encodings to a NumPy array and wrap the result in
# a datasets.Dataset, one column per encoding key.
train_dataset = Dataset.from_dict(
    {column: tensor.numpy() for column, tensor in train_encodings.items()}
)
val_dataset = Dataset.from_dict(
    {column: tensor.numpy() for column, tensor in val_encodings.items()}
)
|
|
|
# Trainer configuration. Values that duplicate a module-level constant now
# reference that constant (the values are identical), so the configuration
# section stays the single source of truth.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    logging_steps=logging_steps,
    # Kept as a literal on purpose: the module-level `save_steps` constant is
    # 0, and using it here would change the checkpoint cadence.
    save_steps=1000,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard",
    fp16=fp16,
)
|
|
|
# Local import: these two helpers are not part of the file-level peft import.
from peft import get_peft_model, prepare_model_for_kbit_training

# Load the base checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name, token=token_name)

# A purely quantized model cannot be fine-tuned directly: Trainer rejects it
# unless trainable adapters are attached. When the loaded checkpoint is
# quantized, wrap it with LoRA adapters built from the lora_* constants.
# Non-quantized checkpoints are left untouched (full fine-tuning, as before).
if getattr(model, "is_quantized", False) or getattr(model, "is_loaded_in_4bit", False):
    # The generation cache is incompatible with gradient checkpointing.
    model.config.use_cache = False
    model = prepare_model_for_kbit_training(
        model, use_gradient_checkpointing=gradient_checkpointing
    )
    model = get_peft_model(
        model,
        LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
        ),
    )
|
|
|
# Collator that pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer wiring: model, arguments, both dataset splits and the collator.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)
|
|
|
# Run the training loop configured above.
trainer.train()

# Write the trained model to the local directory named by `new_model`.
model.save_pretrained(new_model)

# Upload the trained model to the Hugging Face Hub repository.
model.push_to_hub("eibeel/llama3-python-TFG")
|
|
|
|
|
|
|
|
|
|