# Source snapshot: commit 17db2ea (3,123 bytes); extraction artifacts removed.
# Fine-tuning setup: environment token, tokenizer, base model, training data.
import os
# Maximum tokenized sequence length per example.
CONTEXT_WINDOW = 1024 #has to fit in 4090
# Hugging Face access token from the environment (None if unset).
HF_TOKEN = os.getenv("HF_TOKEN")
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
    Trainer, DataCollatorForLanguageModeling
)
import torch
from datasets import load_dataset
from huggingface_hub import login
# setup tokenizer
tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-1.2B-instruct", token=HF_TOKEN)
if tokenizer.pad_token is None:
    # Reuse EOS as the pad token so the collator can pad batches.
    tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): left padding suits generation; right padding is the usual
# choice for causal-LM training — confirm this is intentional.
tokenizer.padding_side = "left" # better for inference
# init model with auto device mapping
model = AutoModelForCausalLM.from_pretrained(
    "Zyphra/Zamba2-1.2B-instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto" # handles multi-gpu/cpu mapping
)
# Keep the model config's pad id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id
# Load the Dutch Dolly dataset
dataset = load_dataset("BramVanroy/dolly-15k-dutch", split="train_sft")
def prepare_chat_format(examples, max_length=CONTEXT_WINDOW):
    """Tokenize batched chat transcripts for causal-LM fine-tuning.

    Args:
        examples: batch dict from ``Dataset.map(batched=True)``; expects a
            ``'messages'`` column of lists of role/content dicts.
        max_length: truncation length in tokens (default CONTEXT_WINDOW),
            added as a backward-compatible parameter.

    Returns:
        dict with an ``'input_ids'`` column of token-id lists.
    """
    chats = []
    for messages in examples['messages']:
        try:
            chat = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                max_length=max_length,
                truncation=True,
                return_tensors=None
            )
        except Exception as e:
            print(f"Error applying chat template: {e}")
            # Fallback format if chat template fails.
            # Fix: use the tokenizer's actual EOS token instead of the
            # hard-coded "</s>", which may not exist in this model's vocab.
            eos = tokenizer.eos_token or ""
            # Build the prompt with join rather than quadratic `+=`.
            text = "".join(
                f"<|{m['role']}|>\n{m['content']}{eos}\n" for m in messages
            )
            chat = tokenizer(
                text,
                max_length=max_length,
                truncation=True,
                return_tensors=None
            )["input_ids"]
        chats.append(chat)
    return {"input_ids": chats}
# Process the dataset
# Batched map; removing the original columns leaves only the tokenized
# 'input_ids' produced by prepare_chat_format.
tokenized_dataset = dataset.map(
    prepare_chat_format,
    batched=True,
    remove_columns=dataset.column_names
)
# training config
training_args = TrainingArguments(
    output_dir="./zamba2-finetuned",
    num_train_epochs=2,
    per_device_train_batch_size=4,  # effective batch = 4 * 8 (grad accumulation)
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints
    logging_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=False,
    bf16=True,  # matches the model's bfloat16 weights loaded above
    gradient_accumulation_steps=8,
    dataloader_num_workers=4,
    gradient_checkpointing=True,  # trade compute for activation memory
    max_grad_norm=1.0,
    warmup_steps=100
)
# Causal-LM collator: mlm=False means labels are the (shifted) input ids.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
# custom trainer to handle device mapping
class CustomTrainer(Trainer):
    """Trainer that skips device placement for a device_map="auto" model.

    The base Trainer would move the model to a single device; this model is
    already sharded across devices by accelerate's device map, so placement
    is a no-op here.

    Fix: removed the original ``__init__`` override, which rebound
    ``self.model`` to the module-global ``model`` — silently discarding any
    model passed to the constructor (and redundant, since Trainer.__init__
    already assigns ``self.model`` from its argument).
    """

    def _move_model_to_device(self, model, device):
        # Intentionally a no-op: the device map owns placement.
        pass
# Build the trainer over the tokenized dataset (no eval set is configured).
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)
# Add explicit training and saving steps
trainer.train()
# Save final weights and tokenizer side by side so the directory is
# directly loadable with from_pretrained.
model.save_pretrained("./zamba2-finetuned-final")
tokenizer.save_pretrained("./zamba2-finetuned-final")