File size: 3,123 Bytes
17db2ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os

# Upper bound on tokens kept per training example; per the original author's
# note this is sized so the run fits on a 4090.
CONTEXT_WINDOW = 1024
# Hugging Face access token taken from the environment (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")

from transformers import (
    AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
    Trainer, DataCollatorForLanguageModeling
)
import torch
from datasets import load_dataset
from huggingface_hub import login

# Single source of truth for the checkpoint so tokenizer and model always match.
BASE_MODEL_ID = "Zyphra/Zamba2-1.2B-instruct"

# Set up the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse EOS so batches can be padded.
    # NOTE(review): the DataCollatorForLanguageModeling used later in this file
    # masks pad-token labels, so with pad == EOS the EOS positions are presumably
    # excluded from the loss as well — confirm this is acceptable.
    tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): left padding is normally chosen for generation; this script
# trains, so confirm left padding is intended here.
tokenizer.padding_side = "left"

# Load the model in bfloat16 and let device_map="auto" place it on the
# available devices (multi-GPU / CPU).
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=HF_TOKEN,  # same credentials as the tokenizer download above
)
# Keep the model config in agreement with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id

# Load the "train_sft" split of the Dutch Dolly dataset.
dataset = load_dataset("BramVanroy/dolly-15k-dutch", split="train_sft")

def prepare_chat_format(examples):
    """Tokenize a batch of conversations into ``input_ids`` lists.

    Each entry of ``examples['messages']`` is rendered and tokenized with the
    tokenizer's chat template, truncated to ``CONTEXT_WINDOW`` tokens. When the
    template call raises, the error is printed and the conversation is instead
    flattened into a plain role-tagged text layout and tokenized with the same
    length limit.

    Args:
        examples: Batch mapping with a ``'messages'`` column; the fallback path
            reads the ``"role"`` and ``"content"`` keys of every message.

    Returns:
        ``{"input_ids": [...]}`` with one token-id sequence per conversation.
    """
    # Options shared by the chat-template path and the fallback tokenizer call.
    encode_options = {
        "max_length": CONTEXT_WINDOW,
        "truncation": True,
        "return_tensors": None,
    }

    def _encode(conversation):
        # Preferred path: let the tokenizer's own chat template do the formatting.
        try:
            return tokenizer.apply_chat_template(
                conversation, tokenize=True, **encode_options
            )
        except Exception as e:
            print(f"Error applying chat template: {e}")
            # Best-effort fallback: hand-rolled role markers, one turn per line.
            fallback_text = "".join(
                f"<|{turn['role']}|>\n{turn['content']}</s>\n"
                for turn in conversation
            )
            return tokenizer(fallback_text, **encode_options)["input_ids"]

    return {
        "input_ids": [_encode(conversation) for conversation in examples['messages']]
    }

# Tokenize the whole dataset in batches; every original column is dropped so
# only the "input_ids" column produced by prepare_chat_format remains.
tokenized_dataset = dataset.map(
    prepare_chat_format,
    remove_columns=dataset.column_names,
    batched=True,
)

# Optimisation schedule: 4 examples per device x 8 accumulation steps per
# optimizer update, with gradient clipping and a short warmup.
optimization_settings = {
    "num_train_epochs": 2,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 8,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "warmup_steps": 100,
}

# Numeric precision and memory-related switches (bf16 on, fp16 off).
precision_settings = {
    "fp16": False,
    "bf16": True,
    "gradient_checkpointing": True,
}

# Checkpointing, logging cadence and data-loading workers.
bookkeeping_settings = {
    "save_steps": 500,
    "save_total_limit": 2,
    "logging_steps": 100,
    "dataloader_num_workers": 4,
}

# Training configuration assembled from the groups above.
training_args = TrainingArguments(
    output_dir="./zamba2-finetuned",
    **optimization_settings,
    **precision_settings,
    **bookkeeping_settings,
)

# Collator that batches and pads examples; mlm=False turns off masked-LM mode.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Custom trainer for a model whose device placement was decided at load time.
class CustomTrainer(Trainer):
    """Trainer that never relocates the model it is given.

    The model passed to the constructor is used as-is (the base ``Trainer``
    already stores it), so this class does not depend on any module-level
    variable. It only disables the base class's device transfer, for models
    that were already mapped to their devices when they were loaded.
    """

    def _move_model_to_device(self, model, device):
        """Do nothing: the model is already mapped to its devices.

        Args:
            model: The model the base trainer wants to move (ignored).
            device: The target device chosen by the base trainer (ignored).
        """
        return None

# Wire the model, hyperparameters, tokenized data and collator into the trainer.
trainer = CustomTrainer(
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    args=training_args,
    model=model,
)

# Run fine-tuning, then write the final model and tokenizer into one directory.
trainer.train()
final_dir = "./zamba2-finetuned-final"
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_dir)