In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

# The checkpoint is read from the current working directory, so the notebook
# has to be started from inside the model folder.
model_path = os.getcwd()
print(model_path)

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    legacy=False,
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
    use_safetensors=True,
)

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# Disabled smoke test: generate a few tokens with the loaded model and print
# the decoded text. Uncomment the three lines below to run it.
#inputs = tokenizer('', return_tensors="pt")
#outputs = model.generate(inputs['input_ids'], max_new_tokens=20, temperature=0)
#print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Bare expression: the tokenizer's repr becomes the cell output.
tokenizer

LlamaTokenizerFast(name_or_path='/var/home/ngxson/jupyter/stories-15M', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': ''}, clean_up_tokenization_spaces=False), added_tokens_decoder={
	0: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [None]:
# Turn on gradient checkpointing for the base model (done here, before the
# model is wrapped with LoRA adapters in a later cell).
model.gradient_checkpointing_enable()

In [None]:
from peft import LoraConfig, get_peft_model

# Names of the sub-modules that receive LoRA adapters. The *_proj entries are
# presumably the attention projections and w1/w2/w3 the feed-forward weights;
# verify against `print(model)` if the architecture changes.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "w1",
    "w2",
    "w3",
    "lm_head",
]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=128,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

#print(model)

In [None]:
def split_and_trim(text):
    """Split raw text into paragraphs and remove per-line leading whitespace.

    Paragraphs are separated by a blank line (two consecutive newlines).
    Within each paragraph every line has its leading whitespace removed;
    trailing whitespace on a line is left as is.

    Paragraphs that end up empty (for example when the input is empty or
    contains several blank lines in a row) are dropped, and a stray leading
    newline left over from an odd run of blank lines is stripped, so callers
    never receive an empty or newline-prefixed paragraph.

    Args:
        text: The full text to split.

    Returns:
        A list of non-empty paragraph strings, in their original order.
    """
    trimmed_paragraphs = []
    for para in text.strip().split('\n\n'):
        trimmed = '\n'.join(line.lstrip() for line in para.split('\n'))
        # Three or more consecutive newlines leave a leading '\n' (or nothing
        # at all) in the piece that follows the separator.
        trimmed = trimmed.strip('\n')
        if trimmed:
            trimmed_paragraphs.append(trimmed)

    return trimmed_paragraphs

# Read the whole corpus. The encoding is pinned so the result does not depend
# on the platform's default locale, and the file is closed before tokenizing.
with open("data.txt", "r", encoding="utf-8") as f:
    content = f.read()

# One training example per paragraph: keep only the token ids of each one.
dataset = split_and_trim(content)
tokenized_train_dataset = [
    tokenizer(paragraph)['input_ids'] for paragraph in dataset
]
#tokenized_train_dataset

In [None]:
import transformers
from datetime import datetime

project = "moe_shakespeare15M"
run_name = project
output_dir = "./" + run_name

# Padding reuses the end-of-sequence token (set again here so this cell does
# not depend on an earlier cell having done it).
tokenizer.pad_token = tokenizer.eos_token

# Options forwarded to gradient checkpointing; passed to TrainingArguments
# below so that the non-reentrant setting actually takes effect.
checkpointing_args = {"use_reentrant": False}
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=100,
        # 50 sequences per device x 5 accumulation steps per optimizer step.
        per_device_train_batch_size=50,
        gradient_accumulation_steps=5,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs=checkpointing_args,
        max_steps=500,
        learning_rate=2.5e-5,  # Want a small lr for finetuning
        # fp16=True,
        optim="adamw_torch",
        # Checkpoint every 100 steps, keeping at most the 4 most recent.
        save_strategy="steps",
        save_steps=100,
        logging_steps=20,
        save_total_limit=4,
        report_to="none",
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    ),
    # mlm=False selects the causal (next-token) language-modeling collator.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()