In [2]:
!pip install datasets transformers datasets accelerate bitsandbytes peft


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# Load tokenizer and model with quantization for reduced memory usage
tokenizer = AutoTokenizer.from_pretrained("ContactDoctor/Bio-Medical-Llama-3-8B")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_compute_dtype=torch.bfloat16, # Use bfloat16 for computation
    bnb_4bit_quant_type="nf4", # Quantization type
    bnb_4bit_use_double_quant=True, # Double quantization
)
model = AutoModelForCausalLM.from_pretrained(
    "ContactDoctor/Bio-Medical-Llama-3-8B",
    quantization_config=quantization_config
)

# **PEFT Configuration:**
# Define the LoRA configuration for the adapter
# Changed target_modules to 'q_proj', 'k_proj', 'v_proj' based on the model architecture
peft_config = LoraConfig(
    r=8,  # Rank of the LoRA update matrices
    lora_alpha=32,  # Scaling factor for the LoRA update matrices
    lora_dropout=0.1,  # Dropout probability for the LoRA layers
    target_modules=["q_proj", "k_proj", "v_proj"], # Specify the target modules for applying LoRA
    bias="none",  # Bias type for the LoRA layers
    task_type="CAUSAL_LM", # Task type for fine-tuning
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters() # Print the number of trainable parameters

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 4,718,592 || all params: 8,034,979,840 || trainable%: 0.0587


In [4]:
df = pd.read_csv("/content/medquad.csv")
dataset = Dataset.from_pandas(df)

def preprocess_function(examples):
    question = examples["question"] if examples["question"] is not None else ""
    answer = examples["answer"] if examples["answer"] is not None else ""

    examples["text"] = "<s> " + question + " </s> " + answer + " </s>"
    return examples

dataset = dataset.map(preprocess_function)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")

tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenized_datasets = tokenized_datasets.map(lambda examples: {'labels': examples['input_ids']}, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,  # Reduced batch size from 16 to 4
    gradient_accumulation_steps=4,  # Increased gradient accumulation steps from 2 to 4
    num_train_epochs=1,
    fp16=True,
    logging_dir="./logs",
    learning_rate=2e-5,
    save_steps=100,
    dataloader_num_workers=8, # Utilize more CPU cores for data loading
    remove_unused_columns=True, # Remove unused columns from the dataset
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# Start training
trainer.train()


Map:   0%|          | 0/16412 [00:00<?, ? examples/s]

Map:   0%|          | 0/16412 [00:00<?, ? examples/s]

Map:   0%|          | 0/16412 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss
500,0.6311
1000,0.5396


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into 

TrainOutput(global_step=1025, training_loss=0.583683032524295, metrics={'train_runtime': 2589.4824, 'train_samples_per_second': 6.338, 'train_steps_per_second': 0.396, 'total_flos': 3.783418353549312e+17, 'train_loss': 0.583683032524295, 'epoch': 0.9992688276870583})

In [5]:
import os


model_folder = "/content/drive/MyDrive/Bio-Medical-Llama-3-8B-finetuned"


os.makedirs(model_folder, exist_ok=True)


trainer.save_model(os.path.join(model_folder, "model"))
tokenizer.save_pretrained(os.path.join(model_folder, "tokenizer"))

('/content/drive/MyDrive/Bio-Medical-Llama-3-8B-finetuned/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Bio-Medical-Llama-3-8B-finetuned/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Bio-Medical-Llama-3-8B-finetuned/tokenizer/tokenizer.json')