### Optional: install the necessary packages

In [1]:
# Let git persist credentials on disk, so that the Hub login performed later
# (which is called with add_to_git_credential=True) can register the token with git.
!git config --global credential.helper store
# Hugging Face Hub client; provides the `login` helper used by the next cell.
%pip install huggingface_hub
# -U upgrades an already installed package to the newest release.
%pip install -U datasets
%pip install -U bitsandbytes
# transformers is installed from the GitHub repository rather than from PyPI.
%pip install -q git+https://github.com/huggingface/transformers.git
# Remaining training dependencies (-q keeps pip quiet). `datasets` is listed a second time here.
%pip install -q accelerate datasets peft torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Log in to Hugging Face

In [2]:
from huggingface_hub import login
import os

# Resolve the Hugging Face token at run time instead of hard-coding it in the notebook
# (a token committed with the notebook is a leaked credential).
# Lookup order: 1. the HF_TOKEN environment variable, 2. the Google Colab user data
# (only when the notebook actually runs on Colab).
HF_TOKEN = os.environ.get('HF_TOKEN') or None  # an empty variable counts as "not set"

if HF_TOKEN is not None:
  print("Hugging Face token found in environment variable")
else:
  try:
    from google.colab import userdata
  except ModuleNotFoundError:
    # Not running on Colab: the environment variable was the only possible source.
    userdata = None
  if userdata is not None:
    # NOTE(review): userdata.get may itself raise when the secret is missing or access
    # to it was not granted -- confirm against the Colab documentation.
    HF_TOKEN = userdata.get('HF_TOKEN') or None

# Fail early, with one message, whatever the platform.
if HF_TOKEN is None:
  raise ValueError("Please set your Hugging Face token in the user data panel, or pass it as an environment variable")

# add_to_git_credential=True also hands the token to git's credential helper.
login(
  token=HF_TOKEN,
  add_to_git_credential=True
)

Hugging Face token found in environment variable


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


### Set the environment variables

In [3]:
import os
# Central configuration of the notebook: model / dataset identifiers and the system prompt.
# Alternative base model: "HuggingFaceM4/Idefics3-8B-Llama3"
source_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
destination_model_id = "eltorio/IDEFICS3_medical_instruct"
dataset_id = "ruslanmv/ai-medical-dataset"
prompt = "You are a medical doctor with 15 year of experience verifying the knowledge of a new diploma medical doctor"
output_dir = "IDEFICS3_medical_instruct"

# Mirror every setting into the process environment (same keys, same order as before)
# so that they are also visible to child processes.
os.environ.update(
    output_dir=output_dir,
    source_model_id=source_model_id,
    destination_model_id=destination_model_id,
    dataset_id=dataset_id,
    prompt=prompt,
)

### Optionally clone the model repository

In [4]:
# clone Hugging Face model repository
!echo $destination_model_id
!git clone https://huggingface.co/$destination_model_id $output_dir

eltorio/IDEFICS3_medical_instruct
fatal: destination path 'IDEFICS3_medical_instruct' already exists and is not an empty directory.


### Load the dataset

In [5]:
from datasets import load_dataset

# Split ONCE and with a fixed seed. Calling train_test_split() twice draws two independent
# random shuffles, so the resulting "train" and "test" parts overlap (evaluation leakage)
# and differ on every run.
SPLIT_SEED = 42

base_dataset = load_dataset(dataset_id)
dataset_splits = base_dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# 80% of the records are used for training ...
train_dataset = dataset_splits["train"]
# ... and the remaining, disjoint 20% for evaluation.
eval_dataset = dataset_splits["test"]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

### Configure LoRA adapters

In [6]:
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoProcessor, MllamaForConditionalGeneration, BitsAndBytesConfig, Idefics3ForConditionalGeneration

DEVICE = "cuda:0"
# Fine-tuning mode switches. When both are False the model is fully fine-tuned.
USE_LORA = False
USE_QLORA = True

processor = AutoProcessor.from_pretrained(source_model_id, do_image_splitting=False)

if not (USE_QLORA or USE_LORA):
    # Full fine-tuning: load the half-precision weights and move them to the GPU.
    model = MllamaForConditionalGeneration.from_pretrained(
        source_model_id,
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",  # This works for A100 or H100
    )
    model = model.to(DEVICE)
else:
    if USE_QLORA:
        # QLoRA: the base weights are loaded 4-bit quantized (NF4), computing in float16.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

    # Adapters are attached to the attention and MLP projection layers.
    # Alternative (regex) target selection that was tried before:
    # '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$'
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        task_type="CAUSAL_LM",
        use_dora=not USE_QLORA,  # DoRA only in the plain LoRA mode
        init_lora_weights="gaussian",
    )

    # Load the (optionally quantized) base model and wrap it with the adapters.
    model = get_peft_model(
        MllamaForConditionalGeneration.from_pretrained(
            source_model_id,
            torch_dtype=torch.float16,
            quantization_config=None if not USE_QLORA else bnb_config,
        ),
        lora_config,
    )

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
# Peek at a single evaluation record to check its fields ('question' and 'context').
# Only the last expression of a cell is rendered, so the bare `eval_dataset` line that
# used to precede it had no visible effect and is dropped.
eval_dataset[24]

{'question': 'What is the percentage of adult malignancies?',
 'context': 'Renal cell carcinoma accounts for about 3% of adult malignancies and 85% of neoplasms arising from the kidney. To identify potential progression markers for kidney cancer we examined non-neoplastic and neoplastic kidney tissue from three groups of patients, which represent different tumor stages (pT1, pT2, pT3) by a fluorescence two-dimensional difference gel electrophoresis (2D-DIGE) approach combined with MALDI-ToF-MS/MS. Delta2D software package was used for gel image based quantification and statistical analysis. Thereby, a comprehensive Principal Component Analysis (PCA) could be performed and allowed a robust quality control of the experiment as well as a classification of the analyzed samples, which correlated with the predicted stages from the pathological examination. Additionally for selected candidate proteins we detected a correlation to the tumor grading as revealed by immunohistochemistry. On the 2

### Create the data collator (chat format of the Llama 3.2 Vision source model)

In [8]:
class MyDataCollator:
    """Turn raw question/context records into a padded, labelled training batch.

    Every sample is rendered as a three-turn chat (system prompt, user question,
    assistant answer) with the processor's chat template, the rendered texts are
    tokenized together with padding, and `labels` is added as a copy of `input_ids`
    in which padding positions are replaced by ``IGNORE_INDEX`` so that they do not
    contribute to the loss.

    Args:
        processor: object providing ``apply_chat_template(messages, add_generation_prompt=...)``
            and callable as ``processor(text=..., return_tensors="pt", padding=True)``.
        system_prompt: text of the system turn. When ``None`` (default) the
            notebook-level ``prompt`` variable is read each time the collator is called.
    """

    # Label value used to mark positions that must be ignored by the loss.
    IGNORE_INDEX = -100

    def __init__(self, processor, system_prompt=None):
        self.processor = processor
        self.system_prompt = system_prompt
        # Not used by the collator itself; kept because callers may read it.
        # NOTE(review): presumably the id of the image placeholder token -- confirm.
        self.image_token_id = 128256

    @staticmethod
    def _build_messages(sample, system_prompt):
        """Build the system / user / assistant chat for one dataset record."""
        return [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {"role": "user", "content": [{"type": "text", "text": sample["question"]}]},
            {"role": "assistant", "content": [{"type": "text", "text": sample["context"]}]},
        ]

    def __call__(self, samples):
        """Collate `samples` (records with 'question' and 'context') into one batch.

        Returns:
            The processor output with an extra ``labels`` entry.
        """
        system_prompt = prompt if self.system_prompt is None else self.system_prompt

        # Use the processor given to the constructor, not a same-named global.
        texts = [
            self.processor.apply_chat_template(
                self._build_messages(sample, system_prompt),
                add_generation_prompt=False,
            ).strip()
            for sample in samples
        ]

        batch = self.processor(text=texts, return_tensors="pt", padding=True)

        labels = batch["input_ids"].clone()
        if "attention_mask" in batch:
            # Preferred: the attention mask marks exactly the padded positions.
            labels[batch["attention_mask"] == 0] = self.IGNORE_INDEX
        else:
            # Fallback: mask by pad token id when the tokenizer defines one.
            tokenizer = getattr(self.processor, "tokenizer", None)
            pad_token_id = getattr(tokenizer, "pad_token_id", None)
            if pad_token_id is not None:
                labels[labels == pad_token_id] = self.IGNORE_INDEX
        batch["labels"] = labels

        return batch

# Collator instance handed to the Trainer; it batches samples with the processor loaded above.
data_collator = MyDataCollator(processor=processor)

### Set up training parameters

In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # --- output / Hub ---
    output_dir = output_dir,
    overwrite_output_dir = False,
    push_to_hub = True,
    # Push to the configured destination repository explicitly instead of letting the
    # repository name be derived from the output folder name.
    hub_model_id = destination_model_id,
    # --- optimisation ---
    auto_find_batch_size = True,
    learning_rate = 2e-4,
    fp16 = True,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 8,
    optim = "paged_adamw_8bit",
    dataloader_pin_memory = False,
    # --- evaluation / checkpointing ---
    save_total_limit = 3,
    eval_strategy = "steps",
    save_strategy = "steps",
    eval_steps = 100,
    save_steps = 10, # checkpoint each 10 steps
    # NOTE: this value is only stored on the arguments; per the Transformers documentation
    # the Trainer does not act on it. Resuming requires passing a checkpoint to trainer.train().
    resume_from_checkpoint = True,
    load_best_model_at_end = False,
    # --- logging / data handling ---
    logging_steps = 5,
    report_to = "none",
    # The collator reads the raw 'question' / 'context' columns, so they must be kept.
    remove_unused_columns = False,
    label_names = ["labels"],
    # Smoke-test limit: with 10 steps the periodic evaluation (every 100 steps) never runs.
    max_steps = 10, # remove this for training
)

In [10]:
# Bundle the model, the hyper-parameters, the two dataset splits and the collator
# into the Trainer that drives the fine-tuning loop.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

max_steps is given, it will override any value given in num_train_epochs


### Start (or restart) Training

In [None]:
# Start training, or restart it from the newest checkpoint found in the output directory.
# A bare trainer.train() always begins at step 0, even when checkpoints exist, while
# passing resume_from_checkpoint=True fails on the very first run (no checkpoint yet),
# so the checkpoint is looked up explicitly and None means "start from scratch".
import os
from transformers.trainer_utils import get_last_checkpoint

checkpoint_root = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
trainer.train(resume_from_checkpoint=last_checkpoint)