import logging import os import random from typing import Any import numpy as np import pandas as pd from pytorch_lightning import Trainer, LightningModule, LightningDataModule from pytorch_lightning.utilities.types import OptimizerLRScheduler, STEP_OUTPUT, EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.nn.utils.rnn import pad_sequence from torch.utils.data import DataLoader, Dataset from torchmetrics.classification import BinaryAccuracy, BinaryAUROC, BinaryF1Score, BinaryPrecision, BinaryRecall from transformers import BertModel, BatchEncoding, BertTokenizer, TrainingArguments, AutoModelForSequenceClassification from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions import torch from torch import nn from datasets import load_dataset, IterableDataset from huggingface_hub import PyTorchModelHubMixin from dotenv import load_dotenv from huggingface_hub import login timber = logging.getLogger() # logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.INFO) # change to level=logging.DEBUG to print more logs... black = "\u001b[30m" red = "\u001b[31m" green = "\u001b[32m" yellow = "\u001b[33m" blue = "\u001b[34m" magenta = "\u001b[35m" cyan = "\u001b[36m" white = "\u001b[37m" FORWARD = "FORWARD_INPUT" BACKWARD = "BACKWARD_INPUT" DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") PRETRAINED_MODEL_NAME: str = "LongSafari/hyenadna-small-32k-seqlen-hf" def login_inside_huggingface_virtualmachine(): # Load the .env file, but don't crash if it's not found (e.g., in Hugging Face Space) try: load_dotenv() # Only useful on your laptop if .env exists print(".env file loaded successfully.") except Exception as e: print(f"Warning: Could not load .env file. Exception: {e}") # Try to get the token from environment variables try: token = os.getenv("HF_TOKEN") if not token: raise ValueError("HF_TOKEN not found. Make sure to set it in the environment variables or .env file.") # Log in to Hugging Face Hub login(token) print("Logged in to Hugging Face Hub successfully.") except Exception as e: print(f"Error during Hugging Face login: {e}") # Handle the error appropriately (e.g., exit or retry) def one_hot_e(dna_seq: str) -> np.ndarray: mydict = {'A': np.asarray([1.0, 0.0, 0.0, 0.0]), 'C': np.asarray([0.0, 1.0, 0.0, 0.0]), 'G': np.asarray([0.0, 0.0, 1.0, 0.0]), 'T': np.asarray([0.0, 0.0, 0.0, 1.0]), 'N': np.asarray([0.0, 0.0, 0.0, 0.0]), 'H': np.asarray([0.0, 0.0, 0.0, 0.0]), 'a': np.asarray([1.0, 0.0, 0.0, 0.0]), 'c': np.asarray([0.0, 1.0, 0.0, 0.0]), 'g': np.asarray([0.0, 0.0, 1.0, 0.0]), 't': np.asarray([0.0, 0.0, 0.0, 1.0]), 'n': np.asarray([0.0, 0.0, 0.0, 0.0]), '-': np.asarray([0.0, 0.0, 0.0, 0.0])} size_of_a_seq: int = len(dna_seq) # forward = np.zeros(shape=(size_of_a_seq, 4)) forward_list: list = [mydict[dna_seq[i]] for i in range(0, size_of_a_seq)] encoded = np.asarray(forward_list) encoded_transposed = encoded.transpose() # todo: Needs review return encoded_transposed def one_hot_e_column(column: pd.Series) -> np.ndarray: tmp_list: list = [one_hot_e(seq) for seq in column] encoded_column = np.asarray(tmp_list).astype(np.float32) return encoded_column def reverse_dna_seq(dna_seq: str) -> str: # m_reversed = "" # for i in range(0, len(dna_seq)): # m_reversed = dna_seq[i] + m_reversed # return m_reversed return dna_seq[::-1] def complement_dna_seq(dna_seq: str) -> str: comp_map = {"A": "T", "C": "G", "T": "A", "G": "C", "a": "t", "c": "g", "t": "a", "g": "c", "N": "N", "H": "H", "-": "-", "n": "n", "h": "h" } comp_dna_seq_list: list = [comp_map[nucleotide] for nucleotide in dna_seq] comp_dna_seq: str = "".join(comp_dna_seq_list) return comp_dna_seq def reverse_complement_dna_seq(dna_seq: str) -> str: return reverse_dna_seq(complement_dna_seq(dna_seq)) def reverse_complement_column(column: pd.Series) -> np.ndarray: rc_column: list = [reverse_complement_dna_seq(seq) for seq in column] return rc_column class TorchMetrics: def __init__(self, device=DEVICE): self.binary_accuracy = BinaryAccuracy().to(device) self.binary_auc = BinaryAUROC().to(device) self.binary_f1_score = BinaryF1Score().to(device) self.binary_precision = BinaryPrecision().to(device) self.binary_recall = BinaryRecall().to(device) pass def update_on_each_step(self, batch_predicted_labels, batch_actual_labels): # todo: Add log if needed self.binary_accuracy.update(preds=batch_predicted_labels, target=batch_actual_labels) self.binary_auc.update(preds=batch_predicted_labels, target=batch_actual_labels) self.binary_f1_score.update(preds=batch_predicted_labels, target=batch_actual_labels) self.binary_precision.update(preds=batch_predicted_labels, target=batch_actual_labels) self.binary_recall.update(preds=batch_predicted_labels, target=batch_actual_labels) pass def compute_and_reset_on_epoch_end(self, log, log_prefix: str, log_color: str = green): b_accuracy = self.binary_accuracy.compute() b_auc = self.binary_auc.compute() b_f1_score = self.binary_f1_score.compute() b_precision = self.binary_precision.compute() b_recall = self.binary_recall.compute() timber.info( log_color + f"{log_prefix}_acc = {b_accuracy}, {log_prefix}_auc = {b_auc}, {log_prefix}_f1_score = {b_f1_score}, {log_prefix}_precision = {b_precision}, {log_prefix}_recall = {b_recall}") log(f"{log_prefix}_accuracy", b_accuracy) log(f"{log_prefix}_auc", b_auc) log(f"{log_prefix}_f1_score", b_f1_score) log(f"{log_prefix}_precision", b_precision) log(f"{log_prefix}_recall", b_recall) self.binary_accuracy.reset() self.binary_auc.reset() self.binary_f1_score.reset() self.binary_precision.reset() self.binary_recall.reset() pass def insert_debug_motif_at_random_position(seq, DEBUG_MOTIF): start = 0 end = len(seq) rand_pos = random.randrange(start, (end - len(DEBUG_MOTIF))) random_end = rand_pos + len(DEBUG_MOTIF) output = seq[start: rand_pos] + DEBUG_MOTIF + seq[random_end: end] assert len(seq) == len(output) return output class PagingMQTLDataset(IterableDataset): def __init__(self, m_dataset, seq_len, tokenizer, max_length=512, check_if_pipeline_is_ok_by_inserting_debug_motif=False): self.dataset = m_dataset self.check_if_pipeline_is_ok_by_inserting_debug_motif = check_if_pipeline_is_ok_by_inserting_debug_motif self.debug_motif = "ATCGCCTA" self.seq_len = seq_len self.bert_tokenizer = tokenizer self.max_length = max_length pass def __iter__(self): for row in self.dataset: processed = self.preprocess(row) if processed is not None: yield processed def preprocess(self, row): sequence = row['sequence'] # Fetch the 'sequence' column if len(sequence) != self.seq_len: return None # skip problematic row! label = row['label'] # Fetch the 'label' column (or whatever target you use) if label == 1 and self.check_if_pipeline_is_ok_by_inserting_debug_motif: sequence = insert_debug_motif_at_random_position(seq=sequence, DEBUG_MOTIF=self.debug_motif) ohe_sequence = one_hot_e(dna_seq=sequence) one_seq_tensor = torch.from_numpy(ohe_sequence).to(torch.int64) # Tokenize the sequence encoded_sequence_tokenized: BatchEncoding = self.bert_tokenizer(one_seq_tensor) input_ids = encoded_sequence_tokenized["input_ids"] # encoded_sequence_squeezed = {key: val.squeeze() for key, val in encoded_sequence.items()} return input_ids, label # def collate_fn(batch): # sequences, labels = zip(*batch) # ohe_seq, ohe_seq_rc = sequences[0], sequences[1] # # Pad sequences to the maximum length in this batch # padded_sequences = pad_sequence(ohe_seq, batch_first=True, padding_value=0) # padded_sequences_rc = pad_sequence(ohe_seq_rc, batch_first=True, padding_value=0) # # Convert labels to a tensor # labels = torch.stack(labels) # return [padded_sequences, padded_sequences_rc], labels class MqtlDataModule(LightningDataModule): def __init__(self, train_ds, val_ds, test_ds, batch_size=16): super().__init__() self.batch_size = batch_size self.train_loader = DataLoader(train_ds, batch_size=self.batch_size, shuffle=False, # collate_fn=collate_fn, num_workers=1, # persistent_workers=True ) self.validate_loader = DataLoader(val_ds, batch_size=self.batch_size, shuffle=False, # collate_fn=collate_fn, num_workers=1, # persistent_workers=True ) self.test_loader = DataLoader(test_ds, batch_size=self.batch_size, shuffle=False, # collate_fn=collate_fn, num_workers=1, # persistent_workers=True ) pass def prepare_data(self): pass def setup(self, stage: str) -> None: timber.info(f"inside setup: {stage = }") pass def train_dataloader(self) -> TRAIN_DATALOADERS: return self.train_loader def val_dataloader(self) -> EVAL_DATALOADERS: return self.validate_loader def test_dataloader(self) -> EVAL_DATALOADERS: return self.test_loader class MQtlBertClassifierLightningModule(LightningModule): def __init__(self, classifier: nn.Module, criterion=None, # nn.BCEWithLogitsLoss(), regularization: int = 2, # 1 == L1, 2 == L2, 3 (== 1 | 2) == both l1 and l2, else ignore / don't care l1_lambda=0.001, l2_wright_decay=0.001, *args: Any, **kwargs: Any): super().__init__(*args, **kwargs) self.classifier = classifier self.criterion = criterion self.train_metrics = TorchMetrics() self.validate_metrics = TorchMetrics() self.test_metrics = TorchMetrics() self.regularization = regularization self.l1_lambda = l1_lambda self.l2_weight_decay = l2_wright_decay pass def forward(self, x, *args: Any, **kwargs: Any) -> Any: input_ids: torch.tensor = x["input_ids"] return self.classifier.forward(input_ids) def configure_optimizers(self) -> OptimizerLRScheduler: # Here we add weight decay (L2 regularization) to the optimizer weight_decay = 0.0 if self.regularization == 2 or self.regularization == 3: weight_decay = self.l2_weight_decay return torch.optim.Adam(self.parameters(), lr=1e-3, weight_decay=weight_decay) # , weight_decay=0.005) def training_step(self, batch, batch_idx, *args: Any, **kwargs: Any) -> STEP_OUTPUT: # Accuracy on training batch data x, y = batch preds = self.forward(x) loss = self.criterion(preds, y) if self.regularization == 1 or self.regularization == 3: # apply l1 regularization l1_norm = sum(p.abs().sum() for p in self.parameters()) loss += self.l1_lambda * l1_norm self.log("train_loss", loss) # calculate the scores start self.train_metrics.update_on_each_step(batch_predicted_labels=preds.squeeze(), batch_actual_labels=y) # calculate the scores end return loss def on_train_epoch_end(self) -> None: self.train_metrics.compute_and_reset_on_epoch_end(log=self.log, log_prefix="train") pass def validation_step(self, batch, batch_idx, *args: Any, **kwargs: Any) -> STEP_OUTPUT: # Accuracy on validation batch data # print(f"debug { batch = }") x, y = batch preds = self.forward(x) loss = self.criterion(preds, y) self.log("valid_loss", loss) # calculate the scores start self.validate_metrics.update_on_each_step(batch_predicted_labels=preds.squeeze(), batch_actual_labels=y) # calculate the scores end return loss def on_validation_epoch_end(self) -> None: self.validate_metrics.compute_and_reset_on_epoch_end(log=self.log, log_prefix="validate", log_color=blue) return None def test_step(self, batch, batch_idx, *args: Any, **kwargs: Any) -> STEP_OUTPUT: # Accuracy on validation batch data x, y = batch preds = self.forward(x) loss = self.criterion(preds, y) self.log("test_loss", loss) # do we need this? # calculate the scores start self.test_metrics.update_on_each_step(batch_predicted_labels=preds.squeeze(), batch_actual_labels=y) # calculate the scores end return loss def on_test_epoch_end(self) -> None: self.test_metrics.compute_and_reset_on_epoch_end(log=self.log, log_prefix="test", log_color=magenta) return None pass def start_bert(classifier_model, criterion, m_optimizer=torch.optim.Adam, WINDOW=200, is_binned=True, is_debug=False, max_epochs=10, batch_size=8): file_suffix = "" if is_binned: file_suffix = "_binned" data_files = { # small samples "train_binned_200": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_200_train_binned.csv", "validate_binned_200": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_200_validate_binned.csv", "test_binned_200": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_200_test_binned.csv", # medium samples "train_binned_1000": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_1000_train_binned.csv", "validate_binned_1000": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_1000_validate_binned.csv", "test_binned_1000": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_1000_test_binned.csv", # large samples "train_binned_4000": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv", "validate_binned_4000": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_validate_binned.csv", "test_binned_4000": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_test_binned.csv", } dataset_map = None is_my_laptop = os.path.isfile("/src/inputdata/dataset_4000_test_binned.csv") if is_my_laptop: dataset_map = load_dataset("csv", data_files=data_files, streaming=True) else: dataset_map = load_dataset("fahimfarhan/mqtl-classification-datasets", streaming=True) tokenizer = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=PRETRAINED_MODEL_NAME, trust_remote_code=True) train_dataset = PagingMQTLDataset(dataset_map[f"train_binned_{WINDOW}"], check_if_pipeline_is_ok_by_inserting_debug_motif=is_debug, tokenizer=tokenizer, seq_len=WINDOW ) val_dataset = PagingMQTLDataset(dataset_map[f"validate_binned_{WINDOW}"], check_if_pipeline_is_ok_by_inserting_debug_motif=is_debug, tokenizer=tokenizer, seq_len=WINDOW) test_dataset = PagingMQTLDataset(dataset_map[f"test_binned_{WINDOW}"], check_if_pipeline_is_ok_by_inserting_debug_motif=is_debug, tokenizer=tokenizer, seq_len=WINDOW) data_module = MqtlDataModule(train_ds=train_dataset, val_ds=val_dataset, test_ds=test_dataset, batch_size=batch_size) classifier_model = classifier_model #.to(DEVICE) try: classifier_model = classifier_model.from_pretrained(classifier_model.model_repository_name) except Exception as x: print(x) # classifier_module = MQtlBertClassifierLightningModule( # classifier=classifier_model, # regularization=2, criterion=criterion) # if os.path.exists(model_save_path): # classifier_module.load_state_dict(torch.load(model_save_path)) args = { "output_dir": "tmp", "num_train_epochs": 1, "per_device_train_batch_size": 1, "gradient_accumulation_steps": 4, "gradient_checkpointing": True, "learning_rate": 2e-5, } training_args = TrainingArguments(**args) trainer = Trainer(model=classifier_model, args=training_args, datamodule=data_module, max_epochs=max_epochs, precision="32") trainer.fit(model=classifier_model) timber.info("\n\n") trainer.test(model=classifier_model) timber.info("\n\n") # torch.save(classifier_module.state_dict(), model_save_path) # deprecated, use classifier_model.save_pretrained(model_subdirectory) instead # save locally model_subdirectory = classifier_model.model_repository_name classifier_model.save_pretrained(model_subdirectory) # push to the hub commit_message = f":tada: Push model for window size {WINDOW} from huggingface space" if is_my_laptop: commit_message = f":tada: Push model for window size {WINDOW} from zephyrus" classifier_model.push_to_hub( repo_id=f"fahimfarhan/{classifier_model.model_repository_name}", # subfolder=f"my-awesome-model-{WINDOW}", subfolder didn't work :/ commit_message=commit_message # f":tada: Push model for window size {WINDOW}" ) # reload # classifier_model = classifier_model.from_pretrained(f"fahimfarhan/{classifier_model.model_repository_name}") # classifier_model = classifier_model.from_pretrained(model_subdirectory) pass class CommonAttentionLayer(nn.Module): def __init__(self, hidden_size, *args, **kwargs): super().__init__(*args, **kwargs) self.attention_linear = nn.Linear(hidden_size, 1) pass def forward(self, hidden_states): # Apply linear layer attn_weights = self.attention_linear(hidden_states) # Apply softmax to get attention scores attn_weights = torch.softmax(attn_weights, dim=1) # Apply attention weights to hidden states context_vector = torch.sum(attn_weights * hidden_states, dim=1) return context_vector, attn_weights class ReshapedBCEWithLogitsLoss(nn.BCEWithLogitsLoss): def forward(self, input, target): return super().forward(input.squeeze(), target.float()) class HyenaDnaMQTLClassifier(nn.Module): def __init__(self, seq_len: int, model_repository_name: str, bert_model=BertModel.from_pretrained(pretrained_model_name_or_path=PRETRAINED_MODEL_NAME), hidden_size=768, num_classes=1, *args, **kwargs ): super().__init__(*args, **kwargs) self.seq_len = seq_len self.model_repository_name = model_repository_name self.model_name = "MQtlDnaBERT6Classifier" self.bert_model = bert_model self.attention = CommonAttentionLayer(hidden_size) self.classifier = nn.Linear(hidden_size, num_classes) pass def forward(self, input_ids: torch.tensor): """ # torch.Size([128, 1, 512]) --> [128, 512] input_ids = input_ids.squeeze(dim=1).to(DEVICE) # torch.Size([16, 1, 512]) --> [16, 512] attention_mask = attention_mask.squeeze(dim=1).to(DEVICE) token_type_ids = token_type_ids.squeeze(dim=1).to(DEVICE) """ bert_output: BaseModelOutputWithPoolingAndCrossAttentions = self.bert_model(input_ids=input_ids) last_hidden_state = bert_output.last_hidden_state context_vector, ignore_attention_weight = self.attention(last_hidden_state) y = self.classifier(context_vector) return y if __name__ == '__main__': login_inside_huggingface_virtualmachine() WINDOW = 1000 some_model = BertModel.from_pretrained( pretrained_model_name_or_path=PRETRAINED_MODEL_NAME) # HyenaDnaMQTLClassifier(seq_len=WINDOW, model_repository_name="hyenadna-sm-32k-mqtl-classifier") criterion = None start_bert( classifier_model=some_model, criterion=criterion, WINDOW=WINDOW, is_debug=False, max_epochs=20, batch_size=16 ) pass