Soumic committed on
Commit
df908e2
1 Parent(s): 7bfa8a1

:hammer_and_pick: Looks ok

Browse files
Files changed (1) hide show
  1. app.py +50 -6
app.py CHANGED
@@ -2,12 +2,15 @@ import os
2
  import random
3
 
4
  import huggingface_hub
 
5
  from datasets import load_dataset, Dataset
6
  from dotenv import load_dotenv
7
  from pytorch_lightning import LightningDataModule
8
  from pytorch_lightning.utilities.types import TRAIN_DATALOADERS, EVAL_DATALOADERS
9
  from torch.utils.data import DataLoader, IterableDataset
10
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
 
11
  from transformers import TrainingArguments, Trainer
12
  import torch
13
  import logging
@@ -200,12 +203,41 @@ def login_inside_huggingface_virtualmachine():
200
  pass
201
 
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  def start():
204
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
205
 
206
  login_inside_huggingface_virtualmachine()
207
  WINDOW = 4000
208
  batch_size = 100
 
 
 
 
 
209
  tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, trust_remote_code=True)
210
  classifier_model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,
211
  torch_dtype=torch.bfloat16,
@@ -256,6 +288,7 @@ def start():
256
  args=training_args,
257
  train_dataset=train_ds,
258
  eval_dataset=val_ds,
 
259
  )
260
  # train, and validate
261
  result = trainer.train()
@@ -274,11 +307,8 @@ def start():
274
  finally:
275
  # save the model
276
  model_name = "HyenaDnaMQtlClassifier"
277
- is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")
278
- model_repository_name = f"fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-{WINDOW}"
279
 
280
- model_subdirectory = f"my-awesome-model-{WINDOW}"
281
- classifier_model.save_pretrained(save_directory=model_subdirectory, safe_serialization=False)
282
 
283
  # push to the hub
284
  commit_message = f":tada: Push model for window size {WINDOW} from huggingface space"
@@ -286,7 +316,7 @@ def start():
286
  commit_message = f":tada: Push model for window size {WINDOW} from zephyrus"
287
 
288
  classifier_model.push_to_hub(
289
- repo_id=model_repository_name,
290
  # subfolder=f"my-awesome-model-{WINDOW}", subfolder didn't work :/
291
  commit_message=commit_message, # f":tada: Push model for window size {WINDOW}"
292
  safe_serialization=False
@@ -294,6 +324,20 @@ def start():
294
  pass
295
 
296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  if __name__ == '__main__':
298
  start()
299
  pass
 
2
  import random
3
 
4
  import huggingface_hub
5
+ import numpy as np
6
  from datasets import load_dataset, Dataset
7
  from dotenv import load_dotenv
8
  from pytorch_lightning import LightningDataModule
9
  from pytorch_lightning.utilities.types import TRAIN_DATALOADERS, EVAL_DATALOADERS
10
  from torch.utils.data import DataLoader, IterableDataset
11
+ from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
12
+ # from torchmetrics.classification import BinaryAccuracy, BinaryAUROC, BinaryF1Score, BinaryPrecision, BinaryRecall
13
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
14
  from transformers import TrainingArguments, Trainer
15
  import torch
16
  import logging
 
203
  pass
204
 
205
 
206
+ # Use sklearn because torchmetrics.classification raised an index-out-of-bounds error (IndexError in Python).
207
def compute_metrics_using_sklearn(p):
    """Compute binary-classification metrics for the Hugging Face ``Trainer``.

    Args:
        p: A ``(predictions, labels)`` pair, as handed to ``compute_metrics``.
            ``predictions`` is indexed as a 2-D array with one column per
            class (two output classes are assumed); ``labels`` holds the true
            class ids.

    Returns:
        A dict with the keys ``accuracy``, ``roc_auc``, ``precision``,
        ``recall`` and ``f1``. Any metric that cannot be computed is reported
        as ``0``; the function never raises, so a metrics problem cannot abort
        a training run.
    """
    metric_names = ("accuracy", "roc_auc", "precision", "recall", "f1")

    try:
        pred, labels = p
        pred = np.asarray(pred)

        # Predicted class = column with the highest score.
        pred_labels = np.argmax(pred, axis=1)

        # Ranking score for the positive class. For two-class logits the
        # softmax probability of class 1 is monotone in (logit_1 - logit_0),
        # not in logit_1 alone, so rank by the difference. If the columns are
        # already probabilities this yields the same ROC-AUC as column 1.
        pos_scores = pred[:, 1].astype(np.float64) - pred[:, 0].astype(np.float64)
    except Exception as x:
        # Deliberate best-effort: malformed input yields all-zero metrics.
        print(f"compute_metrics_using_sklearn failed with exception: {x}")
        return dict.fromkeys(metric_names, 0)

    # Each metric is evaluated lazily and independently so that one failure
    # (e.g. ROC-AUC being undefined when only one class is present in the
    # labels) does not wipe out the metrics that can still be computed.
    metric_fns = {
        "accuracy": lambda: accuracy_score(y_true=labels, y_pred=pred_labels),
        "roc_auc": lambda: roc_auc_score(y_true=labels, y_score=pos_scores),
        "precision": lambda: precision_score(y_true=labels, y_pred=pred_labels),
        "recall": lambda: recall_score(y_true=labels, y_pred=pred_labels),
        "f1": lambda: f1_score(y_true=labels, y_pred=pred_labels),
    }

    metrics = {}
    for name, compute in metric_fns.items():
        try:
            metrics[name] = compute()
        except Exception as x:
            print(f"compute_metrics_using_sklearn: {name} failed with exception: {x}")
            metrics[name] = 0
    return metrics
228
+
229
+
230
  def start():
231
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
232
 
233
  login_inside_huggingface_virtualmachine()
234
  WINDOW = 4000
235
  batch_size = 100
236
+ model_local_directory = f"my-awesome-model-{WINDOW}"
237
+ model_remote_repository = f"fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-{WINDOW}"
238
+
239
+ is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")
240
+
241
  tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, trust_remote_code=True)
242
  classifier_model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,
243
  torch_dtype=torch.bfloat16,
 
288
  args=training_args,
289
  train_dataset=train_ds,
290
  eval_dataset=val_ds,
291
+ compute_metrics=compute_metrics_using_sklearn # torch_metrics.compute_metrics
292
  )
293
  # train, and validate
294
  result = trainer.train()
 
307
  finally:
308
  # save the model
309
  model_name = "HyenaDnaMQtlClassifier"
 
 
310
 
311
+ classifier_model.save_pretrained(save_directory=model_local_directory, safe_serialization=False)
 
312
 
313
  # push to the hub
314
  commit_message = f":tada: Push model for window size {WINDOW} from huggingface space"
 
316
  commit_message = f":tada: Push model for window size {WINDOW} from zephyrus"
317
 
318
  classifier_model.push_to_hub(
319
+ repo_id=model_remote_repository,
320
  # subfolder=f"my-awesome-model-{WINDOW}", subfolder didn't work :/
321
  commit_message=commit_message, # f":tada: Push model for window size {WINDOW}"
322
  safe_serialization=False
 
324
  pass
325
 
326
 
327
def interprete_demo():
    """Placeholder for model interpretation.

    Loads the trained checkpoint from the remote hub repository with
    ``AutoModel.from_pretrained`` and does nothing further with it yet. Any
    exception raised while loading is printed instead of propagated. Returns
    ``None``.
    """
    window = 4000
    # The next three values are set up for future use; nothing reads them yet.
    on_laptop = True
    samples_per_batch = 100
    local_dir = f"my-awesome-model-{window}"
    hub_repo = f"fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-{window}"

    try:
        loaded_model = AutoModel.from_pretrained(hub_repo)
    except Exception as err:
        print(err)
        return
    # TODO: use captum / gentech-grelu to interpret the loaded model
339
+
340
+
341
if __name__ == "__main__":
    # Entry point when executed as a script: run start().
    start()