Spaces:
Runtime error
Runtime error
Soumic
committed on
Commit
•
df908e2
1
Parent(s):
7bfa8a1
:hammer_and_pick: Looks ok
Browse files
app.py
CHANGED
@@ -2,12 +2,15 @@ import os
|
|
2 |
import random
|
3 |
|
4 |
import huggingface_hub
|
|
|
5 |
from datasets import load_dataset, Dataset
|
6 |
from dotenv import load_dotenv
|
7 |
from pytorch_lightning import LightningDataModule
|
8 |
from pytorch_lightning.utilities.types import TRAIN_DATALOADERS, EVAL_DATALOADERS
|
9 |
from torch.utils.data import DataLoader, IterableDataset
|
10 |
-
from
|
|
|
|
|
11 |
from transformers import TrainingArguments, Trainer
|
12 |
import torch
|
13 |
import logging
|
@@ -200,12 +203,41 @@ def login_inside_huggingface_virtualmachine():
|
|
200 |
pass
|
201 |
|
202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
def start():
|
204 |
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
205 |
|
206 |
login_inside_huggingface_virtualmachine()
|
207 |
WINDOW = 4000
|
208 |
batch_size = 100
|
|
|
|
|
|
|
|
|
|
|
209 |
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, trust_remote_code=True)
|
210 |
classifier_model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,
|
211 |
torch_dtype=torch.bfloat16,
|
@@ -256,6 +288,7 @@ def start():
|
|
256 |
args=training_args,
|
257 |
train_dataset=train_ds,
|
258 |
eval_dataset=val_ds,
|
|
|
259 |
)
|
260 |
# train, and validate
|
261 |
result = trainer.train()
|
@@ -274,11 +307,8 @@ def start():
|
|
274 |
finally:
|
275 |
# save the model
|
276 |
model_name = "HyenaDnaMQtlClassifier"
|
277 |
-
is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")
|
278 |
-
model_repository_name = f"fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-{WINDOW}"
|
279 |
|
280 |
-
|
281 |
-
classifier_model.save_pretrained(save_directory=model_subdirectory, safe_serialization=False)
|
282 |
|
283 |
# push to the hub
|
284 |
commit_message = f":tada: Push model for window size {WINDOW} from huggingface space"
|
@@ -286,7 +316,7 @@ def start():
|
|
286 |
commit_message = f":tada: Push model for window size {WINDOW} from zephyrus"
|
287 |
|
288 |
classifier_model.push_to_hub(
|
289 |
-
repo_id=
|
290 |
# subfolder=f"my-awesome-model-{WINDOW}", subfolder didn't work :/
|
291 |
commit_message=commit_message, # f":tada: Push model for window size {WINDOW}"
|
292 |
safe_serialization=False
|
@@ -294,6 +324,20 @@ def start():
|
|
294 |
pass
|
295 |
|
296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
297 |
if __name__ == '__main__':
|
298 |
start()
|
299 |
pass
|
|
|
2 |
import random
|
3 |
|
4 |
import huggingface_hub
|
5 |
+
import numpy as np
|
6 |
from datasets import load_dataset, Dataset
|
7 |
from dotenv import load_dotenv
|
8 |
from pytorch_lightning import LightningDataModule
|
9 |
from pytorch_lightning.utilities.types import TRAIN_DATALOADERS, EVAL_DATALOADERS
|
10 |
from torch.utils.data import DataLoader, IterableDataset
|
11 |
+
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
|
12 |
+
# from torchmetrics.classification import BinaryAccuracy, BinaryAUROC, BinaryF1Score, BinaryPrecision, BinaryRecall
|
13 |
+
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
|
14 |
from transformers import TrainingArguments, Trainer
|
15 |
import torch
|
16 |
import logging
|
|
|
203 |
pass
|
204 |
|
205 |
|
206 |
+
# Use sklearn because torchmetrics.classification raised an index-out-of-bounds error (an IndexError in Python).
|
207 |
+
def compute_metrics_using_sklearn(p):
    """Compute binary-classification metrics with scikit-learn.

    Intended to be passed as ``compute_metrics`` to a ``Trainer``.

    Args:
        p: A ``(predictions, labels)`` pair. Predictions are indexed as a
            2-D array with two columns (one score per class), as the
            original ``pred[:, 1]`` indexing assumes; a tuple of outputs is
            also accepted, in which case its first element is used.

    Returns:
        A dict with the keys ``"accuracy"``, ``"roc_auc"``, ``"precision"``,
        ``"recall"`` and ``"f1"``. If anything fails, the error is printed
        and every metric is reported as ``0`` so the caller keeps running.
    """
    try:
        pred, labels = p

        # Some models return several outputs as a tuple; take the first one,
        # which is conventionally the logits -- TODO confirm for this model.
        if isinstance(pred, tuple):
            pred = pred[0]
        scores = np.asarray(pred, dtype=np.float64)

        # Predicted class = column with the highest score.
        pred_labels = np.argmax(scores, axis=1)

        # Ranking score for the positive class. The raw class-1 output alone
        # ignores the class-0 output, but the positive-class probability
        # depends on their difference (softmax p1 = sigmoid(s1 - s0)).
        # The margin is monotonic in that probability, so ROC-AUC (which is
        # rank-based) is correct whether the outputs are logits or
        # already-normalised probabilities.
        positive_scores = scores[:, 1] - scores[:, 0]

        accuracy = accuracy_score(y_true=labels, y_pred=pred_labels)
        recall = recall_score(y_true=labels, y_pred=pred_labels)
        precision = precision_score(y_true=labels, y_pred=pred_labels)
        f1 = f1_score(y_true=labels, y_pred=pred_labels)
        roc_auc = roc_auc_score(y_true=labels, y_score=positive_scores)

        return {"accuracy": accuracy, "roc_auc": roc_auc, "precision": precision, "recall": recall, "f1": f1}

    except Exception as x:
        # Deliberate best-effort: a metrics failure must not abort training.
        print(f"compute_metrics_using_sklearn failed with exception: {x}")
        return {"accuracy": 0, "roc_auc": 0, "precision": 0, "recall": 0, "f1": 0}
|
228 |
+
|
229 |
+
|
230 |
def start():
|
231 |
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
232 |
|
233 |
login_inside_huggingface_virtualmachine()
|
234 |
WINDOW = 4000
|
235 |
batch_size = 100
|
236 |
+
model_local_directory = f"my-awesome-model-{WINDOW}"
|
237 |
+
model_remote_repository = f"fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-{WINDOW}"
|
238 |
+
|
239 |
+
is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")
|
240 |
+
|
241 |
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, trust_remote_code=True)
|
242 |
classifier_model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,
|
243 |
torch_dtype=torch.bfloat16,
|
|
|
288 |
args=training_args,
|
289 |
train_dataset=train_ds,
|
290 |
eval_dataset=val_ds,
|
291 |
+
compute_metrics=compute_metrics_using_sklearn # torch_metrics.compute_metrics
|
292 |
)
|
293 |
# train, and validate
|
294 |
result = trainer.train()
|
|
|
307 |
finally:
|
308 |
# save the model
|
309 |
model_name = "HyenaDnaMQtlClassifier"
|
|
|
|
|
310 |
|
311 |
+
classifier_model.save_pretrained(save_directory=model_local_directory, safe_serialization=False)
|
|
|
312 |
|
313 |
# push to the hub
|
314 |
commit_message = f":tada: Push model for window size {WINDOW} from huggingface space"
|
|
|
316 |
commit_message = f":tada: Push model for window size {WINDOW} from zephyrus"
|
317 |
|
318 |
classifier_model.push_to_hub(
|
319 |
+
repo_id=model_remote_repository,
|
320 |
# subfolder=f"my-awesome-model-{WINDOW}", subfolder didn't work :/
|
321 |
commit_message=commit_message, # f":tada: Push model for window size {WINDOW}"
|
322 |
safe_serialization=False
|
|
|
324 |
pass
|
325 |
|
326 |
|
327 |
+
def interprete_demo():
    """Load the published classifier from the Hub as a first step towards interpretation.

    Work in progress: the model is only loaded; the interpretation step
    itself is still a todo. Any exception raised while loading is printed
    and swallowed. Returns ``None``.
    """
    # Settings for the demo; several are placeholders that are not used
    # until the interpretation step is written.
    WINDOW = 4000
    batch_size = 100
    is_my_laptop = True
    model_remote_repository = f"fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-{WINDOW}"
    model_local_directory = f"my-awesome-model-{WINDOW}"

    try:
        loaded_model = AutoModel.from_pretrained(model_remote_repository)
    except Exception as error:
        print(error)
        return

    # todo: use captum / gentech-grelu to interpret the model
|
339 |
+
|
340 |
+
|
341 |
if __name__ == '__main__':
    # Script entry point: run the pipeline defined in start().
    start()
|