File size: 2,657 Bytes
0b11a42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import os
from dataclasses import dataclass, field
from pickletools import int4
from typing import List
@dataclass
class GeneEmbeddModelConfig:
# input dim for the embedding and positional encoders
# as well as all k,q,v input/output dims
model_input: str = "seq-struct"
num_embed_hidden: int = 256
ff_hidden_dim: List = field(default_factory=lambda: [1200, 800])
feed_forward1_hidden: int = 1024
num_attention_project: int = 64
num_encoder_layers: int = 2
dropout: float = 0.3
n: int = 121
window:int = 4
relative_attns: List = field(default_factory=lambda: [int(360), int(360)])
num_attention_heads: int = 4
tokens_len: int = 0 #will be infered later
second_input_token_len:int = 0 # is infered in runtime
vocab_size: int = 0 # is infered in runtime
second_input_vocab_size: int = 0 # is infered in runtime
tokenizer: str = (
"overlap" # either overlap or no_overlap or overlap_multi_window
)
# how many extra window sizes other than deafault window
num_classes: int = 0 #will be infered in runtime
class_weights :List = field(default_factory=lambda: [])
tokens_mapping_dict: dict = None
#false input percentage
false_input_perc:float = 0.2
model_input: str = "seq-struct"
@dataclass
class GeneEmbeddTrainConfig:
dataset_path_train: str = "/data/hbdx_ldap_local/analysis/data/sncRNA/train.h5ad"
dataset_path_test: str = "/data/hbdx_ldap_local/analysis/data/sncRNA/test.h5ad"
labels_mapping_path:str = "/data/hbdx_ldap_local/analysis/data/sncRNA/labels_mapping_dict.pkl"
device: str = "cuda"
l2_weight_decay: float = 1e-5
batch_size: int = 64
batch_per_epoch:int = 0 #will be infered later
label_smoothing_sim:float = 0.0
label_smoothing_clf:float = 0.0
# learning rate
learning_rate: float = 1e-3 # final learning rate ie 'lr annealed to'
lr_warmup_start: float = 0.1 # start of lr before initial linear warmup section
lr_warmup_end: float = 1 # end of linear warmup section , annealing begin
# TODO: 122 is the number of train batches per epoch, should be infered and set
# warmup batch should be in the form epoch*(train batch per epoch)
warmup_epoch: int = 10 # how many batches linear warm up for
final_epoch: int = 20 # final batch of training when want learning rate
top_k: int = int(
0.05 * batch_size
) # if the corresponding rna/GE appears in the top k, the correctly classified
label_smoothing: float = 0.0
cross_val: bool = False
filter_seq_length:bool = True
train_epoch: int = 800
max_epochs:int = 1000 |