Upload tokenizer
Browse files- tokenization_phylogpn.py +4 -1
tokenization_phylogpn.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
from transformers import PreTrainedTokenizer
|
2 |
from typing import List, Dict, Optional, Tuple
|
|
|
3 |
|
4 |
class PhyloGPNTokenizer(PreTrainedTokenizer):
|
5 |
model_input_names = ["input_ids"]
|
@@ -24,7 +24,10 @@ class PhyloGPNTokenizer(PreTrainedTokenizer):
|
|
24 |
**kwargs,
|
25 |
)
|
26 |
|
|
|
|
|
27 |
def _tokenize(self, seq: str) -> List[str]:
|
|
|
28 |
return list(seq)
|
29 |
|
30 |
def _convert_token_to_id(self, token: str) -> int:
|
|
|
|
|
1 |
from typing import List, Dict, Optional, Tuple
|
2 |
+
from transformers import PreTrainedTokenizer
|
3 |
|
4 |
class PhyloGPNTokenizer(PreTrainedTokenizer):
|
5 |
model_input_names = ["input_ids"]
|
|
|
24 |
**kwargs,
|
25 |
)
|
26 |
|
27 |
+
self._receptive_field_size = 1
|
28 |
+
|
29 |
def _tokenize(self, seq: str) -> List[str]:
|
30 |
+
assert len(seq) >= 481, "Input must be at least 481 bp long"
|
31 |
return list(seq)
|
32 |
|
33 |
def _convert_token_to_id(self, token: str) -> int:
|