calbors committed on
Commit
8f0220d
·
verified ·
1 Parent(s): a723648

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenization_phylogpn.py +4 -1
tokenization_phylogpn.py CHANGED
@@ -1,5 +1,5 @@
1
- from transformers import PreTrainedTokenizer
2
  from typing import List, Dict, Optional, Tuple
 
3
 
4
  class PhyloGPNTokenizer(PreTrainedTokenizer):
5
  model_input_names = ["input_ids"]
@@ -24,7 +24,10 @@ class PhyloGPNTokenizer(PreTrainedTokenizer):
24
  **kwargs,
25
  )
26
 
 
 
27
  def _tokenize(self, seq: str) -> List[str]:
 
28
  return list(seq)
29
 
30
  def _convert_token_to_id(self, token: str) -> int:
 
 
1
  from typing import List, Dict, Optional, Tuple
2
+ from transformers import PreTrainedTokenizer
3
 
4
  class PhyloGPNTokenizer(PreTrainedTokenizer):
5
  model_input_names = ["input_ids"]
 
24
  **kwargs,
25
  )
26
 
27
+ self._receptive_field_size = 1
28
+
29
  def _tokenize(self, seq: str) -> List[str]:
30
+ assert len(seq) >= 481, "Input must be at least 481 bp long"
31
  return list(seq)
32
 
33
  def _convert_token_to_id(self, token: str) -> int: