calbors committed on
Commit
c3ae6b4
·
verified ·
1 Parent(s): 8f0220d

Upload tokenizer

Browse files
tokenization_phylogpn.py CHANGED
@@ -9,7 +9,7 @@ class PhyloGPNTokenizer(PreTrainedTokenizer):
9
  self._vocab = {k: v for v, k in enumerate("ACGTN-")}
10
 
11
  add_prefix_space = kwargs.pop("add_prefix_space", False)
12
- padding_side = kwargs.pop("padding_side", "left")
13
  super().__init__(
14
  model_max_length=model_max_length,
15
  unk_token=unk_token,
@@ -24,8 +24,6 @@ class PhyloGPNTokenizer(PreTrainedTokenizer):
24
  **kwargs,
25
  )
26
 
27
- self._receptive_field_size = 1
28
-
29
  def _tokenize(self, seq: str) -> List[str]:
30
  assert len(seq) >= 481, "Input must be at least 481 bp long"
31
  return list(seq)
 
9
  self._vocab = {k: v for v, k in enumerate("ACGTN-")}
10
 
11
  add_prefix_space = kwargs.pop("add_prefix_space", False)
12
+ padding_side = kwargs.pop("padding_side", "right")
13
  super().__init__(
14
  model_max_length=model_max_length,
15
  unk_token=unk_token,
 
24
  **kwargs,
25
  )
26
 
 
 
27
  def _tokenize(self, seq: str) -> List[str]:
28
  assert len(seq) >= 481, "Input must be at least 481 bp long"
29
  return list(seq)
tokenizer_config.json CHANGED
@@ -32,7 +32,7 @@
32
  "mask_token": null,
33
  "model_max_length": 1000000000000000019884624838656,
34
  "pad_token": "-",
35
- "padding_side": "left",
36
  "sep_token": null,
37
  "tokenizer_class": "PhyloGPNTokenizer",
38
  "unk_token": "N"
 
32
  "mask_token": null,
33
  "model_max_length": 1000000000000000019884624838656,
34
  "pad_token": "-",
35
+ "padding_side": "right",
36
  "sep_token": null,
37
  "tokenizer_class": "PhyloGPNTokenizer",
38
  "unk_token": "N"