calbors committed · verified
Commit 4dae406 · 1 parent: 43f8d11

Upload tokenizer
Files changed (2):
  1. tokenization_phylogpn.py +50 -0
  2. tokenizer_config.json +6 -0
tokenization_phylogpn.py ADDED
@@ -0,0 +1,50 @@
+from transformers import PreTrainedTokenizer
+from typing import List, Dict, Optional, Tuple
+
+
+class PhyloGPNTokenizer(PreTrainedTokenizer):
+    model_input_names = ["input_ids"]
+
+    def __init__(self, model_max_length: Optional[int] = None, unk_token="N", pad_token="-", bos_token=None, eos_token=None, sep_token=None, cls_token=None, mask_token=None, **kwargs):
+        self.model_max_length = model_max_length
+        # Fixed character-level vocabulary: A=0, C=1, G=2, T=3, N=4, -=5
+        self._vocab = {k: v for v, k in enumerate("ACGTN-")}
+
+        add_prefix_space = kwargs.pop("add_prefix_space", False)
+        padding_side = kwargs.pop("padding_side", "left")
+        super().__init__(
+            model_max_length=model_max_length,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            add_prefix_space=add_prefix_space,
+            padding_side=padding_side,
+            **kwargs,
+        )
+
+    def _tokenize(self, seq: str) -> List[str]:
+        # Character-level tokenization: each base is its own token
+        return list(seq)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        # Characters outside the alphabet fall back to the unknown base "N"
+        return self._vocab.get(token, self._vocab["N"])
+
+    def _convert_id_to_token(self, idx: int) -> str:
+        # _vocab maps tokens to ids, not the reverse, so index the alphabet directly
+        return "ACGTN-"[idx]
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self._vocab)
+
+    def get_vocab(self) -> Dict[str, int]:
+        return self._vocab
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple:
+        # The vocabulary is defined in code, so there is nothing to write to disk
+        return ()
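As a quick check of the tokenizer's behavior: it maps each character of a DNA sequence to a fixed id (A=0, C=1, G=2, T=3, N=4, -=5), with anything outside the alphabet falling back to the unknown base "N". A minimal usage sketch, assuming tokenization_phylogpn.py has been saved locally on the import path (the sketch itself is not part of this commit):

    # Sketch only; assumes tokenization_phylogpn.py is importable locally.
    from tokenization_phylogpn import PhyloGPNTokenizer

    tokenizer = PhyloGPNTokenizer()

    print(tokenizer.tokenize("ACGT"))      # ['A', 'C', 'G', 'T']
    print(tokenizer("ACGT")["input_ids"])  # [0, 1, 2, 3]

    # "X" is not in the vocabulary, so it is mapped to the id of "N".
    print(tokenizer("ACGX")["input_ids"])  # [0, 1, 2, 4]
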
tokenizer_config.json CHANGED
@@ -18,6 +18,12 @@
       "special": true
     }
   },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_phylogpn.PhyloGPNTokenizer",
+      null
+    ]
+  },
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "cls_token": null,