from tokenizers import Tokenizer
from tokenizers.models import BPE
from transformers import PreTrainedTokenizerFast


class gLM2Tokenizer(PreTrainedTokenizerFast):
    """Character-level tokenizer for gLM2 sequences.

    The vocabulary holds single-letter amino-acid codes, lowercase
    nucleotides (a, t, c, g), strand tokens (<+>, <->), and the usual
    special tokens. No BPE merges are defined, so inputs are split into
    individual characters plus the registered special tokens.
    """

    VOCAB = [
        "<cls>", "<pad>", "<eos>", "<unk>",
        "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K",
        "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z",
        "O", "a", "t", "c", "g", "<+>", "<->", "<mask>", "<sep>",
    ]

    def __init__(
        self,
        unk_token="<unk>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        eos_token="<eos>",
        sep_token="<sep>",
        pos_token="<+>",
        neg_token="<->",
        **kwargs,
    ):
        # Map each vocabulary entry to its index for the backend BPE model.
        all_tokens = self.VOCAB
        token_to_id = {tok: ind for ind, tok in enumerate(all_tokens)}

        # A BPE model with an empty merge list degenerates to character-level
        # tokenization over the fixed vocabulary.
        bpe = BPE(token_to_id, merges=[], unk_token=str(unk_token))
        tokenizer = Tokenizer(bpe)

        # Register special tokens so the backend tokenizer never splits them.
        special_tokens = [
            cls_token, pad_token, mask_token,
            eos_token, sep_token, pos_token, neg_token,
        ]
        tokenizer.add_special_tokens(special_tokens)

        super().__init__(
            tokenizer_object=tokenizer,
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            sep_token=sep_token,
            **kwargs,
        )
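

# Minimal usage sketch. The example string below is hypothetical and only
# illustrates the character/special-token vocabulary; it is not a canonical
# gLM2 prompt format.
if __name__ == "__main__":
    tokenizer = gLM2Tokenizer()
    # A strand token, a short amino-acid stretch, a separator, and nucleotides.
    encoding = tokenizer("<+>MKTAYIA<sep>atgc")
    print(encoding["input_ids"])
    # Map the ids back to tokens to inspect the character-level split.
    print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))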