from transformers import LlamaTokenizer

# Load the LLaMA tokenizer extended with Traditional Chinese (zh-TW) tokens.
tokenizer = LlamaTokenizer.from_pretrained(
    'ocisd4/llama_tokenizer_ext_zhtw',
    pad_token='<unk>',    # LLaMA ships no dedicated pad token, so reuse <unk>
    add_bos_token=True,   # prepend <s> to every encoded sequence
    add_eos_token=False   # do not append </s>
)
# vocab size: 36128 (base LLaMA has 32000)
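
Because pad_token is set, batch encoding with padding works directly. A minimal sketch; the second string '你好' ("hello") is just an assumed extra input:

batch = tokenizer(['今天天氣真好!', '你好'], padding=True)
print(batch['input_ids'])       # the shorter sequence is padded with the <unk> id (0)
print(batch['attention_mask'])  # 0 marks the padding positions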
# Tokenize "今天天氣真好!" ("The weather is really nice today!")
print(tokenizer.tokenize('今天天氣真好!'))
# ['▁', '今', '天', '天', '氣', '真', '好', '!']

print(tokenizer.encode('今天天氣真好!'))
# [1, 29871, 31482, 30408, 30408, 32045, 30848, 31076, 30584]
# id 1 is <s> (added by add_bos_token=True); ids at or above 32000,
# like 32045 for '氣', lie beyond the base LLaMA vocabulary

print(tokenizer.decode(tokenizer.encode('今天天氣真好!')))
# <s>今天天氣真好!
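
The extra 4128 vocabulary entries mean common Traditional Chinese characters that the base tokenizer would split into byte-fallback pieces map to single ids here. A rough comparison sketch, reusing the tokenizer loaded above and assuming 'huggyllama/llama-7b' as a stand-in for the base checkpoint (swap in whichever base LLaMA tokenizer you actually use):

base = LlamaTokenizer.from_pretrained('huggyllama/llama-7b')  # assumed base checkpoint
text = '今天天氣真好!'
print(len(tokenizer.encode(text)), len(base.encode(text)))
# the extended tokenizer should need the same number of ids or fewer, since
# characters like '氣' are single tokens instead of several byte-fallback pieces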