YAML Metadata Warning: the repo card has empty or missing YAML metadata (see https://huggingface.co/docs/hub/model-cards#model-card-metadata).
from transformers import LlamaTokenizer
# Usage example: load the tokenizer from the Hub and show how one sample
# sentence is split into pieces, mapped to ids, and decoded back to text.
#
# The loader is configured with add_bos_token=False and add_eos_token=True;
# the sample output recorded below ends with id 2, which decodes to '</s>'.
# NOTE(review): recent transformers releases reportedly deprecate
# `use_auth_token` in favour of `token` -- confirm against the installed
# version before changing it.
load_options = {
    'add_bos_token': False,
    'add_eos_token': True,
    'force_download': False,
    'use_auth_token': True,
    # Optional extra special tokens, currently left disabled:
    # 'additional_special_tokens': ['<|spcout|>', '<|sep|>', '<|eot|>', '<|output|>'],
}
tokenizer = LlamaTokenizer.from_pretrained(
    'ocisd4/openllama_tokenizer_v2',
    **load_options,
)

print('vocab size:', tokenizer.vocab_size)
# Expected output -> vocab size: 51456

text = '今天天氣真好!'

# Sub-word pieces produced for the sample sentence.
pieces = tokenizer.tokenize(text)
print(pieces)
# Expected output -> ['▁', '今天', '天氣', '真', '好', '!']

# Integer ids for the same sentence.
token_ids = tokenizer.encode(text)
print(token_ids)
# Expected output -> [29500, 32097, 32916, 30615, 30192, 30042, 2]

# Round trip: encode again, then decode the ids back into a string.
round_trip = tokenizer.decode(tokenizer.encode(text))
print(round_trip)
# Expected output -> 今天天氣真好!</s>