Fix "can't set attribute 'eos_token'" error when loading a saved tokenizer
#27
by
hiyouga
- opened
Reproduce
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
tok.save_pretrained("original")
# ('original/tokenizer_config.json', 'original/special_tokens_map.json', 'original/tokenizer.model', 'original/added_tokens.json')
tok = AutoTokenizer.from_pretrained("original", trust_remote_code=True)
Reloading the saved tokenizer raises the following error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/site-packages/transformers/models/auto/tokenization_auto.py", line 774, in from_pretrained
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
File "/site-packages/transformers/tokenization_utils_base.py", line 2028, in from_pretrained
return cls._from_pretrained(
File "/site-packages/transformers/tokenization_utils_base.py", line 2260, in _from_pretrained
tokenizer = cls(*init_inputs, **init_kwargs)
File "huggingface/modules/transformers_modules/original/tokenization_chatglm.py", line 108, in __init__
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
File "/site-packages/transformers/tokenization_utils.py", line 363, in __init__
super().__init__(**kwargs)
File "/site-packages/transformers/tokenization_utils_base.py", line 1602, in __init__
super().__init__(**kwargs)
File "/site-packages/transformers/tokenization_utils_base.py", line 861, in __init__
setattr(self, key, value)
AttributeError: can't set attribute 'eos_token'
After the fix
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
tok.save_pretrained("new")
# ('new/tokenizer_config.json', 'new/special_tokens_map.json', 'new/tokenizer.model', 'new/added_tokens.json')
tok = AutoTokenizer.from_pretrained("new", trust_remote_code=True)
The tokenizer now loads correctly and prints the following informational messages:
Setting eos_token is not supported, use the default one.
Setting pad_token is not supported, use the default one.
Setting unk_token is not supported, use the default one.
zRzRzRzRzRzRzR
changed pull request status to
merged