Unable to load tokenizer

#5
by abhinavkulkarni - opened
import torch
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer

model_id = "togethercomputer/StripedHyena-Nous-7B"

# Config
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

# Tokenizer: try the name given in the config first, then fall back to the
# slow tokenizer loaded from the model repo itself
try:
    tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
except:
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)

I get the following error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[4], line 3
      2 try:
----> 3     tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
      4 except:

File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/configuration_utils.py:265, in PretrainedConfig.__getattribute__(self, key)
    264     key = super().__getattribute__("attribute_map")[key]
--> 265 return super().__getattribute__(key)

AttributeError: 'StripedHyenaConfig' object has no attribute 'tokenizer_name'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
Cell In[4], line 5
      3     tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
      4 except:
----> 5     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)

File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:787, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    783     if tokenizer_class is None:
    784         raise ValueError(
    785             f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
    786         )
--> 787     return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    789 # Otherwise we have to be creative.
    790 # if model is an encoder decoder, the encoder tokenizer class is used by default
    791 if isinstance(config, EncoderDecoderConfig):

File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2028, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, *init_inputs, **kwargs)
   2025     else:
   2026         logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2028 return cls._from_pretrained(
   2029     resolved_vocab_files,
   2030     pretrained_model_name_or_path,
   2031     init_configuration,
   2032     *init_inputs,
   2033     token=token,
   2034     cache_dir=cache_dir,
   2035     local_files_only=local_files_only,
   2036     _commit_hash=commit_hash,
   2037     _is_local=is_local,
   2038     **kwargs,
   2039 )

File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2260, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
   2258 # Instantiate the tokenizer.
   2259 try:
-> 2260     tokenizer = cls(*init_inputs, **init_kwargs)
   2261 except OSError:
   2262     raise OSError(
   2263         "Unable to load vocabulary from file. "
   2264         "Please check that the provided vocabulary is accessible and not corrupted."
   2265     )

File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/models/llama/tokenization_llama.py:178, in LlamaTokenizer.__init__(self, vocab_file, unk_token, bos_token, eos_token, pad_token, sp_model_kwargs, add_bos_token, add_eos_token, clean_up_tokenization_spaces, use_default_system_prompt, spaces_between_special_tokens, legacy, **kwargs)
    176 self.add_eos_token = add_eos_token
    177 self.use_default_system_prompt = use_default_system_prompt
--> 178 self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
    180 super().__init__(
    181     bos_token=bos_token,
    182     eos_token=eos_token,
   (...)
    192     **kwargs,
    193 )

File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/models/llama/tokenization_llama.py:203, in LlamaTokenizer.get_spm_processor(self, from_slow)
    201 tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    202 if self.legacy or from_slow:  # no dependency on protobuf
--> 203     tokenizer.Load(self.vocab_file)
    204     return tokenizer
    206 with open(self.vocab_file, "rb") as f:

File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/sentencepiece/__init__.py:905, in SentencePieceProcessor.Load(self, model_file, model_proto)
    903 if model_proto:
    904   return self.LoadFromSerializedProto(model_proto)
--> 905 return self.LoadFromFile(model_file)

File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/sentencepiece/__init__.py:310, in SentencePieceProcessor.LoadFromFile(self, arg)
    309 def LoadFromFile(self, arg):
--> 310     return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)

TypeError: not a string

Thanks for the model update!

When loading the config using AutoConfig, the attribute config.tokenizer_name does not exist, which raises the AttributeError above. The fallback then fails as well: use_fast=False selects the slow LlamaTokenizer, which needs a SentencePiece tokenizer.model file, and judging by the traceback no such file resolves, so vocab_file is None and SentencePieceProcessor.LoadFromFile raises TypeError: not a string.

Is there a reason you pass use_fast=False in the except block but not in the initial try block? The tokenizer loads as expected if you remove use_fast=False from the except block, where you load it from model_id.

Thanks @nicklikets, use_fast=True works.
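
For reference, a minimal working load based on this resolution; this is a sketch, and the dtype choice is an assumption rather than something stated in the thread:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "togethercomputer/StripedHyena-Nous-7B"

# use_fast=True is the default, so simply dropping use_fast=False loads the
# tokenizer from tokenizer.json and avoids the slow SentencePiece path that
# fails in the traceback above.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # assumption: pick a dtype your hardware supports
    trust_remote_code=True,
)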

abhinavkulkarni changed discussion status to closed
