Repo ID HFValidationError for RWModel(

#35
by patti-j - opened

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <module>:8 │
│ │
│ 5 #model = "tiiuae/falcon-7b-instruct" │
│ 6 model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True) │
│ 7 │
│ ❱ 8 tokenizer = AutoTokenizer.from_pretrained(model) │
│ 9 pipeline = transformers.pipeline( │
│ 10 │ "text-generation", │
│ 11 │ model=model, │
│ │
│ C:\Users\PattiJorgensen\AppData\Roaming\Python\Python311\site-packages\transformers\models\auto\ │
│ tokenization_auto.py:642 in from_pretrained │
│ │
│ 639 │ │ │ return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *input │
│ 640 │ │ │
│ 641 │ │ # Next, let's try to use the tokenizer_config file to get the tokenizer class. │
│ ❱ 642 │ │ tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs) │
│ 643 │ │ if "_commit_hash" in tokenizer_config: │
│ 644 │ │ │ kwargs["_commit_hash"] = tokenizer_config["_commit_hash"] │
│ 645 │ │ config_tokenizer_class = tokenizer_config.get("tokenizer_class") │
│ │
│ C:\Users\PattiJorgensen\AppData\Roaming\Python\Python311\site-packages\transformers\models\auto\ │
│ tokenization_auto.py:486 in get_tokenizer_config │
│ │
│ 483 │ tokenizer_config = get_tokenizer_config("tokenizer-test") │
│ 484 │ ```""" │
│ 485 │ commit_hash = kwargs.get("commit_hash", None) │
│ ❱ 486 │ resolved_config_file = cached_file( │
│ 487 │ │ pretrained_model_name_or_path, │
│ 488 │ │ TOKENIZER_CONFIG_FILE, │
│ 489 │ │ cache_dir=cache_dir, │
│ │
│ C:\Users\PattiJorgensen\AppData\Roaming\Python\Python311\site-packages\transformers\utils\hub.py │
│ :409 in cached_file │
│ │
│ 406 │ user_agent = http_user_agent(user_agent) │
│ 407 │ try: │
│ 408 │ │ # Load from URL or cache if already cached │
│ ❱ 409 │ │ resolved_file = hf_hub_download( │
│ 410 │ │ │ path_or_repo_id, │
│ 411 │ │ │ filename, │
│ 412 │ │ │ subfolder=None if len(subfolder) == 0 else subfolder, │
│ │
│ c:\Python311\Lib\site-packages\huggingface_hub\utils\_validators.py:110 in inner_fn │
│ │
│ 107 │ │ │ kwargs.items(), # Kwargs values │
│ 108 │ │ ): │
│ 109 │ │ │ if arg_name in ["repo_id", "from_id", "to_id"]: │
│ ❱ 110 │ │ │ │ validate_repo_id(arg_value) │
│ 111 │ │ │ │
│ 112 │ │ │ elif arg_name == "token" and arg_value is not None: │
│ 113 │ │ │ │ has_token = True │
│ │
│ c:\Python311\Lib\site-packages\huggingface_hub\utils\_validators.py:164 in validate_repo_id │
│ │
│ 161 │ │ ) │
│ 162 │ │
│ 163 │ if not REPO_ID_REGEX.match(repo_id): │
│ ❱ 164 │ │ raise HFValidationError( │
│ 165 │ │ │ "Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are" │
│ 166 │ │ │ " forbidden, '-' and '.' cannot start or end the name, max length is 96:" │
│ 167 │ │ │ f" '{repo_id}'." │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.'
cannot start or end the name, max length is 96: 'RWForCausalLM(
(transformer): RWModel(
(word_embeddings): Embedding(65024, 4544)
(h): ModuleList(
(0-31): 32 x DecoderLayer(
(input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
(self_attention): Attention(
(maybe_rotary): RotaryEmbedding()
(query_key_value): Linear(in_features=4544, out_features=4672, bias=False)
(dense): Linear(in_features=4544, out_features=4544, bias=False)
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(mlp): MLP(
(dense_h_to_4h): Linear(in_features=4544, out_features=18176, bias=False)
(act): GELU(approximate='none')
(dense_4h_to_h): Linear(in_features=18176, out_features=4544, bias=False)
)
)
)
(ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
)
(lm_head): Linear(in_features=4544, out_features=65024, bias=False)
)'.

same, but I don't know where in the pipeline there is any repo id

I traced it back to the README file, of all things. HF appears to be validating README files. I was going to report it but haven't had a chance yet. In the meantime, I altered my local copy of the HF validation Python file to omit this record.

do you remember the name of the file you edited? thanks

Yes, it's the validators file, _validators.py, located at:
C:\Python311\Lib\site-packages\huggingface_hub\utils\_validators.py

This is the offending snippet:
if not REPO_ID_REGEX.match(repo_id):
    raise HFValidationError(
        "Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are"
        " forbidden, '-' and '.' cannot start or end the name, max length is 96:"
        f" '{repo_id}'."
    )

I just added a quick-and-dirty workaround that bypasses this check when repo_id is 'RWModel('.

Ideally, we should put a bit of code at the top, or better yet in the calling script, to exclude README.md files from validation.

patti-j changed discussion title from Repo ID HFValidationError in tokenizer_config file to Repo ID HFValidationError
patti-j changed discussion title from Repo ID HFValidationError to Repo ID HFValidationError for RWModel(

I got the same error, and this is what worked for me: instead of passing the model variable (the loaded model object) to the tokenizer, pass the model path (the repo id string) directly.

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")

@Karthik1611 what did you do for this line: model=AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True)

Same as tokenizer, I passed the model path directly instead of the model variable.

Sign up or log in to comment