AttributeError: add_special_tokens conflicts with the method add_special_tokens in XgenTokenizer

#33
by Avici786 - opened

This is the code I am using with this model:
def train():
    """Fine-tune Salesforce/xgen-7b-8k-base on the Alpaca dataset with LoRA.

    Loads the train split of 'tatsu-lab/alpaca', loads the tokenizer and the
    model (with ``load_in_4bit=True``), wraps the model with a LoRA adapter,
    and runs ``SFTTrainer`` for two epochs, writing output to
    '/kaggle/working/'.
    """
    train_dataset = load_dataset('tatsu-lab/alpaca', split='train')
    tokenizer = AutoTokenizer.from_pretrained('Salesforce/xgen-7b-8k-base')

    # NOTE(review): the pad token is set to an empty string here; this may be
    # a token literal that was lost when the snippet was pasted -- confirm the
    # intended value before training.
    tokenizer.pad_token = ""

    model = AutoModelForCausalLM.from_pretrained(
        'Salesforce/xgen-7b-8k-base',
        load_in_4bit=True,
        torch_dtype=torch.float16,
        device_map='auto',
    )
    # Keep the embedding matrix in step with the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))
    model = prepare_model_for_kbit_training(model)

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias='none',
        task_type='CAUSAL_LM',
    )
    model = get_peft_model(model, peft_config)

    training_args = TrainingArguments(
        output_dir='/kaggle/working/',
        per_device_train_batch_size=4,
        optim='adamw_torch',
        logging_steps=100,
        learning_rate=2e-4,
        fp16=True,
        warmup_ratio=0.1,
        lr_scheduler_type='linear',
        num_train_epochs=2,
        save_strategy='epoch',
        push_to_hub=False,
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        dataset_text_field='text',
        max_seq_length=1024,
        tokenizer=tokenizer,
        args=training_args,
        packing=True,
        peft_config=peft_config,
    )
    trainer.train()

And I am getting this error:


AttributeError Traceback (most recent call last)
Cell In[11], line 1
----> 1 train()

Cell In[10], line 3, in train()
1 def train():
2 train_dataset = load_dataset('tatsu-lab/alpaca', split='train')
----> 3 tokenizer = AutoTokenizer.from_pretrained('Salesforce/xgen-7b-8k-base'
4 )
5 # tokenizer.pad_token = ""
6 model = AutoModelForCausalLM.from_pretrainedfrom('Salesforce/xgen-7b-8k-base', load_in_4bit=True,
7 torch_dtype=torch.float16,
8 device_map='auto')

File /opt/conda/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:905, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
903 if os.path.isdir(pretrained_model_name_or_path):
904 tokenizer_class.register_for_auto_class()
--> 905 return tokenizer_class.from_pretrained(
906 pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
907 )
908 elif config_tokenizer_class is not None:
909 tokenizer_class = None

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2213, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
2210 else:
2211 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2213 return cls._from_pretrained(
2214 resolved_vocab_files,
2215 pretrained_model_name_or_path,
2216 init_configuration,
2217 *init_inputs,
2218 token=token,
2219 cache_dir=cache_dir,
2220 local_files_only=local_files_only,
2221 _commit_hash=commit_hash,
2222 _is_local=is_local,
2223 trust_remote_code=trust_remote_code,
2224 **kwargs,
2225 )

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2447, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2445 # Instantiate the tokenizer.
2446 try:
-> 2447 tokenizer = cls(*init_inputs, **init_kwargs)
2448 except import_protobuf_decode_error():
2449 logger.info(
2450 "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
2451 "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).",
2452 )

File ~/.cache/huggingface/modules/transformers_modules/Salesforce/xgen-7b-8k-base/582db72ba9eb9569ca691abede581596fdbc3057/tokenization_xgen.py:139, in XgenTokenizer.__init__(self, pad_token, eos_token, add_eos_token, add_special_tokens, **kwargs)
137 self.add_eos_token = add_eos_token
138 self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
--> 139 super().__init__(
140 pad_token=pad_token_added,
141 eos_token=eos_token_added,
142 add_eos_token=add_eos_token,
143 add_special_tokens=add_special_tokens,
144 **kwargs,
145 )

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils.py:435, in PreTrainedTokenizer.__init__(self, **kwargs)
432 self._added_tokens_encoder: Dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}
434 # 4 init the parent class
--> 435 super().__init__(**kwargs)
437 # 4. If some of the special tokens are not part of the vocab, we add them, at the end.
438 # the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following tokenizers
439 self._add_tokens(
440 [token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder],
441 special_tokens=True,
442 )

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1592, in PreTrainedTokenizerBase.__init__(self, **kwargs)
1590 for key in kwargs:
1591 if hasattr(self, key) and callable(getattr(self, key)):
-> 1592 raise AttributeError(f"{key} conflicts with the method {key} in {self.__class__.__name__}")
1594 self.init_kwargs = copy.deepcopy(kwargs)
1595 self.name_or_path = kwargs.pop("name_or_path", "")

AttributeError: add_special_tokens conflicts with the method add_special_tokens in XgenTokenizer

Sign up or log in to comment