tmm1 committed on
Commit
e029ab3
1 Parent(s): 8cec513

quiet noise from llama tokenizer by setting pad token earlier

Browse files
Files changed (1) hide show
  1. src/axolotl/utils/models.py +5 -5
src/axolotl/utils/models.py CHANGED
@@ -59,17 +59,17 @@ def load_tokenizer(
59
  **tokenizer_kwargs,
60
  )
61
 
62
- LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
63
- LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
64
- LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
65
- LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
66
-
67
  if tokenizer.__class__.__name__ in [
68
  "LlamaTokenizer",
69
  "LlamaTokenizerFast",
70
  ]:
71
  tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN
72
 
 
 
 
 
 
73
  if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
74
  tokenizer.add_special_tokens({"pad_token": "[PAD]"})
75
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
59
  **tokenizer_kwargs,
60
  )
61
 
 
 
 
 
 
62
  if tokenizer.__class__.__name__ in [
63
  "LlamaTokenizer",
64
  "LlamaTokenizerFast",
65
  ]:
66
  tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN
67
 
68
+ LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
69
+ LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
70
+ LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
71
+ LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
72
+
73
  if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
74
  tokenizer.add_special_tokens({"pad_token": "[PAD]"})
75
  os.environ["TOKENIZERS_PARALLELISM"] = "false"