winglian committed on
Commit
85cf4f8
1 Parent(s): 2e22404

better handling of empty input ids when tokenizing (#395)

Browse files

* better handling of empty input ids when tokenizing

* Add warning if tokenizer resulted in empty result

* fix len comparison for linter

Files changed (1) hide show
  1. src/axolotl/prompt_tokenizers.py +4 -1
src/axolotl/prompt_tokenizers.py CHANGED
@@ -74,8 +74,11 @@ class PromptTokenizingStrategy(abc.ABC):
74
  padding=False,
75
  return_tensors=None,
76
  )
 
 
77
  if (
78
- result["input_ids"][-1] != self.tokenizer.eos_token_id
 
79
  and len(result["input_ids"]) < self.sequence_len
80
  and add_eos_token
81
  ):
 
74
  padding=False,
75
  return_tensors=None,
76
  )
77
+ if len(result["input_ids"]) == 0:
78
+ LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
79
  if (
80
+ len(result["input_ids"]) > 0
81
+ and result["input_ids"][-1] != self.tokenizer.eos_token_id
82
  and len(result["input_ids"]) < self.sequence_len
83
  and add_eos_token
84
  ):