better handling of empty input ids when tokenizing (#395)
Browse files
* better handling of empty input ids when tokenizing
* Add warning if the tokenizer returns an empty result
* fix len comparison for linter
src/axolotl/prompt_tokenizers.py
CHANGED
@@ -74,8 +74,11 @@ class PromptTokenizingStrategy(abc.ABC):
|
|
74 |
padding=False,
|
75 |
return_tensors=None,
|
76 |
)
|
|
|
|
|
77 |
if (
|
78 |
-
result["input_ids"]
|
|
|
79 |
and len(result["input_ids"]) < self.sequence_len
|
80 |
and add_eos_token
|
81 |
):
|
|
|
74 |
padding=False,
|
75 |
return_tensors=None,
|
76 |
)
|
77 |
+
if len(result["input_ids"]) == 0:
|
78 |
+
LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
|
79 |
if (
|
80 |
+
len(result["input_ids"]) > 0
|
81 |
+
and result["input_ids"][-1] != self.tokenizer.eos_token_id
|
82 |
and len(result["input_ids"]) < self.sequence_len
|
83 |
and add_eos_token
|
84 |
):
|