better handling since xgen tokenizer breaks with convert_tokens_to_ids
src/axolotl/prompt_tokenizers.py
CHANGED
@@ -48,16 +48,22 @@ class PromptTokenizingStrategy(abc.ABC):
 
     @functools.lru_cache(maxsize=128)
     def _get_user_token(self):
-        id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
-        if isinstance(id_or_ids, (int,)):
-            return id_or_ids
+        try:
+            id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
+            if isinstance(id_or_ids, (int,)):
+                return id_or_ids
+        except KeyError:
+            pass
         return False
 
     @functools.lru_cache(maxsize=128)
     def _get_assistant_token(self):
-        id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
-        if isinstance(id_or_ids, (int,)):
-            return id_or_ids
+        try:
+            id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
+            if isinstance(id_or_ids, (int,)):
+                return id_or_ids
+        except KeyError:
+            pass
         return False
 
     def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):
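For context: with most Hugging Face tokenizers, convert_tokens_to_ids on a token that is not in the vocabulary typically returns the unknown-token id (or None), but the XGen tokenizer raises a KeyError, which is what the added try/except guards against. The snippet below is a minimal standalone sketch of the same guarded-lookup pattern, not part of this change; the get_special_token_id helper and the Salesforce/xgen-7b-8k-base checkpoint are used purely for illustration.

    # Minimal sketch of the guarded lookup (illustrative only, not part of this commit).
    from transformers import AutoTokenizer


    def get_special_token_id(tokenizer, token: str):
        """Return the id of `token` if the tokenizer knows it, otherwise False.

        Some tokenizers raise KeyError for out-of-vocabulary tokens instead of
        returning an unk id, so the lookup is wrapped in try/except, mirroring
        the change above.
        """
        try:
            id_or_ids = tokenizer.convert_tokens_to_ids(token)
            if isinstance(id_or_ids, int):
                return id_or_ids
        except KeyError:
            pass
        return False


    # Example checkpoint; XGen uses a custom tokenizer, hence trust_remote_code=True.
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/xgen-7b-8k-base", trust_remote_code=True)
    print(get_special_token_id(tokenizer, "<|USER|>"))  # token id if defined, else False

Returning False rather than letting the lookup raise means callers can treat a missing <|USER|> or <|ASSISTANT|> token as "not available" with a simple truthiness check, regardless of which tokenizer is in use.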