File size: 2,514 Bytes
47d601f 1ffa386 47d601f efb3b2c 47d601f efb3b2c 47d601f efb3b2c 47d601f 1ffa386 25e037f 47d601f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
"""
Test cases for the tokenizer loading
"""
import unittest
import pytest
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_tokenizer
class TestTokenizers(unittest.TestCase):
    """
    test class for the load_tokenizer fn
    """

    def test_default_use_fast(self):
        """A Fast tokenizer is loaded when tokenizer_use_fast is unset."""
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
            }
        )
        tokenizer = load_tokenizer(cfg)
        assert "Fast" in tokenizer.__class__.__name__

    def test_dont_use_fast(self):
        """tokenizer_use_fast=False yields the slow (non-Fast) tokenizer class."""
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "tokenizer_use_fast": False,
            }
        )
        tokenizer = load_tokenizer(cfg)
        assert "Fast" not in tokenizer.__class__.__name__

    def test_special_tokens_modules_to_save(self):
        """Overriding a special token with a LoRA adapter requires lora_modules_to_save."""
        # setting special_tokens to new token
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "adapter": "lora",
                "special_tokens": {"bos_token": "[INST]"},
            }
        )
        with pytest.raises(
            ValueError,
            # NOTE: pytest.raises uses re.search, so a plain substring pattern
            # suffices. The previous pattern ended in a bare `save*`, where `*`
            # quantified the literal `e` rather than matching "any text".
            match=r"Please set lora_modules_to_save",
        ):
            load_tokenizer(cfg)

        # setting special_tokens but not changing from default
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "adapter": "lora",
                "special_tokens": {"bos_token": "<s>"},
            }
        )
        load_tokenizer(cfg)

        # non-adapter setting special_tokens
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "special_tokens": {"bos_token": "[INST]"},
            }
        )
        load_tokenizer(cfg)

    def test_add_additional_special_tokens(self):
        """additional_special_tokens extend the vocab, consistently across reloads."""
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "special_tokens": {"additional_special_tokens": ["<|im_start|>"]},
            }
        )
        tokenizer = load_tokenizer(cfg)
        # new token should be appended at id 32000 (llama base vocab is 32000)
        self.assertEqual(tokenizer("<|im_start|>user")["input_ids"], [1, 32000, 1404])
        self.assertEqual(len(tokenizer), 32001)

        # ensure reloading the tokenizer again from cfg results in same vocab length
        tokenizer = load_tokenizer(cfg)
        self.assertEqual(len(tokenizer), 32001)
# Allow running this test module directly (outside a pytest session).
if __name__ == "__main__":
    unittest.main()
|