import os
import nltk
import pytest
from tests.utils import wrap_test_forked
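

# Word tokenizer based on NLTK (word_tokenize needs NLTK's 'punkt' data to be downloaded).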
def nltkTokenize(text):
    words = nltk.word_tokenize(text)
    return words


import re
WORD = re.compile(r'\w+')
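

# Simple regex tokenizer: a fast, approximate word-count baseline.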
def regTokenize(text):
    words = WORD.findall(text)
    return words


import time
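

# Opt-in benchmark comparing token counts and timings for the same prompt across
# several tokenizers; enable with MEASURE=1, and optionally override the prompt via PROMPT.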
@pytest.mark.skipif(not os.getenv('MEASURE'),
                    reason="For checking token length for various methods: MEASURE=1 pytest -s -v tests/test_tokenizer.py")
@wrap_test_forked
def test_tokenizer1():
prompt = """Here is an example of how to write a Python program to generate the Fibonacci sequence:
def fib(n):
a, b = 0, 1
if n == 0 or n == 1:
return a
for i in range(n-2):
a, b = b, a+b
return b
for i in range(10):
print(fib(i))
This program defines a function called fib that takes an integer n as input and returns the nth Fibonacci number. The function uses two variables a and b to keep track of the current and previous Fibonacci numbers.
The first two lines of the function check if n is either 0 or 1, in which case the function returns 0 or 1 respectively. If n is greater than 1, the function iterates over the range of integers from 2 to n-1, adding the previous two Fibonacci numbers to get the current Fibonacci number. Finally, the function returns the last Fibonacci number calculated.
In the main part of the program, we use a for loop to call the fib function with different"""
prompt = os.getenv('PROMPT', prompt)
run_tokenizer1(prompt)
def run_tokenizer1(prompt):
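    # Load the distilgpt2 tokenizer, the h2oGPT LLM tokenizer, and the INSTRUCTOR
    # embedding model, then time each tokenization method on the given prompt.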
    from transformers import AutoTokenizer
    t = AutoTokenizer.from_pretrained("distilgpt2")
    llm_tokenizer = AutoTokenizer.from_pretrained('h2oai/h2ogpt-oig-oasst1-512-6_9b')
    from InstructorEmbedding import INSTRUCTOR
    emb = INSTRUCTOR('hkunlp/instructor-large')

    t0 = time.time()
    a = len(regTokenize(prompt))
    print("Regexp Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(nltkTokenize(prompt))
    print("NLTK Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(t(prompt)['input_ids'])
    print("Slow Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(llm_tokenizer(prompt)['input_ids'])
    print("Fast Tokenizer LLM", a, time.time() - t0)

    t0 = time.time()
    a = emb.tokenize([prompt])['input_ids'].shape[1]
    print("Instruct Embedding", a, time.time() - t0)


@wrap_test_forked
def test_fake_tokenizer():
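    # FakeTokenizer (from src.utils) provides approximate token counts; the
    # "disallowed special token" error asserted below appears to come from its
    # underlying tiktoken-style encoding.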
    from src.utils import FakeTokenizer
    t = FakeTokenizer()
    assert t.num_tokens_from_string('How are you?') == 4
    assert t.num_tokens_from_string('<|endoftext|>') == 7
    try:
        t.encoding.encode('<|endoftext|>')
        raise RuntimeError("Shouldn't reach here")
    except ValueError as e:
        assert "disallowed special token" in str(e)


@wrap_test_forked
def test_tokenizer_base_model1():
    # Test loading a tokenizer (tokenizer_base_model) separate from the base model itself.
    from tests.test_langchain_units import get_test_model
    model, tokenizer, base_model, prompt_type = get_test_model(base_model='HuggingFaceH4/zephyr-7b-beta',
                                                               tokenizer_base_model='amazon/MistralLite',
                                                               prompt_type='human_bot')
    assert 'MistralForCausalLM' in str(model)
    assert 'amazon/MistralLite' in str(tokenizer)
    assert prompt_type == 'human_bot'
    print("here")


@wrap_test_forked
def test_tokenizer_base_model2():
    # Use a separate tokenizer for a vLLM endpoint, so the full model does not have to be
    # loaded locally; a proxy tokenizer is enough.
    # With a vLLM inference server, an invalid base model name should not cause a failure at all.
    from tests.test_langchain_units import get_test_model
    model, tokenizer, base_model, prompt_type = get_test_model(base_model='HuggingFaceH4/zephyr-7b-omega',
                                                               tokenizer_base_model='amazon/MistralLite',
                                                               prompt_type='human_bot',
                                                               inference_server="vllm:localhost:8080",
                                                               max_seq_len=4096)
    assert model['base_url'] == 'http://localhost:8080/v1'
    assert 'amazon/MistralLite' in str(tokenizer)
    assert prompt_type == 'human_bot'
    print("here")
if __name__ == '__main__':
    test_tokenizer1()