import os
import nltk
import pytest
from tests.utils import wrap_test_forked
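

# Word tokenizer based on NLTK (word_tokenize needs NLTK's 'punkt' data to be downloaded).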
def nltkTokenize(text):
    words = nltk.word_tokenize(text)
    return words


import re
WORD = re.compile(r'\w+')
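

# Simple regex tokenizer: a fast, approximate word-count baseline.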
def regTokenize(text):
    words = WORD.findall(text)
    return words


import time
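

# Opt-in benchmark comparing token counts and timings for the same prompt across
# several tokenizers; enable with MEASURE=1, and optionally override the prompt via PROMPT.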
@pytest.mark.skipif(not os.getenv('MEASURE'),
                    reason="For checking token length for various methods: MEASURE=1 pytest -s -v tests/test_tokenizer.py")
@wrap_test_forked
def test_tokenizer1():
prompt = """Here is an example of how to write a Python program to generate the Fibonacci sequence:
def fib(n):
a, b = 0, 1
if n == 0 or n == 1:
return a
for i in range(n-2):
a, b = b, a+b
return b
for i in range(10):
print(fib(i))
This program defines a function called fib that takes an integer n as input and returns the nth Fibonacci number. The function uses two variables a and b to keep track of the current and previous Fibonacci numbers.
The first two lines of the function check if n is either 0 or 1, in which case the function returns 0 or 1 respectively. If n is greater than 1, the function iterates over the range of integers from 2 to n-1, adding the previous two Fibonacci numbers to get the current Fibonacci number. Finally, the function returns the last Fibonacci number calculated.
In the main part of the program, we use a for loop to call the fib function with different"""
prompt = os.getenv('PROMPT', prompt)
run_tokenizer1(prompt)
def run_tokenizer1(prompt):
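    # Load the distilgpt2 tokenizer, the h2oGPT LLM tokenizer, and the INSTRUCTOR
    # embedding model, then time each tokenization method on the given prompt.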
    from transformers import AutoTokenizer
    t = AutoTokenizer.from_pretrained("distilgpt2")
    llm_tokenizer = AutoTokenizer.from_pretrained('h2oai/h2ogpt-oig-oasst1-512-6_9b')
    from InstructorEmbedding import INSTRUCTOR
    emb = INSTRUCTOR('hkunlp/instructor-large')

    t0 = time.time()
    a = len(regTokenize(prompt))
    print("Regexp Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(nltkTokenize(prompt))
    print("NLTK Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(t(prompt)['input_ids'])
    print("Slow Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(llm_tokenizer(prompt)['input_ids'])
    print("Fast Tokenizer LLM", a, time.time() - t0)

    t0 = time.time()
    a = emb.tokenize([prompt])['input_ids'].shape[1]
    print("Instruct Embedding", a, time.time() - t0)


@wrap_test_forked
def test_fake_tokenizer():
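    # FakeTokenizer (from src.utils) provides approximate token counts; the
    # "disallowed special token" error asserted below appears to come from its
    # underlying tiktoken-style encoding.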
    from src.utils import FakeTokenizer
    t = FakeTokenizer()
    assert t.num_tokens_from_string('How are you?') == 4
    assert t.num_tokens_from_string('<|endoftext|>') == 7
    try:
        t.encoding.encode('<|endoftext|>')
        raise RuntimeError("Shouldn't reach here")
    except ValueError as e:
        assert "disallowed special token" in str(e)


@wrap_test_forked
def test_tokenizer_base_model1():
    # Test loading a tokenizer (tokenizer_base_model) separate from the base model itself.
    from tests.test_langchain_units import get_test_model
    model, tokenizer, base_model, prompt_type = get_test_model(base_model='HuggingFaceH4/zephyr-7b-beta',
                                                               tokenizer_base_model='amazon/MistralLite',
                                                               prompt_type='human_bot')
    assert 'MistralForCausalLM' in str(model)
    assert 'amazon/MistralLite' in str(tokenizer)
    assert prompt_type == 'human_bot'
    print("here")


@wrap_test_forked
def test_tokenizer_base_model2():
    # Use a separate tokenizer for a vLLM endpoint, so the full model does not have to be
    # loaded locally; a proxy tokenizer is enough.
    # With a vLLM inference server, an invalid base model name should not cause a failure at all.
    from tests.test_langchain_units import get_test_model
    model, tokenizer, base_model, prompt_type = get_test_model(base_model='HuggingFaceH4/zephyr-7b-omega',
                                                               tokenizer_base_model='amazon/MistralLite',
                                                               prompt_type='human_bot',
                                                               inference_server="vllm:localhost:8080",
                                                               max_seq_len=4096)
    assert model['base_url'] == 'http://localhost:8080/v1'
    assert 'amazon/MistralLite' in str(tokenizer)
    assert prompt_type == 'human_bot'
    print("here")
if __name__ == '__main__':
    test_tokenizer1()