Spaces:
Sleeping
Sleeping
File size: 916 Bytes
31b6e27 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
import pytest
from preprocessing import num_tokens_from_string, split_in_chunks
def test_split_in_chunks():
    """Check that split_in_chunks respects the per-chunk token budget.

    Three scenarios are exercised against the ``cl100k_base`` encoding:

    1. A long repeated string split at the 8191-token limit must produce
       exactly two chunks, each within the limit.
    2. The same string split at several smaller budgets must never yield
       a chunk that exceeds the requested budget.
    3. A short sentence must come back as a single, unmodified chunk.
    """
    encoding = "cl100k_base"
    limit = 8191

    # Long input: the original author's note puts this at about 10000 tokens,
    # i.e. more than one 8191-token chunk but fewer than two full ones.
    long_text = "hello world " * 5000
    pieces = split_in_chunks(long_text, max_tokens=limit, encoding_name=encoding)
    assert len(pieces) == 2
    for piece in pieces:
        assert num_tokens_from_string(piece, encoding) <= limit

    # Smaller budgets: every produced chunk must stay within its budget.
    for budget in (100, 1000, 3000, 5000):
        pieces = split_in_chunks(
            long_text, max_tokens=budget, encoding_name=encoding
        )
        for piece in pieces:
            assert num_tokens_from_string(piece, encoding) <= budget

    # Short input: nothing to split, so the text is returned as-is.
    short_text = "This is a short text."
    pieces = split_in_chunks(short_text, max_tokens=limit, encoding_name=encoding)
    assert len(pieces) == 1
    assert pieces[0] == short_text
if __name__ == "__main__":
    # Allow running this file directly as a script. pytest.main() returns the
    # session's exit code; raising SystemExit with it makes the process exit
    # non-zero when tests fail, instead of always exiting with status 0.
    raise SystemExit(pytest.main())
|