# Tests for the preprocessing chunk-splitting helpers.
import sys

import pytest

from preprocessing import num_tokens_from_string, split_in_chunks
def test_split_in_chunks():
    """Exercise split_in_chunks across three scenarios: an over-limit text is
    split into exactly two compliant chunks, every chunk honours a range of
    token budgets, and a short text passes through as a single unchanged chunk.
    """
    enc = "cl100k_base"
    long_text = "hello world " * 5000  # roughly 10000 tokens under cl100k_base

    # Over-limit text must come back as exactly two chunks, each within budget.
    chunks = split_in_chunks(long_text, max_tokens=8191, encoding_name=enc)
    assert len(chunks) == 2
    assert num_tokens_from_string(chunks[0], enc) <= 8191
    assert num_tokens_from_string(chunks[1], enc) <= 8191

    # Every produced chunk must respect a variety of max_tokens budgets.
    for budget in [100, 1000, 3000, 5000]:
        for piece in split_in_chunks(long_text, max_tokens=budget, encoding_name=enc):
            assert num_tokens_from_string(piece, enc) <= budget

    # A text already under the limit is returned whole, in a single chunk.
    short_text = "This is a short text."
    chunks = split_in_chunks(short_text, max_tokens=8191, encoding_name=enc)
    assert len(chunks) == 1
    assert chunks[0] == short_text
| if __name__ == "__main__": | |
| pytest.main() | |