File size: 5,161 Bytes
c835cf4 24412da c835cf4 24412da c835cf4 24412da c835cf4 24412da c835cf4 24412da c835cf4 24412da c835cf4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
from langchain.document_loaders import PyPDFLoader
from transformers import AutoTokenizer
from langchain.schema import Document
class DocParsing:
    """
    Load a PDF and split its pages into overlapping, token-bounded chunks.

    The tokenizer named by ``model_name`` is used both to measure chunk length
    and to turn each token window back into text.

    Attributes:
    chunk_size (int): Default number of tokens per chunk used by `create_chunks`.
    chunk_overlap (int): Default number of tokens shared by consecutive chunks.
    """

    # Class-level defaults; `create_chunks` reads them through `self`, so they
    # can be overridden per instance or per subclass.
    chunk_size = 350
    chunk_overlap = 50

    def __init__(self, file_path, model_name, max_model_tokens=384):
        """
        Initialize the DocParsing class with the provided file path, model name, and maximum model tokens.

        Parameters:
        file_path (str): The path to the PDF file to be processed.
        model_name (str): The name of the transformer model whose tokenizer is loaded.
        max_model_tokens (int, optional): Token budget of the target model. Defaults to 384.
            It is stored on the instance but is not consulted by any method of this class.

        Returns:
        None
        """
        self.file_path = file_path
        # The tokenizer is selected by `model_name`; nothing here is tied to a
        # particular model family.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_model_tokens = max_model_tokens
        # Start with empty containers so `create_chunks` / `chunks` are usable
        # (and simply empty) before `load_pdf` has been called.
        self.documents = []
        self.chunks = []

    def process_pdf(self):
        """
        Process the PDF file by loading it, splitting it into chunks, and returning the chunks.

        Calls `load_pdf` to populate `self.documents`, then `create_chunks` to
        populate `self.chunks`, and returns `self.chunks`.

        Parameters:
        None

        Returns:
        list: A list of Document objects, where each Document represents a chunk of the PDF file.
        """
        self.load_pdf()
        self.create_chunks()
        return self.chunks

    def load_pdf(self):
        """
        Load the PDF file specified by the file_path attribute into a list of Document objects.

        The file is read with PyPDFLoader and the result of `loader.load()` is
        stored in `self.documents`.

        Parameters:
        None

        Returns:
        None

        Raises:
        Any exception raised by PyPDFLoader (for example for a missing or
        unreadable file) propagates to the caller unchanged.
        """
        loader = PyPDFLoader(self.file_path)
        self.documents = loader.load()

    def create_chunks(self):
        """
        Split the loaded PDF documents into smaller chunks based on the configured chunk size and overlap.

        Every Document in `self.documents` is passed to `token_split_document`
        with `self.chunk_size` and `self.chunk_overlap`; the resulting chunks
        are collected, in order, into `self.chunks` (replacing any previous value).

        Parameters:
        None

        Returns:
        None
        """
        self.chunks = []
        for doc in self.documents:
            self.chunks.extend(
                self.token_split_document(
                    doc, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
                )
            )

    def tokenize(self, text):
        """
        Tokenize the input text using the transformer model's tokenizer.

        Special tokens are not added, so the returned length reflects the text alone.

        Parameters:
        text (str): The input text to be tokenized.

        Returns:
        list: A list of integers representing the tokenized input text.
        """
        return self.tokenizer.encode(text, add_special_tokens=False)

    def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
        """
        Split a single Document into multiple chunks based on token length.

        The page content is tokenized and walked with a sliding window of
        `chunk_size` tokens that advances by `chunk_size - chunk_overlap`
        tokens each step. Each window is decoded back to text and wrapped in a
        new Document carrying a copy of the source metadata. Splitting stops as
        soon as a window reaches the last token, so no chunk is ever a pure
        subset of the one before it.

        Parameters:
        doc (Document): The input Document to be split into chunks.
        chunk_size (int, optional): The size of each chunk in tokens. Defaults to 350.
        chunk_overlap (int, optional): The overlap between chunks in tokens. Defaults to 50.

        Returns:
        list: A list of Document objects, where each Document represents a chunk of the
        input Document. Empty when the page content produces no tokens.

        Raises:
        ValueError: If `chunk_size` is not positive, or `chunk_overlap` is not smaller
        than `chunk_size` (the window would never advance).
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if chunk_overlap >= chunk_size:
            raise ValueError(
                f"chunk_overlap ({chunk_overlap}) must be smaller than "
                f"chunk_size ({chunk_size})"
            )

        tokens = self.tokenize(doc.page_content)
        total = len(tokens)
        # Guaranteed > 0 by the validation above, so the loop always terminates.
        stride = chunk_size - chunk_overlap

        chunks = []
        start = 0
        while start < total:
            end = min(start + chunk_size, total)
            chunk_text = self.tokenizer.decode(tokens[start:end])
            # Give every chunk its own metadata dict so that editing one chunk's
            # metadata cannot leak into its siblings or the source page.
            chunks.append(
                Document(page_content=chunk_text, metadata=dict(doc.metadata))
            )
            if end == total:
                # This window already covers the tail; another step would only
                # re-emit tokens contained in this chunk.
                break
            start += stride
        return chunks
|