from langchain.document_loaders import PyPDFLoader
from transformers import AutoTokenizer
from langchain.schema import Document


class DocParsing:

    chunk_size = 350
    chunk_overlap = 50

    def __init__(self, file_path, model_name, max_model_tokens=384):
        """
        Initialize the DocParsing class with the provided file path, model name, and maximum model tokens.

        Parameters:
            file_path (str): The path to the PDF file to be processed.
            model_name (str): The name of the transformer model to be used for tokenization.
            max_model_tokens (int, optional): The maximum number of tokens allowed for each chunk. Defaults to 384.

        Returns:
            None
        """
        self.file_path = file_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_model_tokens = max_model_tokens

    def process_pdf(self):
        """
        Process the PDF file by loading it, splitting it into chunks, and returning the chunks.

        This method first calls `load_pdf` to load the PDF file into a list of Document objects,
        then calls `create_chunks` to split each Document into smaller chunks based on the
        configured chunk size and overlap, and finally returns the list of chunks.

        Parameters:
            None

        Returns:
            list: A list of Document objects, where each Document represents a chunk of the PDF file.
        """
        self.load_pdf()
        self.create_chunks()
        return self.chunks

    def load_pdf(self):
        """
        Load the PDF file specified by the file_path attribute into a list of Document objects.

        This method uses the PyPDFLoader class from the langchain library to load the PDF file.
        The loaded Document objects are stored in the self.documents attribute.

        Parameters:
            None

        Returns:
            None

        Raises:
            FileNotFoundError: If the specified file_path does not exist or cannot be accessed.
        """
        loader = PyPDFLoader(self.file_path)
        self.documents = loader.load()

    def create_chunks(self):
        """
        Split the loaded PDF documents into smaller chunks based on the specified chunk size and overlap.

        This method iterates over each Document object in the self.documents list and calls
        token_split_document to split it into smaller chunks. The resulting chunks are
        collected in the self.chunks list.

        Parameters:
            None

        Returns:
            None

        Attributes:
            self.chunks (list): A list of Document objects, where each Document represents a chunk of the PDF file.
        """
        self.chunks = []
        for doc in self.documents:
            self.chunks.extend(
                self.token_split_document(
                    doc, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
                )
            )

    def tokenize(self, text):
        """
        Tokenize the input text using the transformer model's tokenizer.

        This method encodes the input text with the tokenizer loaded for the transformer model.
        Special tokens are not added to the encoded output.

        Parameters:
            text (str): The input text to be tokenized.

        Returns:
            list: A list of integers representing the token IDs of the input text.
        """
        return self.tokenizer.encode(text, add_special_tokens=False)

    def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
        """
        Split a single Document into multiple chunks based on token length.

        This method tokenizes the Document's page content and slices the tokens into windows of
        the specified size. Consecutive windows overlap: after each chunk, the start index advances
        by chunk_size - chunk_overlap. Each window is decoded back into text and wrapped in a new
        Document that carries over the original metadata.

        Parameters:
            doc (Document): The input Document to be split into chunks.
            chunk_size (int, optional): The size of each chunk in tokens. Defaults to 350.
            chunk_overlap (int, optional): The overlap between consecutive chunks in tokens. Defaults to 50.

        Returns:
            list: A list of Document objects, where each Document represents a chunk of the input Document.
        """
        tokens = self.tokenize(doc.page_content)
        chunks = []
        start = 0
        while start < len(tokens):
            end = min(start + chunk_size, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.tokenizer.decode(chunk_tokens)

            chunk_doc = Document(page_content=chunk_text, metadata=doc.metadata)
            chunks.append(chunk_doc)

            start += chunk_size - chunk_overlap
        return chunks
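

# --- Example usage ---
# A minimal sketch of how DocParsing might be driven end to end. The PDF path
# and model name below are placeholders chosen for illustration (assumptions),
# not values defined elsewhere in this module.
if __name__ == "__main__":
    parser = DocParsing(
        file_path="sample.pdf",
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    chunks = parser.process_pdf()

    # With the class defaults (chunk_size=350, chunk_overlap=50), successive
    # chunks start at token offsets 0, 300, 600, ..., so consecutive chunks
    # share 50 tokens of context.
    for i, chunk in enumerate(chunks[:3]):
        print(f"chunk {i}: {len(parser.tokenize(chunk.page_content))} tokens")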