from langchain.document_loaders import PyPDFLoader
from transformers import AutoTokenizer
from langchain.schema import Document


class DocParsing:
    chunk_size = 350
    chunk_overlap = 50

    def __init__(self, file_path, model_name, max_model_tokens=384):
        """
        Initialize the DocParsing class with the provided file path, model name,
        and maximum model tokens.

        Parameters:
            file_path (str): The path to the PDF file to be processed.
            model_name (str): The name of the transformer model to be used for tokenization.
            max_model_tokens (int, optional): The maximum number of tokens allowed
                for each chunk. Defaults to 384.

        Returns:
            None
        """
        self.file_path = file_path
        # Initialize the tokenizer for the embedding model (e.g. all-MiniLM)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_model_tokens = max_model_tokens

    def process_pdf(self):
        """
        Process the PDF file by loading it, splitting it into chunks, and returning the chunks.

        This method first calls `load_pdf` to load the PDF file into a list of Document
        objects, then calls `create_chunks` to split each Document into smaller chunks
        based on the configured chunk size and overlap, and finally returns the list
        of chunks.

        Returns:
            list: A list of Document objects, where each Document represents a chunk
                of the PDF file.
        """
        self.load_pdf()
        self.create_chunks()
        return self.chunks

    def load_pdf(self):
        """
        Load the PDF file specified by the file_path attribute into a list of Document objects.

        This method uses the PyPDFLoader class from the langchain library to load the
        PDF file. The loaded Document objects are stored in the self.documents attribute.

        Returns:
            None

        Raises:
            FileNotFoundError: If the specified file_path does not exist or cannot be accessed.
        """
        loader = PyPDFLoader(self.file_path)
        self.documents = loader.load()

    def create_chunks(self):
        """
        Split the loaded PDF documents into smaller chunks based on the configured
        chunk size and overlap.

        This method iterates over each Document in self.documents and calls
        token_split_document to split it into smaller chunks. The resulting chunks
        are collected in self.chunks.

        Attributes:
            self.chunks (list): A list of Document objects, where each Document
                represents a chunk of the PDF file.
        """
        self.chunks = []
        for doc in self.documents:
            self.chunks.extend(
                self.token_split_document(
                    doc,
                    chunk_size=self.chunk_size,
                    chunk_overlap=self.chunk_overlap,
                )
            )

    def tokenize(self, text):
        """
        Tokenize the input text using the transformer model's tokenizer.

        The text is encoded without adding special tokens.

        Parameters:
            text (str): The input text to be tokenized.

        Returns:
            list: A list of integers representing the tokenized input text.
        """
        return self.tokenizer.encode(text, add_special_tokens=False)

    def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
        """
        Split a single Document into multiple chunks based on token length.

        This method tokenizes the Document's page content and splits the tokens into
        chunks of the specified size. Overlapping chunks are created by advancing the
        start index by chunk_size - chunk_overlap. Each chunk is decoded back into
        text and wrapped in a new Document that carries the original metadata.

        Parameters:
            doc (Document): The input Document to be split into chunks.
            chunk_size (int, optional): The size of each chunk in tokens. Defaults to 350.
            chunk_overlap (int, optional): The overlap between chunks in tokens. Defaults to 50.

        Returns:
            list: A list of Document objects, where each Document represents a chunk
                of the input Document.
        """
        tokens = self.tokenize(doc.page_content)
        chunks = []
        start = 0
        while start < len(tokens):
            end = min(start + chunk_size, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.tokenizer.decode(chunk_tokens)
            # Create a new Document with the same metadata but truncated text
            chunk_doc = Document(page_content=chunk_text, metadata=doc.metadata)
            chunks.append(chunk_doc)
            # Move start forward by chunk_size - chunk_overlap for overlapping context
            start += chunk_size - chunk_overlap
        return chunks
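

# Example usage (a minimal sketch): the PDF path and model name below are
# illustrative assumptions, not values defined by this module. Any Hugging Face
# checkpoint with a compatible tokenizer (e.g. a sentence-transformers model)
# should work as model_name.
if __name__ == "__main__":
    parser = DocParsing(
        file_path="example.pdf",  # hypothetical path; replace with a real PDF
        model_name="sentence-transformers/all-MiniLM-L6-v2",  # assumed embedding model
    )
    chunks = parser.process_pdf()
    print(f"Produced {len(chunks)} chunks")
    if chunks:
        # Each chunk is a langchain Document carrying the source page metadata
        print(chunks[0].metadata, chunks[0].page_content[:200])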