from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from transformers import AutoTokenizer


class DocParsing:
    # Default chunking parameters, measured in tokens.
    chunk_size = 350
    chunk_overlap = 50

    def __init__(self, file_path, model_name, max_model_tokens=384):
        self.file_path = file_path
        # Initialize the tokenizer for all-MiniLM
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_model_tokens = max_model_tokens

    def process_pdf(self):
        self.load_pdf()
        self.create_chunks()
        return self.chunks

    def load_pdf(self):
        loader = PyPDFLoader(self.file_path)
        self.documents = loader.load()

    def create_chunks(self):
        # Split each loaded page into overlapping token-based chunks
        self.chunks = []
        for doc in self.documents:
            self.chunks.extend(
                self.token_split_document(
                    doc, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
                )
            )

    def tokenize(self, text):
        return self.tokenizer.encode(text, add_special_tokens=False)

    def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
        """Split a single Document into multiple Documents based on token length."""
        tokens = self.tokenize(doc.page_content)
        chunks = []
        start = 0
        while start < len(tokens):
            end = min(start + chunk_size, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.tokenizer.decode(chunk_tokens)
            # Create a new Document with the same metadata but truncated text
            chunk_doc = Document(page_content=chunk_text, metadata=doc.metadata)
            chunks.append(chunk_doc)
            # Move start forward by chunk_size - chunk_overlap for overlapping context
            start += chunk_size - chunk_overlap
        return chunks
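

# A minimal usage sketch: the PDF path and the
# "sentence-transformers/all-MiniLM-L6-v2" checkpoint below are illustrative
# placeholders, not values pinned by the class itself; any local PDF and any
# Hugging Face tokenizer name would work.
if __name__ == "__main__":
    parser = DocParsing(
        file_path="example.pdf",
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    chunks = parser.process_pdf()
    print(f"Split PDF into {len(chunks)} chunks")
    print(chunks[0].page_content[:200])  # preview the first chunk's text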