from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from transformers import AutoTokenizer


class DocParsing:
    """Load a PDF and split its pages into overlapping, token-bounded chunks."""

    chunk_size = 350
    chunk_overlap = 50

    def __init__(self, file_path, model_name, max_model_tokens=384):
        self.file_path = file_path
        # Tokenizer of the target model, used to measure chunk length in tokens.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Maximum number of tokens the downstream model accepts.
        self.max_model_tokens = max_model_tokens

    def process_pdf(self):
        """Run the full pipeline: load the PDF, then chunk it."""
        self.load_pdf()
        self.create_chunks()
        return self.chunks

    def load_pdf(self):
        """Read the PDF; PyPDFLoader yields one Document per page."""
        loader = PyPDFLoader(self.file_path)
        self.documents = loader.load()

    def create_chunks(self):
        """Split every loaded page into overlapping token windows."""
        self.chunks = []
        for doc in self.documents:
            self.chunks.extend(
                self.token_split_document(
                    doc, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
                )
            )

    def tokenize(self, text):
        # Raw token ids without special tokens, so counts reflect content only.
        return self.tokenizer.encode(text, add_special_tokens=False)

    def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
        """Split a single Document into multiple Documents based on token length."""
        tokens = self.tokenize(doc.page_content)
        chunks = []
        start = 0
        while start < len(tokens):
            end = min(start + chunk_size, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.tokenizer.decode(chunk_tokens)

            # Each chunk keeps the source page's metadata.
            chunk_doc = Document(page_content=chunk_text, metadata=doc.metadata)
            chunks.append(chunk_doc)

            # Stop once the end of the document is reached; otherwise the overlap
            # step would emit a redundant trailing chunk.
            if end == len(tokens):
                break
            # Slide the window forward, keeping chunk_overlap tokens of context.
            start += chunk_size - chunk_overlap
        return chunks
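

# Example usage: a minimal sketch of how the class above could be driven.
# The PDF path and model name below are hypothetical placeholders, not values
# taken from this module.
if __name__ == "__main__":
    parser = DocParsing(
        file_path="example.pdf",
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    chunks = parser.process_pdf()
    print(f"Created {len(chunks)} chunks from {parser.file_path}")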