# AskMyPDF/utils/document_parsing.py

from langchain.document_loaders import PyPDFLoader
from transformers import AutoTokenizer
from langchain.schema import Document


class DocParsing:
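    """Parse a PDF file into overlapping, token-sized chunks of LangChain Documents."""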

    chunk_size = 350
    chunk_overlap = 50

    def __init__(self, file_path, model_name, max_model_tokens=384):
        """
        Initialize the DocParsing class with the provided file path, model name, and maximum model tokens.

        Parameters:
            file_path (str): The path to the PDF file to be processed.
            model_name (str): The name of the transformer model to be used for tokenization.
            max_model_tokens (int, optional): The maximum number of tokens the model can handle per sequence. Defaults to 384.

        Returns:
            None
        """
        self.file_path = file_path
        # Initialize the tokenizer for the given model (e.g., an all-MiniLM sentence transformer)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_model_tokens = max_model_tokens

    def process_pdf(self):
        """
        Process the PDF file by loading it, splitting it into chunks, and returning the chunks.

        This function first calls the `load_pdf` method to load the PDF file into a list of Document objects.
        Then, it calls the `create_chunks` method to split each Document into smaller chunks based on the specified
        chunk size and overlap. Finally, it returns the list of chunks.

        Parameters:
            None

        Returns:
            list: A list of Document objects, where each Document represents a chunk of the PDF file.
        """
        self.load_pdf()
        self.create_chunks()
        return self.chunks

    def load_pdf(self):
        """
        Load the PDF file specified by the file_path attribute into a list of Document objects.

        This function uses the PyPDFLoader class from the langchain library to load the PDF file.
        The loaded Document objects are stored in the self.documents attribute.

        Parameters:
            None

        Returns:
            None

        Raises:
            FileNotFoundError: If the specified file_path does not exist or cannot be accessed.
        """
        loader = PyPDFLoader(self.file_path)
        self.documents = loader.load()
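        # PyPDFLoader yields one Document per PDF page, each with page-level metadata (source, page number).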

    def create_chunks(self):
        """
        Split the loaded PDF documents into smaller chunks based on the specified chunk size and overlap.

        This function iterates through each Document object in the self.documents list and calls the
        token_split_document method to split the Document into smaller chunks. The resulting chunks are
        then appended to the self.chunks list.

        Parameters:
            None

        Returns:
            None

        Attributes:
            self.chunks (list): A list of Document objects, where each Document represents a chunk of the PDF file.
        """
        self.chunks = []
        for doc in self.documents:
            self.chunks.extend(
                self.token_split_document(
                    doc, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
                )
            )

    def tokenize(self, text):
        """
        Tokenize the input text using the transformer model's tokenizer.

        This method uses the tokenizer provided by the transformer model to encode the input text.
        Special tokens are not added during encoding.

        Parameters:
            text (str): The input text to be tokenized.

        Returns:
            list: A list of integers representing the tokenized input text.
        """
        return self.tokenizer.encode(text, add_special_tokens=False)

    def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
        """
        Split a single Document into multiple chunks based on token length.

        This function tokenizes the input Document's page content, then splits the tokens into smaller chunks
        of the specified size. Overlapping chunks are created by moving the start index forward by the difference
        between chunk size and overlap. Each chunk is then decoded back into text and wrapped in a new Document
        that keeps the original metadata.

        Parameters:
            doc (Document): The input Document to be split into chunks.
            chunk_size (int, optional): The size of each chunk in tokens. Defaults to 350.
            chunk_overlap (int, optional): The overlap between chunks in tokens. Defaults to 50.

        Returns:
            list: A list of Document objects, where each Document represents a chunk of the input Document.
        """
        tokens = self.tokenize(doc.page_content)
        chunks = []
        start = 0
        while start < len(tokens):
            end = min(start + chunk_size, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.tokenizer.decode(chunk_tokens)
            # Create a new Document that carries the chunk's text and the original page metadata
            chunk_doc = Document(page_content=chunk_text, metadata=doc.metadata)
            chunks.append(chunk_doc)
            # Move start forward by chunk_size - chunk_overlap for overlapping context
            start += chunk_size - chunk_overlap
        return chunks
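

if __name__ == "__main__":
    # Minimal usage sketch (illustrative; not part of the original module). The PDF path and
    # model name below are assumptions: point them at a real file and any HuggingFace tokenizer.
    parser = DocParsing(
        file_path="example.pdf",
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    chunks = parser.process_pdf()
    print(f"Produced {len(chunks)} chunks")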