agoyal496 committed on
Commit
c835cf4
1 Parent(s): 382c3be

Added doc parsing

Files changed (1)
  1. utils/document_parsing.py +56 -0
utils/document_parsing.py ADDED
@@ -0,0 +1,56 @@
+ from langchain.document_loaders import PyPDFLoader
+ from transformers import AutoTokenizer
+ from langchain.schema import Document
+
+
+ class DocParsing:
+     """Parse a PDF and split it into token-bounded, overlapping chunks."""
+
+     chunk_size = 350
+     chunk_overlap = 50
+
+     def __init__(self, file_path, model_name, max_model_tokens=384):
+         self.file_path = file_path
+
+         # Initialize the tokenizer for all-MiniLM
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+         self.max_model_tokens = max_model_tokens
+
+     def process_pdf(self):
+         """Load the PDF and return its token-based chunks."""
+         self.load_pdf()
+         self.create_chunks()
+         return self.chunks
+
+     def load_pdf(self):
+         # Load the PDF into one Document per page
+         loader = PyPDFLoader(self.file_path)
+         self.documents = loader.load()
+
+     def create_chunks(self):
+         # Split each page-level document into overlapping token chunks
+         self.chunks = []
+         for doc in self.documents:
+             self.chunks.extend(
+                 self.token_split_document(
+                     doc, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
+                 )
+             )
+
+     def tokenize(self, text):
+         # Encode text without special tokens so chunk sizes reflect content only
+         return self.tokenizer.encode(text, add_special_tokens=False)
+
+     def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
+         """Split a single Document into multiple Documents based on token length."""
+         tokens = self.tokenize(doc.page_content)
+         chunks = []
+         start = 0
+         while start < len(tokens):
+             end = min(start + chunk_size, len(tokens))
+             chunk_tokens = tokens[start:end]
+             chunk_text = self.tokenizer.decode(chunk_tokens)
+             # Create a new Document with the same metadata but truncated text
+             chunk_doc = Document(page_content=chunk_text, metadata=doc.metadata)
+             chunks.append(chunk_doc)
+             # Move start forward by chunk_size - chunk_overlap for overlapping context
+             start += chunk_size - chunk_overlap
+         return chunks
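
For context, a minimal usage sketch of the new class. The PDF path and the sentence-transformers/all-MiniLM-L6-v2 model name below are illustrative assumptions, not part of this commit:

from utils.document_parsing import DocParsing

# Hypothetical inputs -- adjust the path and model name to your setup.
parser = DocParsing(
    file_path="data/sample.pdf",                          # assumed local PDF
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # assumed tokenizer name
    max_model_tokens=384,
)

chunks = parser.process_pdf()
print(f"Produced {len(chunks)} chunks")
print(chunks[0].page_content[:200])  # preview the first chunk

With chunk_size=350 and chunk_overlap=50, consecutive chunks share 50 tokens of context, keeping each chunk under the 384-token limit used when embedding with all-MiniLM-style models.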