from langchain.document_loaders import PyPDFLoader
from transformers import AutoTokenizer
from langchain.schema import Document


class DocParsing:

    chunk_size = 350
    chunk_overlap = 50

    def __init__(self, file_path, model_name, max_model_tokens=384):
        """
        Initialize the DocParsing class with the provided file path, model name, and maximum model tokens.

        Parameters:
        file_path (str): The path to the PDF file to be processed.
        model_name (str): The name of the transformer model to be used for tokenization.
        max_model_tokens (int, optional): The maximum number of tokens allowed for each chunk. Defaults to 384.

        Returns:
        None
        """
        self.file_path = file_path

        # Initialize the tokenizer for all-MiniLM
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.max_model_tokens = max_model_tokens

    def process_pdf(self):
        """
        Process the PDF file by loading it, splitting it into chunks, and returning the chunks.

        This function first calls the `load_pdf` method to load the PDF file into a list of Document objects.
        Then, it calls the `create_chunks` method to split each Document into smaller chunks based on the specified
        chunk size and overlap. Finally, it returns the list of chunks.

        Parameters:
        None

        Returns:
        list: A list of Document objects, where each Document represents a chunk of the PDF file.
        """
        self.load_pdf()
        self.create_chunks()
        return self.chunks

    def load_pdf(self):
        """
        Load the PDF file specified by the file_path attribute into a list of Document objects.

        This function uses the PyPDFLoader class from the langchain library to load the PDF file.
        The loaded Document objects are stored in the self.documents attribute.

        Parameters:
        None

        Returns:
        None

        Raises:
        FileNotFoundError: If the specified file_path does not exist or cannot be accessed.
        """
        loader = PyPDFLoader(self.file_path)
        self.documents = loader.load()

    def create_chunks(self):
        """
        Split the loaded PDF documents into smaller chunks based on the specified chunk size and overlap.

        This function iterates through each Document object in the self.documents list and calls the
        token_split_document method to split the Document into smaller chunks. The resulting chunks are
        then appended to the self.chunks list.

        Parameters:
        None

        Returns:
        None

        Attributes:
        self.chunks (list): A list of Document objects, where each Document represents a chunk of the PDF file.
        """
        self.chunks = []
        for doc in self.documents:
            self.chunks.extend(
                self.token_split_document(
                    doc, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
                )
            )

    def tokenize(self, text):
        """
        Tokenize the input text using the transformer model's tokenizer.

        This method uses the tokenizer provided by the transformer model to encode the input text.
        The special tokens are not added to the encoded tokens.

        Parameters:
        text (str): The input text to be tokenized.

        Returns:
        list: A list of integers representing the tokenized input text.
        """
        return self.tokenizer.encode(text, add_special_tokens=False)

    def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
        """
        Split a single Document into multiple chunks based on token length.

        This function tokenizes the input Document's page content, then splits the tokens into smaller chunks
        of specified size. Overlapping chunks are created by moving the start index forward by the difference
        between chunk size and overlap. Each chunk is then decoded back into text and a new Document is created
        with the same metadata and the chunk's text.

        Parameters:
        doc (Document): The input Document to be split into chunks.
        chunk_size (int, optional): The size of each chunk in tokens. Defaults to 350.
        chunk_overlap (int, optional): The overlap between chunks in tokens. Defaults to 50.

        Returns:
        list: A list of Document objects, where each Document represents a chunk of the input Document.
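
        Example:
        With the defaults (chunk_size=350, chunk_overlap=50), successive chunks
        start 350 - 50 = 300 tokens apart, so each chunk shares its last 50
        tokens with the start of the next one.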
        """
        tokens = self.tokenize(doc.page_content)
        chunks = []
        start = 0
        while start < len(tokens):
            end = min(start + chunk_size, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.tokenizer.decode(chunk_tokens)
            # Create a new Document carrying the original metadata and the chunk's text
            chunk_doc = Document(page_content=chunk_text, metadata=doc.metadata)
            chunks.append(chunk_doc)
            # Move start forward by chunk_size - chunk_overlap for overlapping context
            start += chunk_size - chunk_overlap
        return chunks
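

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. The PDF path and
    # tokenizer checkpoint below are placeholder assumptions; swap in your own
    # file and any Hugging Face model whose tokenizer matches your embedder.
    parser = DocParsing(
        file_path="example.pdf",  # hypothetical path
        model_name="sentence-transformers/all-MiniLM-L6-v2",  # assumed checkpoint
    )
    chunks = parser.process_pdf()
    print(f"Produced {len(chunks)} chunks")
    if chunks:
        # Preview the first chunk's metadata and the start of its text.
        print(chunks[0].metadata)
        print(chunks[0].page_content[:200])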