File size: 1,286 Bytes
4834106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from langchain_community.document_loaders import PyPDFLoader
import os
from typing import List
class PDFProcessor:
    """
    Class for processing PDF files to extract text content.
    """

    def extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """
        Extract text content from a list of PDF files.

        Each file is loaded with ``PyPDFLoader(...).load_and_split()`` and the
        text of every document the loader returns is appended to one flat
        list, in input order. Processing is best-effort: a file that raises
        while being loaded is reported on stdout and skipped, and the
        remaining files are still processed.

        Args:
            file_paths (List[str]): A list of file paths to the PDF documents.
                A single path given as a bare ``str`` or ``os.PathLike`` is
                treated as a one-element list, and ``None`` as an empty list.

        Returns:
            List[str]: The extracted text, one entry per document returned by
            the loader (not one entry per input file). Empty when nothing
            could be extracted.
        """
        if file_paths is None:
            return []

        # A bare string is itself iterable; without this guard every
        # character of the path would be treated as a separate file.
        if isinstance(file_paths, (str, os.PathLike)):
            file_paths = [file_paths]

        texts = []
        for file_path in file_paths:
            try:
                loader = PyPDFLoader(file_path)
                # NOTE(review): load_and_split() applies the loader's default
                # text splitter, so entries are presumably chunks rather than
                # whole pages — confirm against the installed LangChain version.
                documents = loader.load_and_split()

                for document in documents:
                    content = document.page_content
                    if isinstance(content, bytes):
                        # Undecodable bytes are dropped rather than raising.
                        text = content.decode('utf-8', errors='ignore')
                    elif isinstance(content, str):
                        text = content
                    else:
                        print(f"Unexpected type: {type(content)}")
                        continue
                    texts.append(text)
            except Exception as e:
                # Deliberately broad: one unreadable or missing file must not
                # abort extraction for the rest of the batch.
                print(f"Failed to process {file_path}: {e}")

        return texts