karthikeyan-r committed on
Commit
4834106
1 Parent(s): a48b255

Create pdfProcessor.py

Browse files
Files changed (1) hide show
  1. pdfProcessor.py +36 -0
pdfProcessor.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ import os
3
+ from typing import List
4
class PDFProcessor:
    """
    Class for processing PDF files to extract text content.
    """

    def extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """
        Extract text content from a list of PDF files.

        Every path is opened with ``PyPDFLoader`` and read through
        ``load_and_split()``; the text of each returned document is appended
        to a single flat list, in input order. Processing is best-effort: a
        file that raises while being loaded is reported on stdout and
        skipped, so one bad file does not abort the whole batch.

        Args:
            file_paths (List[str]): A list of file paths to the PDF documents.
                A single path given as a bare ``str`` is treated as a
                one-element list (instead of being iterated character by
                character), and ``None`` is treated as an empty list.

        Returns:
            List[str]: The extracted texts, one entry per document returned
            by the loader (not one entry per input file). Empty when there
            is no input or when every file failed.
        """
        if file_paths is None:
            return []
        if isinstance(file_paths, str):
            # Iterating a bare string would try to open each character as a
            # separate file; the caller clearly meant one path.
            file_paths = [file_paths]

        texts = []
        for file_path in file_paths:
            try:
                loader = PyPDFLoader(file_path)
                # NOTE(review): load_and_split() presumably applies the
                # loader's default text splitter, so "pages" may be chunks
                # rather than whole PDF pages - confirm against LangChain docs.
                pages = loader.load_and_split()

                for page in pages:
                    text = self._content_to_text(page.page_content)
                    if text is not None:
                        texts.append(text)
            except Exception as e:
                # Deliberately broad: any failure for this file is reported
                # and the remaining files are still processed.
                print(f"Failed to process {file_path}: {e}")

        return texts

    @staticmethod
    def _content_to_text(content):
        """Return ``content`` as ``str``; report and return ``None`` for unsupported types."""
        if isinstance(content, str):
            return content
        if isinstance(content, bytes):
            # Undecodable bytes are dropped rather than failing the whole file.
            return content.decode('utf-8', errors='ignore')
        print(f"Unexpected type: {type(content)}")
        return None