import logging
import os
import re
import unicodedata

import PyPDF2

from .gpt_processor import Translator

# Matches runs of one or more spaces so they can be collapsed to a single one.
_MULTI_SPACE_RE = re.compile(" +")


class PDFProcessor:
    """Read a PDF file and collect its text, page by page, into ``file_info``.

    After construction ``file_info`` holds:

    * ``file_name`` -- final path component of ``file_path``.
    * ``file_format`` -- always ``"PDF"``.
    * ``total_pages`` -- number of pages reported by the PDF reader.
    * ``file_content`` -- ``{page_num: {"page_num": int, "page_content": str}}``
      with 1-based page numbers.
    * ``file_full_content`` -- the cleaned text of all pages concatenated.
    * ``is_chinese`` -- ``True`` if any processed page contains a character in
      U+4E00..U+9FFF, ``False`` if pages were processed and none did, and the
      initial ``""`` if no page was processed (e.g. the file was not found).

    Errors while reading are logged, not raised; ``file_info`` then keeps
    whatever was collected before the failure.
    """

    def __init__(self, file_path: str) -> None:
        """Store ``file_path`` and immediately extract the file's content.

        Args:
            file_path: Path of the PDF file to read.
        """
        self.file_path = file_path
        self.file_info = {
            # basename() also strips Windows-style directories on Windows,
            # and matches the previous split("/")[-1] result on POSIX.
            "file_name": os.path.basename(self.file_path),
            "file_format": "PDF",
            "total_pages": 0,
            "file_content": {},
            "file_full_content": "",
            "is_chinese": "",
        }
        self.__build_info()

    def __build_info(self) -> None:
        """Populate ``self.file_info`` from the PDF at ``self.file_path``.

        Failures (missing file, unreadable PDF, ...) are logged and
        swallowed so that construction never raises; pages processed before
        the failure remain in ``file_info``.
        """
        # Sticky flag: once any page contains Chinese text the document is
        # considered Chinese, regardless of what later pages contain.
        contains_chinese = False
        try:
            with open(self.file_path, "rb") as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                self.file_info["total_pages"] = len(pdf_reader.pages)

                for page_num, page in enumerate(pdf_reader.pages, start=1):
                    logging.info("Processing page %d...", page_num)

                    # Pages without extractable text may yield an empty or
                    # missing result; treat both as an empty page.
                    text = self.__clean_text(page.extract_text() or "")

                    contains_chinese = contains_chinese or self.__is_chinese(text)
                    self.file_info["is_chinese"] = contains_chinese

                    self.file_info["file_content"][page_num] = {
                        "page_num": page_num,
                        "page_content": text,
                    }
                    # Updated per page (not joined at the end) so partial
                    # content is kept if a later page fails.
                    self.file_info["file_full_content"] += text
        except FileNotFoundError:
            logging.error("File not found: %s", self.file_path)
        except Exception as e:
            # Best-effort by design: report with traceback, do not propagate.
            logging.exception("An error occurred: %s", e)

    @staticmethod
    def __clean_text(text: str) -> str:
        """Return ``text`` NFKD-normalised with line breaks and space runs collapsed."""
        text = unicodedata.normalize("NFKD", text)
        # Newlines become spaces; carriage returns are dropped outright.
        text = text.replace("\n", " ").replace("\r", "")
        return _MULTI_SPACE_RE.sub(" ", text)

    def __is_chinese(self, text: str) -> bool:
        """Return ``True`` if ``text`` has any character in U+4E00..U+9FFF."""
        return any("\u4e00" <= char <= "\u9fff" for char in text)