import logging
import re
import unicodedata

import PyPDF2

from datamodel.data_model import PDFRawData
from .gpt_processor import Translator


class PDFProcessor:
    """Read a PDF page by page and collect its (possibly translated) text.

    On construction the file at ``file_path`` is opened and every page is
    extracted, whitespace-normalised and, when the page contains no CJK
    ideograph, passed through ``Translator.translate_to_chinese``.

    Results are stored in ``self.file_info``:

    * ``file_name``         -- last ``/``-separated component of ``file_path``
    * ``file_format``       -- always ``'PDF'``
    * ``total_pages``       -- number of pages reported by the reader
    * ``file_content``      -- ``{page_num: {'page_num', 'page_content'}}``,
      with 1-based page numbers
    * ``file_full_content`` -- concatenation of every ``page_content``
    * ``is_chinese``        -- result of the Chinese check for the most
      recently processed page (``False`` until a page has been processed)

    Errors while reading are logged and swallowed; ``file_info`` then holds
    whatever was collected before the failure.
    """

    def __init__(self, file_path: str) -> None:
        """Store ``file_path`` and immediately process the PDF.

        Args:
            file_path: Path of the PDF file to read.
        """
        self.file_path = file_path
        self.file_info = {
            'file_name': self.file_path.split('/')[-1],
            'file_format': 'PDF',
            'total_pages': 0,
            'file_content': {},
            'file_full_content': '',
            # Present from the start so readers never hit a KeyError when
            # the file is missing, empty, or fails before the first page.
            'is_chinese': False,
        }
        self.__build_info()

    @staticmethod
    def _clean_text(text) -> str:
        """Normalise extracted page text into a single-line string.

        ``None`` (no extractable text) is treated as an empty string.
        """
        text = unicodedata.normalize("NFKD", text or '')
        text = text.replace('\n', ' ').replace('\r', '')
        # Collapse runs of spaces left behind by the newline replacement.
        return re.sub(' +', ' ', text)

    def __build_info(self) -> None:
        """Populate ``self.file_info`` from the PDF at ``self.file_path``.

        Each page is cleaned, checked for Chinese characters and translated
        when none are found. Any exception is logged rather than raised, so
        a partially filled ``file_info`` is possible.
        """
        translator = None
        try:
            with open(self.file_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                self.file_info['total_pages'] = len(pdf_reader.pages)

                for page_num, page in enumerate(pdf_reader.pages, start=1):
                    logging.info("Processing page %d...", page_num)
                    text = self._clean_text(page.extract_text())

                    is_chinese = self.__is_chinese(text)
                    self.file_info['is_chinese'] = is_chinese

                    if is_chinese:
                        page_content = text
                    else:
                        # Created on first use and reused for later pages
                        # instead of building a new one for every page.
                        if translator is None:
                            translator = Translator()
                        page_content = translator.translate_to_chinese(text)

                    self.file_info['file_content'][page_num] = {
                        'page_num': page_num,
                        'page_content': page_content,
                    }
                    self.file_info['file_full_content'] += page_content
        except FileNotFoundError:
            logging.error("File not found: %s", self.file_path)
        except Exception as e:
            # Best-effort by design: keep what was collected, record the
            # failure (with traceback) instead of propagating it.
            logging.exception("An error occurred: %s", e)

    def __is_chinese(self, text: str) -> bool:
        """Return True if ``text`` has any character in U+4E00..U+9FFF."""
        return any('\u4e00' <= char <= '\u9fff' for char in text)