Spaces:

ChenyuRabbitLove
/

junyi_bot_external

Runtime error

File size: 1,904 Bytes

f807e7d
 
 
 
 
 
 
26f62c4
f807e7d
 
 
 
26f62c4
 
 
 
 
 
f807e7d
 
 
 
 
26f62c4
f807e7d
 
26f62c4
f807e7d
 
 
26f62c4
 
 
f807e7d
a2f42ca
f807e7d
26f62c4
 
 
 
 
 
f807e7d
 
 
 
26f62c4
f807e7d
 
26f62c4
f807e7d
26f62c4

import PyPDF2
import unicodedata
import re
import logging

from .gpt_processor import Translator


class PDFProcessor:
    """Extract and normalise the text of a PDF file, page by page.

    Constructing an instance immediately reads ``file_path`` with PyPDF2 and
    fills ``file_info``, a dict with these keys:

    * ``file_name``: last ``/``-separated component of ``file_path``.
    * ``file_format``: always ``"PDF"``.
    * ``total_pages``: page count reported by the reader (0 if unreadable).
    * ``file_content``: ``{page_num: {"page_num": int, "page_content": str}}``
      keyed by 1-based page number.
    * ``file_full_content``: every page's text concatenated in page order,
      with no separator between pages.
    * ``is_chinese``: ``True`` if any processed page contains a character in
      U+4E00..U+9FFF, ``False`` if none does; stays ``""`` when no page was
      processed (missing file, read error, or empty document).

    Read failures are logged and swallowed, so construction never raises
    because of a bad or missing PDF; callers should check ``total_pages`` /
    ``file_content`` to see how much was extracted.
    """

    def __init__(self, file_path: str) -> None:
        """Store ``file_path`` and eagerly extract the PDF's contents.

        Args:
            file_path: Path to the PDF file to read.
        """
        self.file_path = file_path
        self.file_info = {
            "file_name": self.file_path.split("/")[-1],
            "file_format": "PDF",
            "total_pages": 0,
            "file_content": {},
            "file_full_content": "",
            "is_chinese": "",
        }
        self.__build_info()

    def __build_info(self) -> None:
        """Open the PDF and populate ``file_info``; log instead of raising."""
        try:
            with open(self.file_path, "rb") as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                self.file_info["total_pages"] = len(pdf_reader.pages)
                # Pages must be consumed while the file handle is still open.
                self._ingest_pages(pdf_reader.pages)
        except FileNotFoundError:
            logging.error("File not found: %s", self.file_path)
        except Exception as e:
            # Best-effort boundary: keep whatever pages were already stored,
            # but record the traceback so the failure is diagnosable.
            logging.exception("An error occurred: %s", str(e))

    def _ingest_pages(self, pages) -> None:
        """Normalise each page's text and record it in ``file_info``.

        Args:
            pages: Iterable of objects exposing ``extract_text()``; they are
                numbered from 1 in iteration order.
        """
        found_chinese = False
        for page_num, page in enumerate(pages, start=1):
            logging.info("Processing page %d...", page_num)

            # Treat a falsy/None extraction result as an empty page rather
            # than letting normalize() raise and abort the remaining pages.
            text = page.extract_text() or ""
            text = unicodedata.normalize("NFKD", text)
            text = text.replace("\n", " ").replace("\r", "")
            text = re.sub(" +", " ", text)

            # The flag describes the whole file, so once any page contains
            # Chinese it must stay True; a later Latin-only or blank page
            # must not reset it.
            found_chinese = found_chinese or self.__is_chinese(text)
            self.file_info["is_chinese"] = found_chinese

            self.file_info["file_content"][page_num] = {
                "page_num": page_num,
                "page_content": text,
            }
            # Updated per page so a mid-document failure keeps partial text.
            self.file_info["file_full_content"] += text

    def __is_chinese(self, text: str) -> bool:
        """Return True if ``text`` has any character in U+4E00..U+9FFF."""
        return any("\u4e00" <= char <= "\u9fff" for char in text)