trocr-prereform-orthography / recognize_book.py
Serovvans's picture
Upload 5 files
caa85f5 verified
import tempfile
import os
from pdf2image import convert_from_path
from recognize_page import recognize_page
from tqdm import tqdm
def recognize_book(book_path: str, text_output_path: str) -> None:
    """Recognize the text of every page of a PDF and write it to a text file.

    The PDF is converted to page images with ``convert_from_path``. Each page
    is saved to a temporary PNG file whose path is passed to
    ``recognize_page``; the returned text is collected in page order. The
    output file contains, for every page, a ``=== Page N ===`` header
    (1-based) followed by that page's text.

    Args:
        book_path: Path to the source PDF file.
        text_output_path: Path of the UTF-8 text file to write. It is only
            opened after all pages have been processed.

    Raises:
        Any exception raised by ``convert_from_path``, the image ``save``
        call, ``recognize_page`` or the file operations is propagated. The
        temporary PNG of the page being processed is removed first.
    """
    pages = convert_from_path(book_path)

    page_texts = []
    for page in tqdm(pages, desc="Procces pdf"):
        # Reserve a unique temporary path and close the handle immediately:
        # only the path is needed, and a file that is still open cannot be
        # removed on every platform (e.g. Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            temp_path = temp_file.name
        try:
            # Save the page to the temporary file, then recognize it by path.
            page.save(temp_path, "PNG")
            page_texts.append(recognize_page(temp_path))
        finally:
            # delete=False means cleanup is our job, also when a step fails.
            os.remove(temp_path)

    # Assemble all sections in one pass instead of repeated `+=` on a string.
    book_text = "".join(
        f"\n\n=== Page {page_number} ===\n\n" + page_text + "\n"
        for page_number, page_text in enumerate(page_texts, start=1)
    )

    with open(text_output_path, "w", encoding="utf-8") as text_file:
        text_file.write(book_text)
if __name__ == "__main__":
    # Script entry point: process the sample PDF and write its text
    # to book_text.txt (both paths are relative to the working directory).
    recognize_book(
        "bv000030992_0001.pdf",
        "book_text.txt",
    )