import os
import tempfile

from pdf2image import convert_from_path
from tqdm import tqdm

from recognize_page import recognize_page


def _recognize_page_image(page):
    """Save one rendered page to a temporary PNG and run recognition on it.

    Args:
        page: A page object from ``convert_from_path``; only its
            ``save(path, format)`` method is used here.

    Returns:
        Whatever ``recognize_page`` returns for the saved PNG path
        (the caller concatenates it with strings, so presumably ``str``).
    """
    # Reserve a unique file name, then close the handle right away so the
    # file can be reopened by name (and later removed) on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
        temp_path = temp_file.name
    try:
        # Save the page to the temporary file.
        page.save(temp_path, 'PNG')
        return recognize_page(temp_path)
    finally:
        # Always clean up, even when saving or recognition fails.
        os.remove(temp_path)


def recognize_book(book_path: str, text_output_path: str) -> None:
    """Recognize every page of a PDF and write the combined text to a file.

    Each page is rendered with ``convert_from_path``, passed through
    ``recognize_page`` via a temporary PNG, and appended to the output under
    a ``=== Page N ===`` header (pages are numbered from 1).

    Args:
        book_path: Path to the source PDF file.
        text_output_path: Path of the UTF-8 text file to create or overwrite.
    """
    pages = convert_from_path(book_path)

    parts = []
    for index, page in enumerate(tqdm(pages, desc="Procces pdf")):
        parts.append(f"\n\n=== Page {index + 1} ===\n\n")
        parts.append(_recognize_page_image(page))
        parts.append("\n")

    # The output file is opened only after all pages were recognized, so a
    # failure midway does not truncate an existing output file.
    with open(text_output_path, "w", encoding="utf-8") as text_file:
        text_file.write("".join(parts))


if __name__ == "__main__":
    recognize_book(book_path="bv000030992_0001.pdf", text_output_path="book_text.txt")