File size: 1,039 Bytes
caa85f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import tempfile
import os
from pdf2image import convert_from_path
from recognize_page import recognize_page
from tqdm import tqdm

def recognize_book(book_path: str, text_output_path: str) -> None:
    """Recognize the text of every page of a PDF and write it to a text file.

    The PDF is rendered page by page with ``convert_from_path``; each page is
    saved as a temporary PNG, passed to ``recognize_page`` by file path, and
    the temporary file is removed again. The recognized texts are written to
    ``text_output_path`` (UTF-8), each preceded by a ``=== Page N ===``
    header with 1-based page numbers.

    Args:
        book_path: Path to the source PDF file.
        text_output_path: Path of the text file to create or overwrite.

    Note:
        ``recognize_page`` is assumed to return a ``str`` -- TODO confirm
        against its definition; a non-string result raises ``TypeError``
        when the page section is assembled.
    """
    pages = convert_from_path(book_path)

    sections = []
    for page_number, page in enumerate(tqdm(pages, desc="Procces pdf"), start=1):
        # Reserve a unique file name, then close the handle right away so the
        # page can be written to (and read from) that path on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
            temp_path = temp_file.name

        try:
            # Save the page to the temporary file
            page.save(temp_path, 'PNG')
            page_text = recognize_page(temp_path)
        finally:
            # delete=False means nobody else cleans up: remove the file even
            # when saving or recognition fails, so no temp files are leaked.
            os.remove(temp_path)

        sections.append(f"\n\n=== Page {page_number} ===\n\n" + page_text + "\n")

    # Join once instead of growing a string with += for every page.
    book_text = "".join(sections)

    with open(text_output_path, "w", encoding="utf-8") as text_file:
        text_file.write(book_text)


if __name__ == "__main__":
    # Script entry point: run recognition on the sample PDF and write the
    # result to book_text.txt (both paths relative to the working directory).
    recognize_book("bv000030992_0001.pdf", "book_text.txt")