|
import tempfile |
|
import os |
|
from pdf2image import convert_from_path |
|
from recognize_page import recognize_page |
|
from tqdm import tqdm |
|
|
|
def recognize_book(book_path: str, text_output_path: str) -> None:
    """Extract the text of every page of a PDF and write it to one text file.

    The PDF at ``book_path`` is passed to ``convert_from_path`` to obtain one
    image per page. Each page image is saved to a temporary PNG file, the
    path of that file is handed to ``recognize_page``, and the returned text
    is collected. Finally all page texts are concatenated, each preceded by a
    ``=== Page N ===`` header (1-based), and written to ``text_output_path``
    as UTF-8.

    Args:
        book_path: Path of the PDF file to process.
        text_output_path: Path of the text file to create or overwrite.

    Raises:
        Whatever ``convert_from_path``, ``recognize_page`` or the file
        operations raise; the temporary PNG of the page being processed is
        removed even in that case.
    """
    # NOTE(review): recognize_page is presumably an OCR routine returning a
    # str for the given image path -- confirm against its definition.
    pages = convert_from_path(book_path)

    page_texts = []
    for page in tqdm(pages, desc="Procces pdf"):
        # Only reserve a unique file name here. The handle is closed right
        # away so the path can be reopened for writing on every platform
        # (Windows refuses to reopen a file that is still held open).
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
            temp_path = temp_file.name

        try:
            page.save(temp_path, 'PNG')
            page_texts.append(recognize_page(temp_path))
        finally:
            # delete=False means cleanup is our job; do it even when saving
            # or recognition fails so no temporary files are left behind.
            os.remove(temp_path)

    # Build the output in one pass instead of repeated string concatenation.
    book_text = "".join(
        f"\n\n=== Page {page_number} ===\n\n" + page_text + "\n"
        for page_number, page_text in enumerate(page_texts, start=1)
    )

    with open(text_output_path, "w", encoding="utf-8") as text_file:
        text_file.write(book_text)
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: process the sample PDF in the working directory
    # and write the extracted text next to it.
    recognize_book(
        book_path="bv000030992_0001.pdf",
        text_output_path="book_text.txt",
    )
|
|