Upload 5 files

- README.md (+46 -5)
- recognize_book.py (+32 -0)
- recognize_page.py (+99 -86)
- requirements_book.txt (+7 -0)
- requirements_page.txt (+3 -1)
README.md
CHANGED
@@ -26,10 +26,22 @@ print(generated_text)
```

## Usage for recognizing the page
1. Download the following files from the repository: recognize_page.py, requirements_page.txt
2. Install tesseract-ocr (on Windows, see also the note after these steps)
   - Linux
     ```bash
     sudo apt install tesseract-ocr
     ```
   - MacOS
     ```bash
     brew install tesseract
     ```
   - Windows
     1. Download the Tesseract installer (.exe) from https://github.com/UB-Mannheim/tesseract/wiki.
     2. Run the installer; the default location is C:\Program Files (x86)\Tesseract-OCR.
     3. Open a command prompt for your virtual environment (or an Anaconda Prompt).
3. Install dependencies:
   ```bash
   pip install -r requirements_page.txt
   ```
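If Tesseract is installed but its binary is not on PATH (common on Windows), pytesseract has to be pointed at it explicitly. A minimal sketch, assuming the default install directory from the Windows steps above:

```python
import pytesseract

# Tell pytesseract where the Tesseract binary lives (Windows default install path)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
```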
51 |
from recognize_page import recognize_page
|
52 |
|
53 |
page_path = "cleaned_pages/C2V10/page11.png"
|
54 |
+
text = recognize_page(page_path, text_output_path="output/file.txt")
|
55 |
|
56 |
print(f"Текст страницы:\n")
|
57 |
print(text)
|
58 |
```
|
59 |
|
60 |
+
## Usage for recognizing the book from pdf-file
|
61 |
+
1. Download the following files from the repository: recognize_book.py, recognize_page.py, requirements_book.txt
|
62 |
+
2. Install tesseract-ocr
|
63 |
+
- Linux
|
64 |
+
```bash
|
65 |
+
sudo apt install tesseract-ocr
|
66 |
+
```
|
67 |
+
- MacOS
|
68 |
+
```bash
|
69 |
+
brew install tesseract-ocr
|
70 |
+
```
|
71 |
+
- Windows
|
72 |
+
1. Download tesseract exe from https://github.com/UB-Mannheim/tesseract/wiki.
|
73 |
+
|
74 |
+
2. Install this exe in C:\Program Files (x86)\Tesseract-OCR
|
75 |
+
|
76 |
+
3. Open virtual machine command prompt in windows or anaconda prompt.
|
77 |
+
|
78 |
+
3. Install dependencies:
|
79 |
+
```bash
|
80 |
+
pip install -r requirements_book.txt
|
81 |
+
```
|
82 |
+
4. Use this code
|
83 |
+
```python
|
84 |
+
from recognize_book import recognize_book
|
85 |
+
|
86 |
+
recognize_book(book_path="path/to/your/book.pdf", text_output_path="book_text.txt")
|
87 |
+
```
|
88 |
+
|
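Note: converting the pdf in recognize_book.py relies on pdf2image, which in turn needs the poppler utilities installed on the system, for example `sudo apt install poppler-utils` on Linux or `brew install poppler` on MacOS; on Windows, poppler binaries have to be downloaded separately and made available on PATH.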
## Metrics on test
- CER (Char Error Rate) = 0.095
- WER (Word Error Rate) = 0.298
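Both metrics are edit-distance based (errors divided by the reference length in characters or words). A minimal sketch of how such scores can be computed with the jiwer package; the package choice and the strings below are illustrative assumptions, not the setup used to produce the numbers above:

```python
import jiwer

reference = "примеръ текста въ дореформенной орѳографіи"   # ground-truth transcription (placeholder)
hypothesis = "примеръ текста в дореформенной орфографіи"    # model output (placeholder)

print("CER:", jiwer.cer(reference, hypothesis))
print("WER:", jiwer.wer(reference, hypothesis))
```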
recognize_book.py
ADDED
@@ -0,0 +1,32 @@
import tempfile
import os
from pdf2image import convert_from_path
from recognize_page import recognize_page
from tqdm import tqdm


def recognize_book(book_path: str, text_output_path: str):
    data = {}

    pages = convert_from_path(book_path)

    for i in tqdm(range(len(pages)), desc="Processing pdf"):
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
            # Save the page to a temporary file
            pages[i].save(temp_file.name, 'PNG')
            page_text = recognize_page(temp_file.name)

            data[i] = page_text

            os.remove(temp_file.name)

    book_text = ""
    for i in data.keys():
        book_text += f"\n\n=== Page {i+1} ===\n\n"
        book_text += data[i] + "\n"

    with open(text_output_path, "w", encoding="utf-8") as text_file:
        text_file.write(book_text)


if __name__ == "__main__":
    recognize_book(book_path="bv000030992_0001.pdf", text_output_path="book_text.txt")
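For very large scans, converting the whole pdf in one call can be memory-hungry. A minimal sketch of converting only a page range (dpi, first_page and last_page are standard pdf2image parameters; the chunking itself is illustrative and not part of recognize_book.py):

```python
from pdf2image import convert_from_path

# Convert only pages 1-10 of the scan at 300 DPI instead of the whole file
pages = convert_from_path("bv000030992_0001.pdf", dpi=300, first_page=1, last_page=10)
print(f"Converted {len(pages)} pages")
```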
recognize_page.py
CHANGED
@@ -1,99 +1,112 @@
import pytesseract
import json
import numpy as np
from PIL import Image, ImageEnhance
from transformers import VisionEncoderDecoderModel, TrOCRProcessor


hf_model = VisionEncoderDecoderModel.from_pretrained("Serovvans/trocr-prereform-orthography")
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")


def remove_bleed_through(image_path,
                         brightness_factor=1.5):
    # Load the image
    pil_image = Image.open(image_path).convert('RGB')
    img = np.array(pil_image)

    # Linear contrast stretch to suppress bleed-through from the reverse side
    alpha = 1.7
    beta = -130

    result = alpha * img + beta
    result = np.clip(result, 0, 255).astype(np.uint8)

    # Convert back to a PIL Image
    pil_result = Image.fromarray(result)

    # Increase brightness
    enhancer_brightness = ImageEnhance.Brightness(pil_result)
    bright_image = enhancer_brightness.enhance(brightness_factor)

    return bright_image


# Recognize a single text fragment with TrOCR
def recognize_row(image):
    image = image.convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = hf_model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text


# Main recognition function: splits the page into word/phrase fragments
def recognize_page(image_path, text_output_path=False):
    # Open the image and suppress bleed-through
    image = remove_bleed_through(image_path)

    # Use pytesseract to get layout data for every fragment
    data = pytesseract.image_to_data(image, config="--psm 6", output_type=pytesseract.Output.DICT, lang='ukr+eng')
    with open("rec_data.json", "w", encoding="utf-8") as json_file:
        json.dump(data, json_file)

    pad = int(0.0042 * image.size[1])
    # Keep the fragments so the reading order can be restored later
    fragments = []
    for i in range(len(data['text'])):
        if data['conf'][i] > -1 and data['width'][i]*data['height'][i] > pad**2:  # Ignore empty fragments
            x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
            fragment_image = image.crop((x-pad, y-pad, x + w + pad, y + h + pad))
            text = recognize_row(fragment_image).strip()

            if data['text'][i].strip() == "\u2014":
                text = "\u2014"

            # Trust Tesseract for purely numeric tokens it is confident about
            try:
                num = int(data['text'][i].strip())
                if data['conf'][i] > 85:
                    text = data['text'][i].strip()
            except ValueError:
                pass
            fragments.append({
                'block_num': data['block_num'][i],
                'par_num': data['par_num'][i],
                'line_num': data['line_num'][i],
                'word_num': data['word_num'][i],
                'text': text,
                'image': fragment_image
            })

    # Sort fragments by block, paragraph, line and word number
    fragments = sorted(fragments, key=lambda x: (x['block_num'], x['par_num'], x['line_num'], x['word_num']))

    # Assemble the recognized text line by line
    result_lines = []
    current_line_num = 0
    curr_block_num = 0
    curr_par_num = 0
    current_line = []

    for fragment in fragments:
        if fragment['line_num'] != current_line_num or fragment['block_num'] != curr_block_num or fragment['par_num'] != curr_par_num:
            # Finish the current line and move on to the next one
            result_lines.append(" ".join(current_line))
            current_line = []
            current_line_num = fragment['line_num']
            curr_block_num = fragment['block_num']
            curr_par_num = fragment['par_num']

        # Recognized text of the fragment
        recognized_text = fragment['text']
        current_line.append(recognized_text)

    # Append the last line
    if current_line:
        result_lines.append(" ".join(current_line))

    # Join the lines into the full page text
    final_text = "\n".join(result_lines)

    if text_output_path:
        with open(text_output_path, "w", encoding="utf-8") as text_file:
            text_file.write(final_text)

    return final_text
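A quick way to sanity-check the preprocessing step in isolation is to run remove_bleed_through on a single scan and save the result; the file names below are placeholders:

```python
from recognize_page import remove_bleed_through

# Preview bleed-through removal on one page image (placeholder paths)
cleaned = remove_bleed_through("cleaned_pages/C2V10/page11.png", brightness_factor=1.5)
cleaned.save("page11_cleaned.png")
```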
requirements_book.txt
ADDED
@@ -0,0 +1,7 @@
numpy==1.23.0
pdf2image==1.17.0
pytesseract==0.3.10
Pillow==10.0.0
transformers==4.33.2
torch==2.0.1
tqdm
requirements_page.txt
CHANGED
@@ -1,4 +1,6 @@
pytesseract==0.3.10
Pillow==10.0.0
transformers==4.33.2
torch==2.0.1
numpy==1.23.0
tqdm