Upload 2 files
Browse files- recognize_page.py +99 -0
- requirements_page.txt +4 -0
recognize_page.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from PIL import Image
|
3 |
+
import pytesseract
|
4 |
+
from tqdm import tqdm
|
5 |
+
from pytesseract import Output
|
6 |
+
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
|
7 |
+
|
8 |
+
def _load_trocr():
    """Return the ``(model, processor)`` pair, loading it on first use only.

    The pair is stored on the function object so that later calls reuse the
    already-loaded objects instead of calling ``from_pretrained`` again.
    """
    cached = getattr(_load_trocr, "_cached", None)
    if cached is None:
        model = VisionEncoderDecoderModel.from_pretrained("Serovvans/trocr-prereform-orthography")
        processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
        cached = (model, processor)
        _load_trocr._cached = cached
    return cached


def recognize_row(row_file):
    """Recognize the text on an image of a single text row.

    Args:
        row_file: Path to (or file object of) the row image; passed to
            ``Image.open``.

    Returns:
        str: The decoded text of the first (and only) generated sequence,
        with special tokens removed.
    """
    hf_model, processor = _load_trocr()

    # Close the file handle as soon as the pixel tensor has been built.
    with Image.open(row_file) as image:
        # Normalise to three channels so grayscale, palette or RGBA inputs
        # do not reach the processor with an unexpected channel count.
        rgb_image = image.convert("RGB")
        pixel_values = processor(images=rgb_image, return_tensors="pt").pixel_values

    generated_ids = hf_model.generate(pixel_values)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
16 |
+
|
17 |
+
def _line_box(words):
    """Return ``(x, y, width, height)`` for the word boxes of one text line.

    ``y`` is the mean of the word tops (not the minimum) and ``height`` is
    measured from that mean down to the lowest word bottom.
    """
    left = min(word['x'] for word in words)
    right = max(word['x'] + word['w'] for word in words)
    mean_top = sum(word['y'] for word in words) / len(words)
    bottom = max(word['y'] + word['h'] for word in words)
    return left, mean_top, right - left, bottom - mean_top


def recognize_page(image_path, output_dir="./", page_name=None):
    """Split a page image into text rows, recognize each row and join the text.

    Word boxes reported by Tesseract are grouped into rows by their top
    coordinate, the rows are sorted top to bottom, and each row is cropped,
    written to a temporary PNG in ``output_dir``, passed to
    ``recognize_row`` and then deleted.

    Args:
        image_path (str): Path to the page image.
        output_dir (str): Directory used for the temporary row images; it is
            created if it does not exist.
        page_name (str): Prefix for the temporary row file names. With the
            default ``None`` the literal text ``None`` ends up in the name.

    Returns:
        str: The recognized row texts joined with single spaces; an empty
        string when no text boxes were found.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Maximum difference between the tops of consecutive boxes for them to
    # be treated as belonging to the same row.
    y_threshold = 15

    with Image.open(image_path) as image:
        data = pytesseract.image_to_data(image, config='--psm 3', output_type=Output.DICT)

        line_boxes = []
        current_line = []
        previous_y = None

        boxes = zip(
            data['level'], data['left'], data['top'],
            data['width'], data['height'], data['text'],
        )
        for level, x, y, w, h, raw_text in boxes:
            # Only line-level (4) and word-level (5) entries are considered.
            if level not in (4, 5):
                continue

            text = raw_text.strip()
            if not text:
                continue

            # A jump in the top coordinate closes the row collected so far.
            # current_line is non-empty exactly when previous_y has been set.
            if current_line and abs(y - previous_y) > y_threshold:
                line_boxes.append(_line_box(current_line))
                current_line = []

            current_line.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})
            previous_y = y

        # Flush the last row.
        if current_line:
            line_boxes.append(_line_box(current_line))

        # Order rows top to bottom by their (mean) top coordinate.
        line_boxes.sort(key=lambda box: box[1])

        recognized_text = []
        for index, (x, y, w, h) in enumerate(tqdm(line_boxes, desc="Processing page")):
            # Add 10 px of padding above the row, clamped to the image top.
            top = max(0, y - 10)
            row_image = image.crop((x, top, x + w, y + h))
            row_image_path = os.path.join(output_dir, f'{page_name}_row_{index}.png')

            try:
                row_image.save(row_image_path)
                recognized_text.append(recognize_row(row_image_path))
            finally:
                # Always remove the temporary crop, even when saving or
                # recognition fails, so no stray files are left behind.
                if os.path.exists(row_image_path):
                    os.remove(row_image_path)

    return ' '.join(recognized_text)
|
requirements_page.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pytesseract==0.3.10
|
2 |
+
Pillow==10.0.0
|
3 |
+
transformers==4.33.2
|
4 |
+
torch==2.0.1
|