trocr-prereform-orthography / recognize_page.py

Upload 2 files

d83150d verified about 1 month ago

4.52 kB

	import os
	from PIL import Image
	import pytesseract
	from tqdm import tqdm
	from pytesseract import Output
	from transformers import VisionEncoderDecoderModel, TrOCRProcessor

	# Holds the lazily loaded (model, processor) pair so the weights are read
	# once per process instead of once per recognised text line.
	_TROCR_CACHE = {}


	def _load_trocr():
	    """Return the ``(model, processor)`` pair, loading it on first use only."""
	    if "pair" not in _TROCR_CACHE:
	        model = VisionEncoderDecoderModel.from_pretrained("Serovvans/trocr-prereform-orthography")
	        processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
	        # Store both together so a failed load never leaves a half-filled cache.
	        _TROCR_CACHE["pair"] = (model, processor)
	    return _TROCR_CACHE["pair"]


	def recognize_row(row_file):
	    """Recognise the text on a single text-line image.

	    Parameters:
	        row_file: Path (or file object) of the line image, as accepted by
	            ``PIL.Image.open``.

	    Returns:
	        str: The decoded text of the first (and only) generated sequence,
	        with special tokens removed.
	    """
	    # Open the image first so a bad path fails before the model is loaded.
	    with Image.open(row_file) as image:
	        # Scanned pages are often grayscale or 1-bit; convert so the processor
	        # always receives a 3-channel image. ``convert`` returns an in-memory
	        # copy, so the file can be closed right after.
	        rgb_image = image.convert("RGB")

	    hf_model, processor = _load_trocr()
	    pixel_values = processor(images=rgb_image, return_tensors="pt").pixel_values
	    generated_ids = hf_model.generate(pixel_values)
	    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

	def _merge_words(words):
	    """Collapse the word boxes of one text line into a single line record.

	    Returns a tuple ``(left, avg_top, width, height, words)`` where
	    ``avg_top`` is the mean of the word tops and ``height`` is measured from
	    ``avg_top`` down to the lowest word bottom.
	    """
	    left = min(word['x'] for word in words)
	    right = max(word['x'] + word['w'] for word in words)
	    avg_top = sum(word['y'] for word in words) / len(words)
	    bottom = max(word['y'] + word['h'] for word in words)
	    return (left, avg_top, right - left, bottom - avg_top, words)


	def recognize_page(image_path, output_dir="./", page_name=None):
	    """Split a page image into text lines, recognise each line and join the text.

	    Word boxes come from ``pytesseract.image_to_data`` (``--psm 3``). Words
	    are grouped into lines by their top coordinate, the lines are sorted top
	    to bottom, each line is cropped to a temporary PNG in ``output_dir`` and
	    passed to ``recognize_row``. The temporary file is removed afterwards,
	    even when saving or recognition fails.

	    Parameters:
	        image_path (str): Path to the page image.
	        output_dir (str): Directory used for the temporary line images; it is
	            created if it does not exist.
	        page_name (str): Prefix for the temporary line image names. Defaults
	            to None, in which case the names start with the literal "None".

	    Returns:
	        str: The recognised text of all lines joined with single spaces
	        (an empty string when no words were detected).
	    """
	    y_threshold = 15  # max gap between consecutive word tops within one line
	    top_padding = 10  # extra pixels kept above the averaged line top when cropping

	    os.makedirs(output_dir, exist_ok=True)

	    recognized_text = []
	    with Image.open(image_path) as image:
	        data = pytesseract.image_to_data(image, config='--psm 3', output_type=Output.DICT)

	        lines = []
	        current_line = []
	        previous_y = None

	        # Walk over every detected box, keeping only line (4) / word (5) level
	        # entries that carry text.
	        for i, level in enumerate(data['level']):
	            if level not in (4, 5):
	                continue

	            text = data['text'][i].strip()
	            if not text:
	                continue

	            x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]

	            # A jump in the top coordinate relative to the previous word
	            # starts a new line; flush the words collected so far.
	            # (current_line is non-empty only after previous_y has been set.)
	            if current_line and abs(y - previous_y) > y_threshold:
	                lines.append(_merge_words(current_line))
	                current_line = []

	            current_line.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})
	            previous_y = y

	        # Flush the last line.
	        if current_line:
	            lines.append(_merge_words(current_line))

	        # Order lines top to bottom by their averaged top coordinate.
	        lines.sort(key=lambda line: line[1])

	        # Crop each line, store it temporarily and recognise its text.
	        for index, (x, y, w, h, _words) in enumerate(tqdm(lines, desc="Processing page")):
	            row_image = image.crop((x, max(0, y - top_padding), x + w, y + h))
	            row_image_path = os.path.join(output_dir, f'{page_name}_row_{index}.png')
	            try:
	                row_image.save(row_image_path)
	                recognized_text.append(recognize_row(row_image_path))
	            finally:
	                # Never leave the temporary crop behind, even on failure.
	                if os.path.exists(row_image_path):
	                    os.remove(row_image_path)

	    # Join the recognised lines into the page text.
	    return ' '.join(recognized_text)