Serovvans commited on
Commit
caa85f5
·
verified ·
1 Parent(s): 96b12f5

Upload 5 files

Browse files
README.md CHANGED
@@ -26,10 +26,22 @@ print(generated_text)
26
  ```
27
  ## Usage for recognizing the page
28
  1. Download the following files from the repository: recognize_page.py, requirements_page.txt
29
- 2. Use this command
30
- ```bash
31
- sudo apt install tesseract-ocr
32
- ```
 
 
 
 
 
 
 
 
 
 
 
 
33
  3. Install dependencies:
34
  ```bash
35
  pip install -r requirements_page.txt
@@ -39,12 +51,41 @@ pip install -r requirements_page.txt
39
  from recognize_page import recognize_page
40
 
41
  page_path = "cleaned_pages/C2V10/page11.png"
42
- text = recognize_page(page_path)
43
 
44
  print(f"Текст страницы:\n")
45
  print(text)
46
  ```
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  ## Metrics on test
49
  - CER (Char Error Rate) = 0.095
50
  - WER (Word Error Rate) = 0.298
 
26
  ```
27
  ## Usage for recognizing the page
28
  1. Download the following files from the repository: recognize_page.py, requirements_page.txt
29
+ 2. Install tesseract-ocr
30
+ - Linux
31
+ ```bash
32
+ sudo apt install tesseract-ocr
33
+ ```
34
+ - MacOS
35
+ ```bash
36
+ brew install tesseract
37
+ ```
38
+ - Windows
39
+ 1. Download tesseract exe from https://github.com/UB-Mannheim/tesseract/wiki.
40
+
41
+ 2. Install this exe in C:\Program Files (x86)\Tesseract-OCR
42
+
43
+ 3. Open a Command Prompt (or Anaconda Prompt) on Windows.
44
+
45
  3. Install dependencies:
46
  ```bash
47
  pip install -r requirements_page.txt
 
51
  from recognize_page import recognize_page
52
 
53
  page_path = "cleaned_pages/C2V10/page11.png"
54
+ text = recognize_page(page_path, text_output_path="output/file.txt")
55
 
56
  print(f"Текст страницы:\n")
57
  print(text)
58
  ```
59
 
60
+ ## Usage for recognizing the book from pdf-file
61
+ 1. Download the following files from the repository: recognize_book.py, recognize_page.py, requirements_book.txt
62
+ 2. Install tesseract-ocr
63
+ - Linux
64
+ ```bash
65
+ sudo apt install tesseract-ocr
66
+ ```
67
+ - MacOS
68
+ ```bash
69
+ brew install tesseract
70
+ ```
71
+ - Windows
72
+ 1. Download tesseract exe from https://github.com/UB-Mannheim/tesseract/wiki.
73
+
74
+ 2. Install this exe in C:\Program Files (x86)\Tesseract-OCR
75
+
76
+ 3. Open a Command Prompt (or Anaconda Prompt) on Windows.
77
+
78
+ 3. Install dependencies:
79
+ ```bash
80
+ pip install -r requirements_book.txt
81
+ ```
82
+ 4. Use this code
83
+ ```python
84
+ from recognize_book import recognize_book
85
+
86
+ recognize_book(book_path="path/to/your/book.pdf", text_output_path="book_text.txt")
87
+ ```
88
+
89
  ## Metrics on test
90
  - CER (Char Error Rate) = 0.095
91
  - WER (Word Error Rate) = 0.298
recognize_book.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import tempfile
import os
from pdf2image import convert_from_path
from recognize_page import recognize_page
from tqdm import tqdm


def recognize_book(book_path: str, text_output_path: str) -> str:
    """Recognize the text of a whole PDF book, page by page.

    Parameters:
        book_path: Path to the input PDF file.
        text_output_path: Path of the UTF-8 text file to write the result to.

    Returns:
        The full recognized text, with "=== Page N ===" separators (1-based).
    """
    # Rasterize every PDF page to a PIL image.
    pages = convert_from_path(book_path)

    page_texts = []
    for page in tqdm(pages, desc="Process pdf"):
        # Save the page to a temporary PNG so recognize_page can read it.
        # delete=False is required because the file is reopened by another
        # reader (fails on Windows while the handle is open); the `with`
        # closes the handle before recognize_page runs, and we remove the
        # file ourselves below.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
            page.save(temp_file.name, 'PNG')
        try:
            page_texts.append(recognize_page(temp_file.name))
        finally:
            # Always clean up the temp file, even if recognition fails.
            os.remove(temp_file.name)

    # Join pages with numbered separators (1-based page numbers).
    book_text = "".join(
        f"\n\n=== Page {i + 1} ===\n\n{text}\n"
        for i, text in enumerate(page_texts)
    )

    with open(text_output_path, "w", encoding="utf-8") as text_file:
        text_file.write(book_text)

    return book_text


if __name__ == "__main__":
    recognize_book(book_path="bv000030992_0001.pdf", text_output_path="book_text.txt")
recognize_page.py CHANGED
@@ -1,99 +1,112 @@
1
- import os
2
- from PIL import Image
3
  import pytesseract
4
- from tqdm import tqdm
5
- from pytesseract import Output
 
6
  from transformers import VisionEncoderDecoderModel, TrOCRProcessor
7
 
8
- def recognize_row(row_file):
9
- hf_model = VisionEncoderDecoderModel.from_pretrained("Serovvans/trocr-prereform-orthography")
10
- image = Image.open(row_file)
11
- processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  pixel_values = processor(images=image, return_tensors="pt").pixel_values
13
  generated_ids = hf_model.generate(pixel_values)
14
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
15
  return generated_text
16
 
17
- def recognize_page(image_path, output_dir="./", page_name=None):
18
- """
19
- Разбивает изображение страницы на строки, сортирует строки, распознаёт их и соединяет текст.
20
-
21
- Параметры:
22
- image_path (str): Путь к изображению страницы.
23
- output_dir (str): Путь к папке для сохранения строк.
24
- page_name (str): Имя страницы для сохранения строк (по умолчанию None).
25
 
26
- Возвращает:
27
- str: Итоговый распознанный текст страницы.
28
- """
29
- os.makedirs(output_dir, exist_ok=True)
30
- image = Image.open(image_path)
31
- data = pytesseract.image_to_data(image, config='--psm 3', output_type=Output.DICT)
32
-
33
- lines = []
34
- current_line = []
35
- previous_y = None
36
- y_threshold = 15 # Порог для объединения слов в одну строку
37
-
38
- # Обход всех блоков текста
39
- n_boxes = len(data['level'])
40
- for i in range(n_boxes):
41
- if data['level'][i] == 5 or data['level'][i] == 4: # Уровень строки или слова
42
  x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
43
- text = data['text'][i].strip()
44
-
45
- if not text:
46
- continue
47
-
48
- # Проверяем, начинается ли новая строка
49
- if previous_y is None or abs(y - previous_y) > y_threshold:
50
- if current_line:
51
- # Объединяем слова в строку и добавляем в список строк
52
- min_x = min([word['x'] for word in current_line])
53
- max_x = max([word['x'] + word['w'] for word in current_line])
54
- avg_y = sum([word['y'] for word in current_line]) / len(current_line)
55
- max_y = max([word['y'] + word['h'] for word in current_line])
56
- lines.append((min_x, avg_y, max_x - min_x, max_y - avg_y, current_line))
57
- current_line = []
58
-
59
- # Добавляем текущее слово в текущую строку
60
- current_line.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})
61
- previous_y = y
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  # Добавляем последнюю строку
64
  if current_line:
65
- min_x = min([word['x'] for word in current_line])
66
- max_x = max([word['x'] + word['w'] for word in current_line])
67
- avg_y = sum([word['y'] for word in current_line]) / len(current_line)
68
- max_y = max([word['y'] + word['h'] for word in current_line])
69
- lines.append((min_x, avg_y, max_x - min_x, max_y - avg_y, current_line))
70
-
71
- # Сортировка строк по координате Y
72
- lines.sort(key=lambda line: line[1])
73
-
74
- # Сохранение строк как изображений и распознавание текста
75
- recognized_text = []
76
- i = 0
77
- for line in tqdm(lines, desc="Processing page"):
78
- x, y, w, h, words = line
79
- min_x = x
80
- max_x = x + w
81
- min_y = max(0, y - 10)
82
- max_y = y + h
83
-
84
- # Обрезаем изображение по координатам строки
85
- row_image = image.crop((min_x, min_y, max_x, max_y))
86
- row_image_path = os.path.join(output_dir, f'{page_name}_row_{i}.png')
87
- row_image.save(row_image_path)
88
-
89
- # Распознаём текст строки
90
- row_text = recognize_row(row_image_path)
91
-
92
- os.remove(row_image_path)
93
-
94
- recognized_text.append(row_text)
95
- i += 1
96
-
97
- # Соединяем распознанный текст строк
98
- full_text = ' '.join(recognized_text)
99
- return full_text
 
 
 
1
  import pytesseract
2
+ import json
3
+ import numpy as np
4
+ from PIL import Image, ImageEnhance
5
  from transformers import VisionEncoderDecoderModel, TrOCRProcessor
6
 
7
+
8
+ hf_model = VisionEncoderDecoderModel.from_pretrained("Serovvans/trocr-prereform-orthography")
9
+ processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
10
+
11
def remove_bleed_through(image_path,
                         brightness_factor=1.5,
                         contrast_alpha=1.7,
                         contrast_beta=-130):
    """Suppress ink bleed-through from the reverse side of a scanned page.

    Applies a linear contrast stretch (contrast_alpha * pixel + contrast_beta)
    that pushes faint mirrored text toward the clipping limits, then raises
    the overall brightness.

    Parameters:
        image_path: Path to the page image.
        brightness_factor: Multiplier for the final brightness boost.
        contrast_alpha: Gain of the linear contrast stretch (default 1.7).
        contrast_beta: Offset of the linear contrast stretch (default -130).

    Returns:
        PIL.Image.Image: The cleaned RGB image.
    """
    # Load the image as RGB and work on a NumPy array.
    pil_image = Image.open(image_path).convert('RGB')
    img = np.array(pil_image)

    # Linear stretch; clip back into the valid 8-bit range.
    result = contrast_alpha * img + contrast_beta
    result = np.clip(result, 0, 255).astype(np.uint8)

    # Back to a PIL image.
    pil_result = Image.fromarray(result)

    # Brighten the stretched image to wash out residual bleed-through.
    enhancer_brightness = ImageEnhance.Brightness(pil_result)
    bright_image = enhancer_brightness.enhance(brightness_factor)

    return bright_image
31
+
32
# Read one cropped fragment with the TrOCR model and return its text.
def recognize_row(image):
    rgb_image = image.convert("RGB")
    pixel_values = processor(images=rgb_image, return_tensors="pt").pixel_values
    token_ids = hf_model.generate(pixel_values)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]
39
 
 
 
 
 
 
 
 
 
40
 
41
# Main page-recognition routine: tesseract locates word fragments,
# TrOCR reads each fragment, and the fragments are re-assembled in
# reading order.
def recognize_page(image_path, text_output_path=False):
    """Recognize the text of one page image.

    Parameters:
        image_path: Path to the page image.
        text_output_path: Optional output path; when truthy, the recognized
            text is also written to this file (UTF-8).

    Returns:
        str: The recognized page text, one detected line per output line.
    """
    # Clean the scan before layout analysis.
    image = remove_bleed_through(image_path)

    # Tesseract provides the word boxes and layout; TrOCR does the reading.
    data = pytesseract.image_to_data(image, config="--psm 6", output_type=pytesseract.Output.DICT, lang='ukr+eng')
    # NOTE(review): debug dump of the raw tesseract layout, written to the
    # current working directory on every call — confirm it is still needed.
    with open("rec_data.json", "w", encoding="utf-8") as json_file:
        json.dump(data, json_file)

    # Padding around each box, proportional to the page height.
    pad = int(0.0042 * image.size[1])

    # Collect fragments together with their layout coordinates so the
    # reading order can be restored afterwards.
    fragments = []
    for i in range(len(data['text'])):
        # Skip entries without a confidence score and boxes smaller than the
        # padding square (noise specks).
        if data['conf'][i] > -1 and data['width'][i] * data['height'][i] > pad ** 2:
            x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
            fragment_image = image.crop((x - pad, y - pad, x + w + pad, y + h + pad))
            text = recognize_row(fragment_image).strip()

            tess_text = data['text'][i].strip()
            # Trust tesseract's detection of an em dash over TrOCR.
            if tess_text == "\u2014":
                text = "\u2014"

            # If tesseract confidently read an integer, prefer it over TrOCR.
            try:
                int(tess_text)
            except ValueError:
                pass
            else:
                if data['conf'][i] > 85:
                    text = tess_text

            fragments.append({
                'block_num': data['block_num'][i],
                'par_num': data['par_num'][i],
                'line_num': data['line_num'][i],
                'word_num': data['word_num'][i],
                'text': text,
                'image': fragment_image
            })

    # Restore reading order: block, then paragraph, line and word.
    fragments = sorted(fragments, key=lambda x: (x['block_num'], x['par_num'], x['line_num'], x['word_num']))

    # Group consecutive fragments that share (block, paragraph, line)
    # into output lines.
    result_lines = []
    current_key = (0, 0, 0)
    current_line = []

    for fragment in fragments:
        key = (fragment['block_num'], fragment['par_num'], fragment['line_num'])
        if key != current_key:
            # Close the current line and start the next one.
            result_lines.append(" ".join(current_line))
            current_line = []
            current_key = key

        current_line.append(fragment['text'])

    # Flush the final line.
    if current_line:
        result_lines.append(" ".join(current_line))

    # Assemble the page text.
    final_text = "\n".join(result_lines)

    if text_output_path:
        with open(text_output_path, "w", encoding="utf-8") as text_file:
            text_file.write(final_text)

    return final_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements_book.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ numpy==1.23.0
2
+ pdf2image==1.17.0
3
+ pytesseract==0.3.10
4
+ Pillow==10.0.0
5
+ transformers==4.33.2
6
+ torch==2.0.1
7
+ tqdm
requirements_page.txt CHANGED
@@ -1,4 +1,6 @@
1
  pytesseract==0.3.10
2
  Pillow==10.0.0
3
  transformers==4.33.2
4
- torch==2.0.1
 
 
 
1
  pytesseract==0.3.10
2
  Pillow==10.0.0
3
  transformers==4.33.2
4
+ torch==2.0.1
5
+ numpy==1.23.0
6
+ tqdm