"""Tesseract helpers: text recognition and bounding-box detection run in worker processes."""
import os
from concurrent.futures import ProcessPoolExecutor
from typing import List, Optional

import numpy as np
import pytesseract
from pytesseract import Output
from tqdm import tqdm

from surya.detection import get_batch_size as get_det_batch_size
from surya.input.processing import slice_bboxes_from_image
from surya.languages import CODE_TO_LANGUAGE
from surya.recognition import get_batch_size as get_rec_batch_size
from surya.settings import settings


def surya_lang_to_tesseract(code: str) -> Optional[str]:
    """Translate a surya language code into the matching tesseract code.

    Args:
        code: A key of ``CODE_TO_LANGUAGE``.

    Returns:
        The tesseract language code, or ``None`` when the language name has
        no entry in ``TESS_LANGUAGE_TO_CODE``.

    Raises:
        KeyError: If ``code`` is not a key of ``CODE_TO_LANGUAGE``.
    """
    lang_str = CODE_TO_LANGUAGE[code]
    return TESS_LANGUAGE_TO_CODE.get(lang_str)


def tesseract_ocr(img, bboxes, lang: str) -> List[str]:
    """Run tesseract text recognition on each bbox slice of one image.

    Args:
        img: Image passed to ``slice_bboxes_from_image``.
        bboxes: Boxes passed to ``slice_bboxes_from_image``.
        lang: Tesseract language code forwarded to ``image_to_string``.

    Returns:
        One recognized string per slice, in slice order.
    """
    line_imgs = slice_bboxes_from_image(img, bboxes)
    config = f'--tessdata-dir "{settings.TESSDATA_PREFIX}"'
    return [
        pytesseract.image_to_string(line_img, lang=lang, config=config)
        for line_img in line_imgs
    ]


def _pool_size(n_items: int, batch_size: int, cpus: Optional[int], divisor: int) -> int:
    """Return the number of worker processes to start (always at least 1).

    The count is capped by the number of items, the batch size and the CPU
    count, then divided by ``divisor`` to account for each tesseract instance
    using several cores.
    """
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single core instead of failing inside min().
    available = cpus or os.cpu_count() or 1
    cores = min(n_items, batch_size, available)
    return max(cores // divisor, 1)


def tesseract_ocr_parallel(imgs, bboxes, langs: List[str], cpus: Optional[int] = None):
    """Run ``tesseract_ocr`` over several images using a process pool.

    Args:
        imgs: Images, one per call to ``tesseract_ocr``.
        bboxes: Per-image boxes, aligned with ``imgs``.
        langs: Per-image tesseract language codes, aligned with ``imgs``.
        cpus: CPU budget; a falsy value means "use ``os.cpu_count()``".

    Returns:
        A list with one ``tesseract_ocr`` result per image, in input order.
    """
    # Tesseract uses up to 4 threads per instance.
    # Divide by 2 because tesseract doesn't seem to saturate all 4 cores with these small images.
    tess_parallel = _pool_size(len(imgs), get_rec_batch_size(), cpus, 2)
    with ProcessPoolExecutor(max_workers=tess_parallel) as executor:
        tess_text = tqdm(
            executor.map(tesseract_ocr, imgs, bboxes, langs),
            total=len(imgs),
            desc="Running tesseract OCR",
        )
        # Materialize inside the `with` so results are collected before the pool shuts down.
        tess_text = list(tess_text)
    return tess_text


def tesseract_bboxes(img) -> List[tuple]:
    """Detect boxes in one image with ``pytesseract.image_to_data``.

    Args:
        img: Image convertible to a ``uint8`` numpy array.

    Returns:
        One ``(x1, y1, x2, y2)`` tuple per row reported by tesseract.
    """
    arr_img = np.asarray(img, dtype=np.uint8)
    ocr = pytesseract.image_to_data(arr_img, output_type=Output.DICT)

    # It is possible to merge by line here with line number, but it gives bad results.
    return [
        (x, y, x + w, y + h)
        for x, y, w, h in zip(ocr['left'], ocr['top'], ocr['width'], ocr['height'])
    ]


def tesseract_parallel(imgs):
    """Run ``tesseract_bboxes`` over several images using a process pool.

    Args:
        imgs: Images, one per call to ``tesseract_bboxes``.

    Returns:
        A list with one ``tesseract_bboxes`` result per image, in input order.
    """
    # Tesseract uses 4 threads per instance, so start one worker per 4 cores.
    tess_parallel = _pool_size(len(imgs), get_det_batch_size(), None, 4)
    with ProcessPoolExecutor(max_workers=tess_parallel) as executor:
        tess_bboxes = tqdm(
            executor.map(tesseract_bboxes, imgs),
            total=len(imgs),
            desc="Running tesseract bbox detection",
        )
        # Materialize inside the `with` so results are collected before the pool shuts down.
        tess_bboxes = list(tess_bboxes)
    return tess_bboxes


# Tesseract language code -> language name.
TESS_CODE_TO_LANGUAGE = {
    "afr": "Afrikaans",
    "amh": "Amharic",
    "ara": "Arabic",
    "asm": "Assamese",
    "aze": "Azerbaijani",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bod": "Tibetan",
    "bos": "Bosnian",
    "bre": "Breton",
    "bul": "Bulgarian",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "ces": "Czech",
    "chi_sim": "Chinese",
    "chr": "Cherokee",
    "cym": "Welsh",
    "dan": "Danish",
    "deu": "German",
    "dzo": "Dzongkha",
    "ell": "Greek",
    "eng": "English",
    "epo": "Esperanto",
    "est": "Estonian",
    "eus": "Basque",
    "fas": "Persian",
    "fin": "Finnish",
    "fra": "French",
    "fry": "Western Frisian",
    "guj": "Gujarati",
    "gla": "Scottish Gaelic",
    "gle": "Irish",
    "glg": "Galician",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hrv": "Croatian",
    "hun": "Hungarian",
    "hye": "Armenian",
    "iku": "Inuktitut",
    "ind": "Indonesian",
    "isl": "Icelandic",
    "ita": "Italian",
    "jav": "Javanese",
    "jpn": "Japanese",
    "kan": "Kannada",
    "kat": "Georgian",
    "kaz": "Kazakh",
    "khm": "Khmer",
    "kir": "Kyrgyz",
    "kor": "Korean",
    "lao": "Lao",
    "lat": "Latin",
    "lav": "Latvian",
    "lit": "Lithuanian",
    "mal": "Malayalam",
    "mar": "Marathi",
    "mkd": "Macedonian",
    "mlt": "Maltese",
    "mon": "Mongolian",
    "msa": "Malay",
    "mya": "Burmese",
    "nep": "Nepali",
    "nld": "Dutch",
    "nor": "Norwegian",
    "ori": "Oriya",
    "pan": "Punjabi",
    "pol": "Polish",
    "por": "Portuguese",
    "pus": "Pashto",
    "ron": "Romanian",
    "rus": "Russian",
    "san": "Sanskrit",
    "sin": "Sinhala",
    "slk": "Slovak",
    "slv": "Slovenian",
    "snd": "Sindhi",
    "spa": "Spanish",
    "sqi": "Albanian",
    "srp": "Serbian",
    "swa": "Swahili",
    "swe": "Swedish",
    "syr": "Syriac",
    "tam": "Tamil",
    "tel": "Telugu",
    "tgk": "Tajik",
    "tha": "Thai",
    "tir": "Tigrinya",
    "tur": "Turkish",
    "uig": "Uyghur",
    "ukr": "Ukrainian",
    "urd": "Urdu",
    "uzb": "Uzbek",
    "vie": "Vietnamese",
    "yid": "Yiddish",
}

# Reverse lookup: language name -> tesseract language code.
TESS_LANGUAGE_TO_CODE = {v: k for k, v in TESS_CODE_TO_LANGUAGE.items()}