from typing import List, Optional |
import numpy as np |
import pytesseract |
from pytesseract import Output |
from tqdm import tqdm |
from surya.input.processing import slice_bboxes_from_image |
from surya.settings import settings |
import os |
from concurrent.futures import ProcessPoolExecutor |
from surya.detection import get_batch_size as get_det_batch_size |
from surya.recognition import get_batch_size as get_rec_batch_size |
from surya.languages import CODE_TO_LANGUAGE |
def surya_lang_to_tesseract(code: str) -> Optional[str]:
    """Translate a surya language code into the matching tesseract code.

    The code is first resolved to a language name through
    ``CODE_TO_LANGUAGE``; that name is then looked up in
    ``TESS_LANGUAGE_TO_CODE``.

    Args:
        code: A key of ``CODE_TO_LANGUAGE``.

    Returns:
        The tesseract code registered for the language name, or ``None``
        when ``TESS_LANGUAGE_TO_CODE`` has no entry for it.

    Raises:
        KeyError: If ``code`` is not a key of ``CODE_TO_LANGUAGE``.
    """
    language_name = CODE_TO_LANGUAGE[code]
    # A missing name is an expected outcome, reported as None rather than raised.
    return TESS_LANGUAGE_TO_CODE.get(language_name)
def tesseract_ocr(img, bboxes, lang: str):
    """Run tesseract text recognition on each bounding-box crop of an image.

    Args:
        img: Image handed to ``slice_bboxes_from_image`` together with
            ``bboxes``.
        bboxes: Bounding boxes to cut out of ``img``.
        lang: Language string forwarded to ``pytesseract.image_to_string``.

    Returns:
        A list with one recognised string per crop, in the order the crops
        were produced by ``slice_bboxes_from_image``.
    """
    crops = slice_bboxes_from_image(img, bboxes)
    # Point tesseract at the data directory configured in the project settings.
    tess_config = f'--tessdata-dir "{settings.TESSDATA_PREFIX}"'
    return [
        pytesseract.image_to_string(crop, lang=lang, config=tess_config)
        for crop in crops
    ]
def tesseract_ocr_parallel(imgs, bboxes, langs: List[str], cpus=None):
    """Run ``tesseract_ocr`` over several images in a process pool.

    Args:
        imgs: Images to process; ``len(imgs)`` caps the worker count and
            sets the progress-bar total.
        bboxes: Per-image bounding boxes, consumed in lockstep with ``imgs``.
        langs: Per-image language strings, consumed in lockstep with ``imgs``.
        cpus: Upper bound on the cores to consider. A falsy value (``None``
            or ``0``) means "use ``os.cpu_count()``".

    Returns:
        A list holding the result of ``tesseract_ocr`` for each image, in
        input order.
    """
    tess_parallel_cores = min(len(imgs), get_rec_batch_size())
    if not cpus:
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to 1 so the min() below never compares an int with None.
        cpus = os.cpu_count() or 1
    tess_parallel_cores = min(tess_parallel_cores, cpus)

    # Use half of the candidate count, but never fewer than one worker.
    # NOTE(review): presumably halved because each tesseract process is
    # itself multi-threaded -- confirm.
    tess_parallel = max(tess_parallel_cores // 2, 1)

    with ProcessPoolExecutor(max_workers=tess_parallel) as executor:
        tess_text = tqdm(
            executor.map(tesseract_ocr, imgs, bboxes, langs),
            total=len(imgs),
            desc="Running tesseract OCR",
        )
        # Drain the iterator inside the pool context so the progress bar
        # advances as results arrive.
        tess_text = list(tess_text)
    return tess_text
def tesseract_bboxes(img):
    """Collect the boxes tesseract reports for an image.

    The image is converted to a ``uint8`` array and passed to
    ``pytesseract.image_to_data``; every row of the returned table is
    turned into a corner-style box.

    Args:
        img: Image (anything ``np.asarray`` accepts).

    Returns:
        A list of ``(left, top, left + width, top + height)`` tuples, one
        per row reported by tesseract.
    """
    pixels = np.asarray(img, dtype=np.uint8)
    table = pytesseract.image_to_data(pixels, output_type=Output.DICT)
    # Walk the four geometry columns in lockstep, one row per reported box.
    geometry = zip(table['left'], table['top'], table['width'], table['height'])
    return [
        (left, top, left + width, top + height)
        for left, top, width, height in geometry
    ]
def tesseract_parallel(imgs):
    """Run ``tesseract_bboxes`` over several images in a process pool.

    Args:
        imgs: Images to process; ``len(imgs)`` caps the worker count and
            sets the progress-bar total.

    Returns:
        A list holding the result of ``tesseract_bboxes`` for each image,
        in input order.
    """
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to 1 so min() never compares an int with None.
    cpus = os.cpu_count() or 1
    tess_parallel_cores = min(len(imgs), get_det_batch_size(), cpus)

    # Use a quarter of the candidate count, but never fewer than one worker.
    # NOTE(review): presumably quartered because each tesseract process is
    # itself multi-threaded -- confirm.
    tess_parallel = max(tess_parallel_cores // 4, 1)

    with ProcessPoolExecutor(max_workers=tess_parallel) as executor:
        tess_bboxes = tqdm(
            executor.map(tesseract_bboxes, imgs),
            total=len(imgs),
            desc="Running tesseract bbox detection",
        )
        # Drain the iterator inside the pool context so the progress bar
        # advances as results arrive.
        tess_bboxes = list(tess_bboxes)
    return tess_bboxes
# Tesseract language codes mapped to language names. The names are the join
# key used to translate between this table and other language tables.
TESS_CODE_TO_LANGUAGE = {
    "afr": "Afrikaans",
    "amh": "Amharic",
    "ara": "Arabic",
    "asm": "Assamese",
    "aze": "Azerbaijani",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bod": "Tibetan",
    "bos": "Bosnian",
    "bre": "Breton",
    "bul": "Bulgarian",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "ces": "Czech",
    "chi_sim": "Chinese",
    "chr": "Cherokee",
    "cym": "Welsh",
    "dan": "Danish",
    "deu": "German",
    "dzo": "Dzongkha",
    "ell": "Greek",
    "eng": "English",
    "epo": "Esperanto",
    "est": "Estonian",
    "eus": "Basque",
    "fas": "Persian",
    "fin": "Finnish",
    "fra": "French",
    "fry": "Western Frisian",
    "guj": "Gujarati",
    "gla": "Scottish Gaelic",
    "gle": "Irish",
    "glg": "Galician",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hrv": "Croatian",
    "hun": "Hungarian",
    "hye": "Armenian",
    "iku": "Inuktitut",
    "ind": "Indonesian",
    "isl": "Icelandic",
    "ita": "Italian",
    "jav": "Javanese",
    "jpn": "Japanese",
    "kan": "Kannada",
    "kat": "Georgian",
    "kaz": "Kazakh",
    "khm": "Khmer",
    "kir": "Kyrgyz",
    "kor": "Korean",
    "lao": "Lao",
    "lat": "Latin",
    "lav": "Latvian",
    "lit": "Lithuanian",
    "mal": "Malayalam",
    "mar": "Marathi",
    "mkd": "Macedonian",
    "mlt": "Maltese",
    "mon": "Mongolian",
    "msa": "Malay",
    "mya": "Burmese",
    "nep": "Nepali",
    "nld": "Dutch",
    "nor": "Norwegian",
    "ori": "Oriya",
    "pan": "Punjabi",
    "pol": "Polish",
    "por": "Portuguese",
    "pus": "Pashto",
    "ron": "Romanian",
    "rus": "Russian",
    "san": "Sanskrit",
    "sin": "Sinhala",
    "slk": "Slovak",
    "slv": "Slovenian",
    "snd": "Sindhi",
    "spa": "Spanish",
    "sqi": "Albanian",
    "srp": "Serbian",
    "swa": "Swahili",
    "swe": "Swedish",
    "syr": "Syriac",
    "tam": "Tamil",
    "tel": "Telugu",
    "tgk": "Tajik",
    "tha": "Thai",
    "tir": "Tigrinya",
    "tur": "Turkish",
    "uig": "Uyghur",
    "ukr": "Ukrainian",
    "urd": "Urdu",
    "uzb": "Uzbek",
    "vie": "Vietnamese",
    "yid": "Yiddish",
}

# Inverse lookup: language name -> tesseract code. Every name above appears
# exactly once, so no entry is lost when the mapping is flipped.
TESS_LANGUAGE_TO_CODE = {v: k for k, v in TESS_CODE_TO_LANGUAGE.items()}