# ketanmore's picture
# Upload folder using huggingface_hub
# 2720487 verified
from typing import List, Optional
import numpy as np
import pytesseract
from pytesseract import Output
from tqdm import tqdm
from surya.input.processing import slice_bboxes_from_image
from surya.settings import settings
import os
from concurrent.futures import ProcessPoolExecutor
from surya.detection import get_batch_size as get_det_batch_size
from surya.recognition import get_batch_size as get_rec_batch_size
from surya.languages import CODE_TO_LANGUAGE
def surya_lang_to_tesseract(code: str) -> Optional[str]:
    """Translate a surya language code into the matching tesseract code.

    The surya code is first resolved to a language name through
    ``CODE_TO_LANGUAGE``; that name is then looked up in
    ``TESS_LANGUAGE_TO_CODE``.

    Args:
        code: A key of ``CODE_TO_LANGUAGE``.

    Returns:
        The tesseract language code, or ``None`` when the language name has
        no entry in ``TESS_LANGUAGE_TO_CODE``.

    Raises:
        KeyError: If ``code`` is not a key of ``CODE_TO_LANGUAGE``.
    """
    language_name = CODE_TO_LANGUAGE[code]
    # .get() yields None for language names that have no tesseract entry.
    return TESS_LANGUAGE_TO_CODE.get(language_name)
def tesseract_ocr(img, bboxes, lang: str):
    """Run tesseract on every bounding-box crop of a single image.

    Args:
        img: Image handed to ``slice_bboxes_from_image``.
        bboxes: Bounding boxes to slice out of ``img``.
        lang: Language forwarded to tesseract through ``lang=``.

    Returns:
        A list with one ``pytesseract.image_to_string`` result per crop, in
        the order the crops were returned by ``slice_bboxes_from_image``.
    """
    crops = slice_bboxes_from_image(img, bboxes)
    # Point tesseract at the tessdata directory configured in settings.
    tess_config = f'--tessdata-dir "{settings.TESSDATA_PREFIX}"'
    return [
        pytesseract.image_to_string(crop, lang=lang, config=tess_config)
        for crop in crops
    ]
def tesseract_ocr_parallel(imgs, bboxes, langs: List[str], cpus=None):
    """OCR several images concurrently with a pool of tesseract workers.

    ``imgs``, ``bboxes`` and ``langs`` are consumed in lockstep; each
    (image, boxes, language) triple is passed to ``tesseract_ocr`` in a
    worker process.

    Args:
        imgs: Sized collection of images.
        bboxes: Per-image bounding boxes, aligned with ``imgs``.
        langs: Per-image tesseract language, aligned with ``imgs``.
        cpus: Upper bound on the number of cores to use. A falsy value falls
            back to ``os.cpu_count()``.

    Returns:
        A list with one ``tesseract_ocr`` result per image, in input order.
    """
    if not cpus:
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to a single core instead of raising TypeError in min().
        cpus = os.cpu_count() or 1
    core_budget = min(len(imgs), get_rec_batch_size(), cpus)

    # Tesseract uses up to 4 processes per instance.
    # Divide by 2 because tesseract doesn't seem to saturate all 4 cores with these small images.
    worker_count = max(core_budget // 2, 1)

    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        results = executor.map(tesseract_ocr, imgs, bboxes, langs)
        progress = tqdm(results, total=len(imgs), desc="Running tesseract OCR")
        # Drain the iterator while the pool is still alive.
        tess_text = list(progress)
    return tess_text
def tesseract_bboxes(img):
    """Return the boxes tesseract reports for an image.

    Args:
        img: Image that ``np.asarray`` can convert to a ``uint8`` array.

    Returns:
        A list of ``(x1, y1, x2, y2)`` tuples, one per entry returned by
        ``pytesseract.image_to_data``, where ``x2 = left + width`` and
        ``y2 = top + height``.
    """
    arr_img = np.asarray(img, dtype=np.uint8)
    ocr = pytesseract.image_to_data(arr_img, output_type=Output.DICT)
    # It is possible to merge by line here with line number, but it gives bad results.
    return [
        (left, top, left + width, top + height)
        for left, top, width, height in zip(
            ocr['left'], ocr['top'], ocr['width'], ocr['height']
        )
    ]
def tesseract_parallel(imgs):
    """Run ``tesseract_bboxes`` over several images in a process pool.

    Args:
        imgs: Sized collection of images.

    Returns:
        A list with one ``tesseract_bboxes`` result per image, in input order.
    """
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single core instead of raising TypeError in min().
    cpus = os.cpu_count() or 1
    core_budget = min(len(imgs), get_det_batch_size(), cpus)

    # Tesseract uses 4 threads per instance, so give each worker four cores.
    worker_count = max(core_budget // 4, 1)

    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        results = executor.map(tesseract_bboxes, imgs)
        progress = tqdm(results, total=len(imgs), desc="Running tesseract bbox detection")
        # Drain the iterator while the pool is still alive.
        tess_bboxes = list(progress)
    return tess_bboxes
# Tesseract language codes mapped to language names.
# surya_lang_to_tesseract() looks these names up (via the inverse table below)
# using the names stored in surya.languages.CODE_TO_LANGUAGE.
TESS_CODE_TO_LANGUAGE = {
    "afr": "Afrikaans",
    "amh": "Amharic",
    "ara": "Arabic",
    "asm": "Assamese",
    "aze": "Azerbaijani",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bod": "Tibetan",
    "bos": "Bosnian",
    "bre": "Breton",
    "bul": "Bulgarian",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "ces": "Czech",
    "chi_sim": "Chinese",
    "chr": "Cherokee",
    "cym": "Welsh",
    "dan": "Danish",
    "deu": "German",
    "dzo": "Dzongkha",
    "ell": "Greek",
    "eng": "English",
    "epo": "Esperanto",
    "est": "Estonian",
    "eus": "Basque",
    "fas": "Persian",
    "fin": "Finnish",
    "fra": "French",
    "fry": "Western Frisian",
    "guj": "Gujarati",
    "gla": "Scottish Gaelic",
    "gle": "Irish",
    "glg": "Galician",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hrv": "Croatian",
    "hun": "Hungarian",
    "hye": "Armenian",
    "iku": "Inuktitut",
    "ind": "Indonesian",
    "isl": "Icelandic",
    "ita": "Italian",
    "jav": "Javanese",
    "jpn": "Japanese",
    "kan": "Kannada",
    "kat": "Georgian",
    "kaz": "Kazakh",
    "khm": "Khmer",
    "kir": "Kyrgyz",
    "kor": "Korean",
    "lao": "Lao",
    "lat": "Latin",
    "lav": "Latvian",
    "lit": "Lithuanian",
    "mal": "Malayalam",
    "mar": "Marathi",
    "mkd": "Macedonian",
    "mlt": "Maltese",
    "mon": "Mongolian",
    "msa": "Malay",
    "mya": "Burmese",
    "nep": "Nepali",
    "nld": "Dutch",
    "nor": "Norwegian",
    "ori": "Oriya",
    "pan": "Punjabi",
    "pol": "Polish",
    "por": "Portuguese",
    "pus": "Pashto",
    "ron": "Romanian",
    "rus": "Russian",
    "san": "Sanskrit",
    "sin": "Sinhala",
    "slk": "Slovak",
    "slv": "Slovenian",
    "snd": "Sindhi",
    "spa": "Spanish",
    "sqi": "Albanian",
    "srp": "Serbian",
    "swa": "Swahili",
    "swe": "Swedish",
    "syr": "Syriac",
    "tam": "Tamil",
    "tel": "Telugu",
    "tgk": "Tajik",
    "tha": "Thai",
    "tir": "Tigrinya",
    "tur": "Turkish",
    "uig": "Uyghur",
    "ukr": "Ukrainian",
    "urd": "Urdu",
    "uzb": "Uzbek",
    "vie": "Vietnamese",
    "yid": "Yiddish",
}

# Reverse lookup: language name -> tesseract code.
# Every name above occurs exactly once, so the inversion drops no entries.
TESS_LANGUAGE_TO_CODE = {
    language: tess_code for tess_code, language in TESS_CODE_TO_LANGUAGE.items()
}