truong-xuan-linh's picture
inmit:
03484ca
raw
history blame
3.67 kB
from paddleocr import PaddleOCR
from vietocr.tool.config import Cfg
from vietocr.tool.predictor import Predictor
import cv2
import requests
import unidecode
import numpy as np
from PIL import Image, ImageFont, ImageDraw
class OCRDetector:
    """Two-stage OCR pipeline: PaddleOCR finds text boxes, VietOCR re-reads them.

    PaddleOCR (English model) supplies both the box geometry and a first text
    guess. Every box is then cropped and recognised again with VietOCR; the
    VietOCR reading is preferred only when it contains characters that change
    under ``unidecode`` (i.e. diacritics), otherwise the PaddleOCR text is kept.
    """

    # Default TrueType font used by ``visualize_ocr``; overridable per instance.
    font_path = "./storage/Roboto-Black.ttf"

    def __init__(
        self,
        weights_path: str = "./storage/ocr_model.pth",
        device: str = "cpu",
        font_path=None,
    ) -> None:
        """Load both OCR models.

        Args:
            weights_path: Path to the VietOCR ``vgg_transformer`` weights.
            device: Device string handed to VietOCR (e.g. ``"cpu"``).
            font_path: Optional TrueType font for ``visualize_ocr``; when
                omitted the class-level default is used.
        """
        self.paddle_ocr = PaddleOCR(lang='en', use_angle_cls=False)

        self.config = Cfg.load_config_from_name('vgg_transformer')
        self.config['weights'] = weights_path
        self.config['cnn']['pretrained'] = False
        self.config['device'] = device
        self.config['predictor']['beamsearch'] = False
        self.viet_ocr = Predictor(self.config)

        if font_path is not None:
            self.font_path = font_path

    def find_box(self, image):
        """Detect text boxes (and PaddleOCR's own reading) in ``image``.

        Args:
            image: Image passed straight to ``PaddleOCR.ocr``.

        Returns:
            ``(boxes, texts)`` where ``boxes`` is a list of detected boxes and
            ``texts`` is a parallel list of ``{"text", "score"}`` dicts. Both
            lists are empty when nothing was detected.
        """
        result = self.paddle_ocr.ocr(image, cls=False)
        # With no detections PaddleOCR yields ``[None]`` (or an empty result);
        # iterating that would raise, so report "nothing found" explicitly.
        if not result or result[0] is None:
            return [], []

        detections = result[0]
        boxes = [det[0] for det in detections]
        texts = [{"text": det[1][0], "score": det[1][1]} for det in detections]
        return boxes, texts

    def vietnamese_text(self, boxes, image):
        """Recognise the text inside each box with VietOCR.

        Args:
            boxes: Boxes as four ``[x, y]`` corner points, indexed here as
                top-left, top-right, bottom-right, bottom-left.
            image: Array indexed as ``image[y, x]`` from which crops are taken.

        Returns:
            One ``{"text", "score"}`` dict per box, in the same order. A box
            whose crop is empty yields ``{"text": "", "score": 0.0}`` so the
            result stays index-aligned with ``boxes``.
        """
        texts = []
        for box in boxes:
            top_left, top_right, bottom_right, bottom_left = box[0], box[1], box[2], box[3]

            # Vertical padding of up to 10 px, reduced by the tilt of the top edge.
            pad = max(0, 10 - abs(top_left[1] - top_right[1]))
            y1 = int(max(0, min(top_left[1], top_right[1]) - pad))
            y2 = int(max(bottom_right[1], bottom_left[1]) + pad)
            x1 = int(max(0, min(top_left[0], bottom_left[0])))
            x2 = int(max(top_right[0], bottom_right[0]))

            crop = image[y1:y2, x1:x2]
            if crop.size == 0:
                # Degenerate box: nothing to recognise, keep a placeholder.
                texts.append({"text": "", "score": 0.0})
                continue

            crop_image = Image.fromarray(np.uint8(crop))
            text, score = self.viet_ocr.predict(crop_image, return_prob=True)
            texts.append({"text": text, "score": score})

        return texts

    def text_detector(self, image_path, is_local: bool = False, timeout: float = 10.0):
        """Run the full pipeline on an image given by path or URL.

        Args:
            image_path: Local file path when ``is_local`` is true, otherwise a
                URL fetched with ``requests``.
            is_local: Selects between opening a file and downloading.
            timeout: Seconds to wait for the download before giving up.

        Returns:
            ``(image, texts, boxes)`` where ``image`` is the RGB image as a
            numpy array. ``texts`` and ``boxes`` are ``None`` when no text was
            detected.

        Raises:
            requests.RequestException: If the download fails, times out or
                returns an HTTP error status.
        """
        if is_local:
            # Context manager closes the file handle once pixels are loaded.
            with Image.open(image_path) as opened:
                image = opened.convert("RGB")
        else:
            from io import BytesIO

            response = requests.get(image_path, timeout=timeout)
            # Fail loudly on 4xx/5xx instead of trying to decode an error page.
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert("RGB")

        image = np.array(image)

        boxes, paddle_texts = self.find_box(image)
        if not boxes:
            return image, None, None

        viet_texts = self.vietnamese_text(boxes, image)

        # Prefer the VietOCR reading only when it actually carries diacritics;
        # otherwise fall back to the PaddleOCR text. One entry per box, so the
        # merged list is never empty here.
        results_texts = [
            viet if viet["text"] != unidecode.unidecode(viet["text"]) else paddle
            for viet, paddle in zip(viet_texts, paddle_texts)
        ]
        return image, results_texts, boxes

    def visualize_ocr(self, image, texts, boxes):
        """Draw each box and its recognised text onto a copy of ``image``.

        Args:
            image: Numpy image to annotate; it is not modified.
            texts: ``{"text", ...}`` dicts parallel to ``boxes``; when empty or
                ``None`` the input image is returned unchanged.
            boxes: Boxes as four ``(x, y)`` corner points.

        Returns:
            The annotated image as a numpy array, or ``image`` itself when
            there is nothing to draw.
        """
        if not texts:
            return image

        canvas = image.copy()
        labels = []
        for box, text in zip(boxes, texts):
            (x1, y1), _, (x3, y3), _ = box
            height = y3 - y1

            canvas = cv2.rectangle(
                canvas, (int(x1), int(y1)), (int(x3), int(y3)), (0, 255, 0), 1
            )

            # Coordinates may be floats; the font size must be an integer.
            font_size = 15 * max(int(height // 1000), 1)
            # Keep the label on the canvas for boxes close to the top edge.
            position = (int(x1), max(0, int(y1 - height - 3)))
            labels.append((position, text["text"], font_size))

        # Convert to PIL once, draw every label, convert back once.
        pil_canvas = Image.fromarray(canvas)
        draw = ImageDraw.Draw(pil_canvas)
        fonts = {}  # font size -> loaded font, so each size is read from disk once
        for position, label, font_size in labels:
            if font_size not in fonts:
                fonts[font_size] = ImageFont.truetype(self.font_path, font_size)
            draw.text(position, label, font=fonts[font_size], fill=(51, 51, 255))

        return np.array(pil_canvas)