File size: 4,055 Bytes
2720487 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
from typing import List
from PIL import Image
from surya.detection import batch_text_detection
from surya.input.processing import slice_polys_from_image, slice_bboxes_from_image, convert_if_not_rgb
from surya.postprocessing.text import sort_text_lines
from surya.recognition import batch_recognition
from surya.schema import TextLine, OCRResult
def run_recognition(images: List[Image.Image], langs: List[List[str]], rec_model, rec_processor, bboxes: List[List[List[int]]] = None, polygons: List[List[List[List[int]]]] = None, batch_size=None) -> List[OCRResult]:
# Polygons need to be in corner format - [[x1, y1], [x2, y2], [x3, y3], [x4, y4]], bboxes in [x1, y1, x2, y2] format
assert bboxes is not None or polygons is not None
assert len(images) == len(langs), "You need to pass in one list of languages for each image"
images = convert_if_not_rgb(images)
slice_map = []
all_slices = []
all_langs = []
for idx, (image, lang) in enumerate(zip(images, langs)):
if polygons is not None:
slices = slice_polys_from_image(image, polygons[idx])
else:
slices = slice_bboxes_from_image(image, bboxes[idx])
slice_map.append(len(slices))
all_slices.extend(slices)
all_langs.extend([lang] * len(slices))
rec_predictions, _ = batch_recognition(all_slices, all_langs, rec_model, rec_processor, batch_size=batch_size)
predictions_by_image = []
slice_start = 0
for idx, (image, lang) in enumerate(zip(images, langs)):
slice_end = slice_start + slice_map[idx]
image_lines = rec_predictions[slice_start:slice_end]
slice_start = slice_end
text_lines = []
for i in range(len(image_lines)):
if polygons is not None:
poly = polygons[idx][i]
else:
bbox = bboxes[idx][i]
poly = [[bbox[0], bbox[1]], [bbox[2], bbox[1]], [bbox[2], bbox[3]], [bbox[0], bbox[3]]]
text_lines.append(TextLine(
text=image_lines[i],
polygon=poly
))
pred = OCRResult(
text_lines=text_lines,
languages=lang,
image_bbox=[0, 0, image.size[0], image.size[1]]
)
predictions_by_image.append(pred)
return predictions_by_image
def run_ocr(images: List[Image.Image], langs: List[List[str]], det_model, det_processor, rec_model, rec_processor, batch_size=None) -> List[OCRResult]:
images = convert_if_not_rgb(images)
det_predictions = batch_text_detection(images, det_model, det_processor)
all_slices = []
slice_map = []
all_langs = []
for idx, (det_pred, image, lang) in enumerate(zip(det_predictions, images, langs)):
polygons = [p.polygon for p in det_pred.bboxes]
slices = slice_polys_from_image(image, polygons)
slice_map.append(len(slices))
all_langs.extend([lang] * len(slices))
all_slices.extend(slices)
rec_predictions, confidence_scores = batch_recognition(all_slices, all_langs, rec_model, rec_processor, batch_size=batch_size)
predictions_by_image = []
slice_start = 0
for idx, (image, det_pred, lang) in enumerate(zip(images, det_predictions, langs)):
slice_end = slice_start + slice_map[idx]
image_lines = rec_predictions[slice_start:slice_end]
line_confidences = confidence_scores[slice_start:slice_end]
slice_start = slice_end
assert len(image_lines) == len(det_pred.bboxes)
lines = []
for text_line, confidence, bbox in zip(image_lines, line_confidences, det_pred.bboxes):
lines.append(TextLine(
text=text_line,
polygon=bbox.polygon,
bbox=bbox.bbox,
confidence=confidence
))
lines = sort_text_lines(lines)
predictions_by_image.append(OCRResult(
text_lines=lines,
languages=lang,
image_bbox=det_pred.image_bbox
))
return predictions_by_image
|