Video-to-Multilingual-OCR

Runtime error

App Files Community

stupidog04 committed on Apr 9, 2023

Commit

d3af935

•

1 Parent(s): 58c7220

add flag for full scan

Browse files

Files changed (1) hide show

app.py +31 -18

app.py CHANGED Viewed

@@ -40,7 +40,7 @@ def box_position(box):
     return (box[0][0][0] + box[0][2][0]) / 2, (box[0][0][1] + box[0][2][1]) / 2
-def inference(video, lang, time_step):
     output = 'results.mp4'
     reader = easyocr.Reader(lang)
     bounds = []
@@ -63,23 +63,35 @@ def inference(video, lang, time_step):
     # Match bboxes to position and store the text read by OCR
     while success:
         if count % (int(frame_rate * time_step)) == 0:
-            for i, box in enumerate(largest_boxes):
-                x1, y1 = box[0][0]
-                x2, y2 = box[0][2]
-                box_width = x2 - x1
-                box_height = y2 - y1
-                ratio = 0.2
-                x1 = max(0, int(x1 - ratio * box_width))
-                x2 = min(frame.shape[1], int(x2 + ratio * box_width))
-                y1 = max(0, int(y1 - ratio * box_height))
-                y2 = min(frame.shape[0], int(y2 + ratio * box_height))
-                cropped_frame = frame[y1:y2, x1:x2]
-                text = reader.readtext(cropped_frame)
-                if text:
-                    temporal_profiles[i].append((count / frame_rate, text[0][1]))
             im = PIL.Image.fromarray(frame)
             im_with_boxes = draw_boxes(im, bounds)
             output_frames.append(np.array(im_with_boxes))
         success, frame = vidcap.read()
         count += 1
@@ -130,11 +142,11 @@ def inference(video, lang, time_step):
 title = '🖼️Video to Multilingual OCR👁️Gradio'
-description = 'Multilingual OCR which works conveniently on all devices in multiple languages.'
 article = "<p style='text-align: center'></p>"
 examples = [
-['test.mp4',['en'],10]
 ]
 css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
@@ -155,7 +167,8 @@ gr.Interface(
     [
         gr.inputs.Video(label='Input Video'),
         gr.inputs.CheckboxGroup(choices, type="value", default=['en'], label='Language'),
-        gr.inputs.Number(label='Time Step (in seconds)', default=1.0)
     ],
     [
         gr.outputs.Video(label='Output Video'),

     return (box[0][0][0] + box[0][2][0]) / 2, (box[0][0][1] + box[0][2][1]) / 2
+def inference(video, lang, time_step, full_scan=False):
     output = 'results.mp4'
     reader = easyocr.Reader(lang)
     bounds = []
     # Match bboxes to position and store the text read by OCR
     while success:
         if count % (int(frame_rate * time_step)) == 0:
+            if full_scan:
+                bounds = reader.readtext(frame)
+                for box in bounds:
+                    bbox_pos = box_position(box)
+                    for i, position in enumerate(positions):
+                        distance = np.linalg.norm(np.array(bbox_pos) - np.array(position))
+                        if distance < 50:
+                            temporal_profiles[i].append((count / frame_rate, box[1]))
+                            break
+            else:
+                for i, box in enumerate(largest_boxes):
+                    x1, y1 = box[0][0]
+                    x2, y2 = box[0][2]
+                    box_width = x2 - x1
+                    box_height = y2 - y1
+                    ratio = 0.2
+                    x1 = max(0, int(x1 - ratio * box_width))
+                    x2 = min(frame.shape[1], int(x2 + ratio * box_width))
+                    y1 = max(0, int(y1 - ratio * box_height))
+                    y2 = min(frame.shape[0], int(y2 + ratio * box_height))
+                    cropped_frame = frame[y1:y2, x1:x2]
+                    text = reader.readtext(cropped_frame)
+                    if text:
+                        temporal_profiles[i].append((count / frame_rate, text[0][1]))
             im = PIL.Image.fromarray(frame)
             im_with_boxes = draw_boxes(im, bounds)
             output_frames.append(np.array(im_with_boxes))
         success, frame = vidcap.read()
         count += 1
 title = '🖼️Video to Multilingual OCR👁️Gradio'
+description = 'Multilingual OCR which works conveniently on all devices in multiple languages. Adjust the time step for inference and the scan mode according to your requirements. For `Full Scan`, the model scans the whole image if the flag is true; otherwise it scans only the boxes detected in the first video frame. This saves computation cost, but note that the boxes are fixed in that case.'
 article = "<p style='text-align: center'></p>"
 examples = [
+['test.mp4',['en'],10,False]
 ]
 css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
     [
         gr.inputs.Video(label='Input Video'),
         gr.inputs.CheckboxGroup(choices, type="value", default=['en'], label='Language'),
+        gr.inputs.Number(label='Time Step (in seconds)', default=1.0),
+        gr.inputs.Dropdown(['True', 'False'], label='Full Scan', default='False')
     ],
     [
         gr.outputs.Video(label='Output Video'),