Video-to-Multilingual-OCR

Runtime error

App Files Community

stupidog04 committed on Apr 14, 2023

Commit

83a4675

•

1 Parent(s): 658f973

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -31

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ import os
 from pathlib import Path
 import cv2
 import pandas as pd
 #torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/BeautyIsTruthTruthisBeauty.JPG', 'BeautyIsTruthTruthisBeauty.JPG')
@@ -20,6 +21,7 @@ torch.hub.download_url_to_file('https://github.com/JaidedAI/EasyOCR/raw/master/e
 torch.hub.download_url_to_file('https://github.com/JaidedAI/EasyOCR/raw/master/examples/japanese.jpg', 'japanese.jpg')
 torch.hub.download_url_to_file('https://i.imgur.com/mwQFd7G.jpeg', 'Hindi.jpeg')
 def draw_boxes(image, bounds, color='yellow', width=2):
     draw = ImageDraw.Draw(image)
     for bound in bounds:
@@ -39,8 +41,29 @@ def box_size(box):
 def box_position(box):
     return (box[0][0][0] + box[0][2][0]) / 2, (box[0][0][1] + box[0][2][1]) / 2
-def inference(video, lang, time_step, full_scan=False):
     output = 'results.mp4'
     reader = easyocr.Reader(lang)
     bounds = []
@@ -50,22 +73,25 @@ def inference(video, lang, time_step, full_scan=False):
     frame_rate = vidcap.get(cv2.CAP_PROP_FPS)
     output_frames = []
     temporal_profiles = []
-    compress_mp4 = True
     # Get the positions of the largest boxes in the first frame
     bounds = reader.readtext(frame)
     im = PIL.Image.fromarray(frame)
     im_with_boxes = draw_boxes(im, bounds)
     largest_boxes = sorted(bounds, key=lambda x: box_size(x), reverse=True)
     positions = [box_position(b) for b in largest_boxes]
     temporal_profiles = [[] for _ in range(len(largest_boxes))]
-    # Match bboxes to position and store the text read by OCR
     # Match bboxes to position and store the text read by OCR
-    if full_scan:
-        # Match bboxes to position and store the text read by OCR
-        while success:
-            if count % (int(frame_rate * time_step)) == 0:
                 bounds = reader.readtext(frame)
                 for box in bounds:
                     bbox_pos = box_position(box)
@@ -74,15 +100,7 @@ def inference(video, lang, time_step, full_scan=False):
                         if distance < 50:
                             temporal_profiles[i].append((count / frame_rate, box[1]))
                             break
-                im = PIL.Image.fromarray(frame)
-                im_with_boxes = draw_boxes(im, bounds)
-                output_frames.append(np.array(im_with_boxes))
-            success, frame = vidcap.read()
-            count += 1
-    else:
-        # Match bboxes to position and store the text read by OCR
-        while success:
-            if count % (int(frame_rate * time_step)) == 0:
                 for i, box in enumerate(largest_boxes):
                     x1, y1 = box[0][0]
                     x2, y2 = box[0][2]
@@ -94,15 +112,27 @@ def inference(video, lang, time_step, full_scan=False):
                     y1 = max(0, int(y1 - ratio * box_height))
                     y2 = min(frame.shape[0], int(y2 + ratio * box_height))
                     cropped_frame = frame[y1:y2, x1:x2]
-                    text = reader.readtext(cropped_frame)
-                    if text:
-                        temporal_profiles[i].append((count / frame_rate, text[0][1]))
-                im = PIL.Image.fromarray(frame)
-                im_with_boxes = draw_boxes(im, bounds)
-                output_frames.append(np.array(im_with_boxes))
-            success, frame = vidcap.read()
-            count += 1
     # Default resolutions of the frame are obtained. The default resolutions are system dependent.
     # We convert the resolutions from float to integer.
     width = int(vidcap.get(cv2.CAP_PROP_FRAME_WIDTH))
@@ -150,11 +180,11 @@ def inference(video, lang, time_step, full_scan=False):
 title = '🖼️Video to Multilingual OCR👁️Gradio'
-description = 'Multilingual OCR which works conveniently on all devices in multiple languages. Adjust time-step for inference and the scan mode according to your requirement. For `Full Scan`, model scan the whole image if flag is ture, while scan only the box detected at the first video frame; this save computation cost; noting that the box is fixed in this case.'
 article = "<p style='text-align: center'></p>"
 examples = [
-['test.mp4',['en'],10,False]
 ]
 css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
@@ -176,12 +206,15 @@ gr.Interface(
         gr.inputs.Video(label='Input Video'),
         gr.inputs.CheckboxGroup(choices, type="value", default=['en'], label='Language'),
         gr.inputs.Number(label='Time Step (in seconds)', default=1.0),
-        gr.inputs.Dropdown(['True', 'False'], label='Full Scan', default='False')
     ],
     [
         gr.outputs.Video(label='Output Video'),
         gr.outputs.Image(label='Output Preview', type='numpy'),
-        gr.outputs.Dataframe(headers=['Box', 'Time (s)', 'Text'], type='pandas')
     ],
     title=title,
     description=description,

 from pathlib import Path
 import cv2
 import pandas as pd
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 #torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/BeautyIsTruthTruthisBeauty.JPG', 'BeautyIsTruthTruthisBeauty.JPG')
 torch.hub.download_url_to_file('https://github.com/JaidedAI/EasyOCR/raw/master/examples/japanese.jpg', 'japanese.jpg')
 torch.hub.download_url_to_file('https://i.imgur.com/mwQFd7G.jpeg', 'Hindi.jpeg')
 def draw_boxes(image, bounds, color='yellow', width=2):
     draw = ImageDraw.Draw(image)
     for bound in bounds:
 def box_position(box):
     return (box[0][0][0] + box[0][2][0]) / 2, (box[0][0][1] + box[0][2][1]) / 2
+def filter_temporal_profiles(temporal_profiles, period_index):
+    filtered_profiles = []
+    for profile in temporal_profiles:
+        filtered_profile = []
+        for t, text in profile:
+            # Remove all non-digit characters from text
+            filtered_text = ''.join(filter(str.isdigit, text))
+            # Insert period at the specified index
+            filtered_text = filtered_text[:period_index] + "." + filtered_text[period_index:]
+            try:
+                filtered_value = float(filtered_text)
+            except ValueError:
+                continue
+            filtered_profile.append((t, filtered_value))
+        filtered_profiles.append(filtered_profile)
+    return filtered_profiles
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-printed')
+model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-printed').to(device)
+def inference(video, lang, time_step, full_scan, number_filter, use_trocr, period_index):
     output = 'results.mp4'
     reader = easyocr.Reader(lang)
     bounds = []
     frame_rate = vidcap.get(cv2.CAP_PROP_FPS)
     output_frames = []
     temporal_profiles = []
+    compress_mp4 = False
     # Get the positions of the largest boxes in the first frame
     bounds = reader.readtext(frame)
+    for i in reversed(range(len(bounds))):
+        box = bounds[i]
+        # Remove box if it doesn't contain a number
+        if not any(char.isdigit() for char in box[1]):
+            bounds.pop(i)
     im = PIL.Image.fromarray(frame)
     im_with_boxes = draw_boxes(im, bounds)
     largest_boxes = sorted(bounds, key=lambda x: box_size(x), reverse=True)
     positions = [box_position(b) for b in largest_boxes]
     temporal_profiles = [[] for _ in range(len(largest_boxes))]
     # Match bboxes to position and store the text read by OCR
+    while success:
+        if count % (int(frame_rate * time_step)) == 0:
+            if full_scan:
                 bounds = reader.readtext(frame)
                 for box in bounds:
                     bbox_pos = box_position(box)
                         if distance < 50:
                             temporal_profiles[i].append((count / frame_rate, box[1]))
                             break
+            else:
                 for i, box in enumerate(largest_boxes):
                     x1, y1 = box[0][0]
                     x2, y2 = box[0][2]
                     y1 = max(0, int(y1 - ratio * box_height))
                     y2 = min(frame.shape[0], int(y2 + ratio * box_height))
                     cropped_frame = frame[y1:y2, x1:x2]
+                    if use_trocr:
+                        pixel_values = processor(images=cropped_frame, return_tensors="pt").pixel_values
+                        generated_ids = model.generate(pixel_values.to(device))
+                        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+                        temporal_profiles[i].append((count / frame_rate, generated_text))
+                    else:
+                        text = reader.readtext(cropped_frame)
+                        if text:
+                            temporal_profiles[i].append((count / frame_rate, text[0][1]))
+            im = PIL.Image.fromarray(frame)
+            im_with_boxes = draw_boxes(im, bounds)
+            output_frames.append(np.array(im_with_boxes))
+        success, frame = vidcap.read()
+        count += 1
+    if number_filter:
+        # Filter the temporal profiles by removing non-matching characters and converting to floats
+        temporal_profiles = filter_temporal_profiles(temporal_profiles, int(period_index))
     # Default resolutions of the frame are obtained. The default resolutions are system dependent.
     # We convert the resolutions from float to integer.
     width = int(vidcap.get(cv2.CAP_PROP_FRAME_WIDTH))
 title = '🖼️Video to Multilingual OCR👁️Gradio'
+description = 'Multilingual OCR which works conveniently on all devices in multiple languages. Adjust time-step for inference and the scan mode according to your requirement. For `Full Screen Scan`, model scan the whole image if flag is ture, while scan only the box detected at the first video frame; this accelerate the inference while detecting the fixed box.'
 article = "<p style='text-align: center'></p>"
 examples = [
+['test.mp4',['en'],10,]
 ]
 css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
         gr.inputs.Video(label='Input Video'),
         gr.inputs.CheckboxGroup(choices, type="value", default=['en'], label='Language'),
         gr.inputs.Number(label='Time Step (in seconds)', default=1.0),
+        gr.inputs.Checkbox(label='Full Screen Scan'),
+        gr.inputs.Checkbox(label='Use TrOCR large (this is only available when Full Screen Scan is disable)'),
+        gr.inputs.Checkbox(label='Number Filter (remove non-digit char and insert period)'),
+        gr.inputs.Textbox(label="period position",default=1)
     ],
     [
         gr.outputs.Video(label='Output Video'),
         gr.outputs.Image(label='Output Preview', type='numpy'),
+        gr.outputs.Dataframe(headers=['Box', 'Time (s)', 'Text'], type='pandas'),
     ],
     title=title,
     description=description,