Video-to-Multilingual-OCR

Runtime error

App Files Files Community

stupidog04 committed on Apr 8, 2023

Commit

32f9f47

1 Parent(s): 5017f0e

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -6

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import easyocr
 import os
 from pathlib import Path
 import cv2
 #torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/BeautyIsTruthTruthisBeauty.JPG', 'BeautyIsTruthTruthisBeauty.JPG')
@@ -26,10 +27,21 @@ def draw_boxes(image, bounds, color='yellow', width=2):
         draw.line([*p0, *p1, *p2, *p3, *p0], fill=color, width=width)
     return image
 def inference(video, lang, time_step):
-    # output = f"{Path(video).stem}_detected{Path(src).suffix}"
     output = 'results.mp4'
     reader = easyocr.Reader(lang)
     bounds = []
     vidcap = cv2.VideoCapture(video)
@@ -37,12 +49,35 @@ def inference(video, lang, time_step):
     count = 0
     frame_rate = vidcap.get(cv2.CAP_PROP_FPS)
     output_frames = []
     while success:
         if count % (int(frame_rate * time_step)) == 0:
             bounds = reader.readtext(frame)
             im = PIL.Image.fromarray(frame)
-            draw_boxes(im, bounds)
-            output_frames.append(np.array(im))
         success, frame = vidcap.read()
         count += 1
@@ -69,7 +104,15 @@ def inference(video, lang, time_step):
         f"ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 1 -c:a aac -f mp4 /dev/null && ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 2 -c:a aac -movflags faststart {output}"
     )
     os.system(f"rm -rf {temp} ffmpeg2pass-0.log ffmpeg2pass-0.log.mbtree")
-    return output
 title = '🖼️Video to Multilingual OCR👁️Gradio'
@@ -104,7 +147,7 @@ gr.Interface(
     ],
     [
         gr.outputs.Video(label='Output Video'),
-        # gr.outputs.Dataframe(headers=['Text', 'Confidence'])
     ],
     title=title,
     description=description,

 import os
 from pathlib import Path
 import cv2
+import pandas as pd
 #torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/BeautyIsTruthTruthisBeauty.JPG', 'BeautyIsTruthTruthisBeauty.JPG')
         draw.line([*p0, *p1, *p2, *p3, *p0], fill=color, width=width)
     return image
+def box_size(box):
+    """Return the area of an OCR detection's quadrilateral.
+
+    ``box`` is one detection whose first element is the sequence of corner
+    points, each an ``(x, y)`` pair; the remaining elements are ignored here.
+
+    The area is computed with the shoelace formula so rotated or skewed
+    quadrilaterals are measured correctly. Multiplying the x- and y-extent
+    of the corner-0/corner-2 diagonal is only right for axis-aligned
+    rectangles and collapses to 0 for a box rotated by 45 degrees.
+    NOTE(review): assumes the corners are listed in order around the
+    outline (not in a self-crossing order) -- confirm against the OCR
+    library's output format.
+
+    Returns 0 when the detection does not have exactly four corners.
+    """
+    points = box[0]
+    if len(points) != 4:
+        return 0
+    # Shoelace formula: half the absolute sum of the cross products of
+    # consecutive vertices, wrapping from the last corner back to the first.
+    doubled_area = 0
+    for (x_a, y_a), (x_b, y_b) in zip(points, [*points[1:], points[0]]):
+        doubled_area += x_a * y_b - x_b * y_a
+    return abs(doubled_area) / 2
+def box_position(box):
+    """Return the ``(x, y)`` midpoint between corners 0 and 2 of a detection.
+
+    ``box[0]`` is the sequence of corner points; only the first and third
+    are read, so the result is the midpoint of that diagonal.
+    """
+    corners = box[0]
+    first, third = corners[0], corners[2]
+    mid_x = (first[0] + third[0]) / 2
+    mid_y = (first[1] + third[1]) / 2
+    return mid_x, mid_y
 def inference(video, lang, time_step):
     output = 'results.mp4'
     reader = easyocr.Reader(lang)
     bounds = []
     vidcap = cv2.VideoCapture(video)
     count = 0
     frame_rate = vidcap.get(cv2.CAP_PROP_FPS)
     output_frames = []
+    temporal_profiles = []
+    max_boxes = 10
+    # Get the positions of the largest boxes in the first frame
+    while success and not bounds:
+        if count == 0:
+            bounds = reader.readtext(frame)
+            im = PIL.Image.fromarray(frame)
+            im_with_boxes = draw_boxes(im, bounds)
+            largest_boxes = sorted(bounds, key=lambda x: box_size(x), reverse=True)[:max_boxes]
+            positions = [box_position(b) for b in largest_boxes]
+            temporal_profiles = [[] for _ in range(len(largest_boxes))]
+        success, frame = vidcap.read()
+        count += 1
+    # Match bboxes to position and store the text read by OCR
     while success:
         if count % (int(frame_rate * time_step)) == 0:
             bounds = reader.readtext(frame)
+            for box in bounds:
+                bbox_pos = box_position(box)
+                for i, position in enumerate(positions):
+                    distance = np.linalg.norm(np.array(bbox_pos) - np.array(position))
+                    if distance < 50:
+                        temporal_profiles[i].append((count / frame_rate, box[1]))
+                        break
             im = PIL.Image.fromarray(frame)
+            im_with_boxes = draw_boxes(im, bounds)
+            output_frames.append(np.array(im_with_boxes))
         success, frame = vidcap.read()
         count += 1
         f"ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 1 -c:a aac -f mp4 /dev/null && ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 2 -c:a aac -movflags faststart {output}"
     )
     os.system(f"rm -rf {temp} ffmpeg2pass-0.log ffmpeg2pass-0.log.mbtree")
+    # Format temporal profiles as a DataFrame
+    df = pd.DataFrame(columns=["Box", "Time (s)", "Text"])
+    for i, profile in enumerate(temporal_profiles):
+        for t, text in profile:
+            df = df.append({"Box": f"Box {i+1}", "Time (s)": t, "Text": text}, ignore_index=True)
+    return output, df
 title = '🖼️Video to Multilingual OCR👁️Gradio'
     ],
     [
         gr.outputs.Video(label='Output Video'),
+        gr.outputs.Dataframe(headers=['Box', 'Time (s)', 'Text'])
     ],
     title=title,
     description=description,