stupidog04 committed · Commit 32f9f47 · 1 parent: 5017f0e

Update app.py
app.py CHANGED
@@ -7,6 +7,7 @@ import easyocr
 import os
 from pathlib import Path
 import cv2
+import pandas as pd
 
 
 #torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/BeautyIsTruthTruthisBeauty.JPG', 'BeautyIsTruthTruthisBeauty.JPG')
@@ -26,10 +27,21 @@ def draw_boxes(image, bounds, color='yellow', width=2):
         draw.line([*p0, *p1, *p2, *p3, *p0], fill=color, width=width)
     return image
 
+def box_size(box):
+    points = box[0]
+    if len(points) == 4:
+        x1, y1 = points[0]
+        x2, y2 = points[2]
+        return abs(x1 - x2) * abs(y1 - y2)
+    else:
+        return 0
+
+def box_position(box):
+    return (box[0][0][0] + box[0][2][0]) / 2, (box[0][0][1] + box[0][2][1]) / 2
+
+
 def inference(video, lang, time_step):
-    # output = f"{Path(video).stem}_detected{Path(src).suffix}"
     output = 'results.mp4'
-
     reader = easyocr.Reader(lang)
     bounds = []
     vidcap = cv2.VideoCapture(video)
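For reference: easyocr.Reader.readtext returns a list of (bounding box, text, confidence) tuples, where the bounding box is four [x, y] corners ordered top-left, top-right, bottom-right, bottom-left. The new box_size and box_position helpers read the top-left and bottom-right corners, so they compute an axis-aligned area and a centre point. A minimal standalone sketch with a made-up detection:

# Hypothetical easyocr detection: (corner points, text, confidence).
sample = ([[10, 20], [110, 20], [110, 60], [10, 60]], "STOP", 0.98)

def box_size(box):
    points = box[0]
    if len(points) == 4:
        x1, y1 = points[0]   # top-left corner
        x2, y2 = points[2]   # bottom-right corner
        return abs(x1 - x2) * abs(y1 - y2)   # width * height
    return 0

def box_position(box):
    # Midpoint of the top-left/bottom-right diagonal, i.e. the box centre.
    return (box[0][0][0] + box[0][2][0]) / 2, (box[0][0][1] + box[0][2][1]) / 2

print(box_size(sample))      # 4000 (100 wide * 40 tall)
print(box_position(sample))  # (60.0, 40.0)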
@@ -37,12 +49,35 @@ def inference(video, lang, time_step):
     count = 0
     frame_rate = vidcap.get(cv2.CAP_PROP_FPS)
     output_frames = []
+    temporal_profiles = []
+    max_boxes = 10
+
+    # Get the positions of the largest boxes in the first frame
+    while success and not bounds:
+        if count == 0:
+            bounds = reader.readtext(frame)
+            im = PIL.Image.fromarray(frame)
+            im_with_boxes = draw_boxes(im, bounds)
+            largest_boxes = sorted(bounds, key=lambda x: box_size(x), reverse=True)[:max_boxes]
+            positions = [box_position(b) for b in largest_boxes]
+            temporal_profiles = [[] for _ in range(len(largest_boxes))]
+        success, frame = vidcap.read()
+        count += 1
+
+    # Match bboxes to position and store the text read by OCR
     while success:
         if count % (int(frame_rate * time_step)) == 0:
             bounds = reader.readtext(frame)
+            for box in bounds:
+                bbox_pos = box_position(box)
+                for i, position in enumerate(positions):
+                    distance = np.linalg.norm(np.array(bbox_pos) - np.array(position))
+                    if distance < 50:
+                        temporal_profiles[i].append((count / frame_rate, box[1]))
+                        break
             im = PIL.Image.fromarray(frame)
-            draw_boxes(im, bounds)
-            output_frames.append(np.array(im))
+            im_with_boxes = draw_boxes(im, bounds)
+            output_frames.append(np.array(im_with_boxes))
         success, frame = vidcap.read()
         count += 1
 
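The first added loop captures the centres of the largest boxes in the first frame that yields any text and keeps them as fixed anchors; the second loop then assigns each later detection to the first anchor whose centre lies within 50 pixels, appending a (timestamp, text) pair to that anchor's temporal profile. A standalone sketch of that nearest-anchor step (the function name, radius constant, and sample coordinates are illustrative):

import numpy as np

MATCH_RADIUS = 50  # pixels; the same fixed threshold the loop above uses

def match_to_anchor(bbox_pos, positions, radius=MATCH_RADIUS):
    # Return the index of the first anchor within `radius`, else None.
    for i, position in enumerate(positions):
        if np.linalg.norm(np.array(bbox_pos) - np.array(position)) < radius:
            return i
    return None

anchors = [(60.0, 40.0), (300.0, 200.0)]
print(match_to_anchor((65.0, 42.0), anchors))    # 0  (distance ~5.4)
print(match_to_anchor((500.0, 500.0), anchors))  # None (nothing in range)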
@@ -69,7 +104,15 @@ def inference(video, lang, time_step):
         f"ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 1 -c:a aac -f mp4 /dev/null && ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 2 -c:a aac -movflags faststart {output}"
     )
     os.system(f"rm -rf {temp} ffmpeg2pass-0.log ffmpeg2pass-0.log.mbtree")
-    return output
+
+    # Format temporal profiles as a DataFrame
+    df = pd.DataFrame(columns=["Box", "Time (s)", "Text"])
+    for i, profile in enumerate(temporal_profiles):
+        for t, text in profile:
+            df = df.append({"Box": f"Box {i+1}", "Time (s)": t, "Text": text}, ignore_index=True)
+
+    return output, df
+
 
 
 title = '🖼️Video to Multilingual OCR👁️Gradio'
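One portability note: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the row-by-row build above fails on a current pandas. A sketch of an equivalent that collects plain dicts and constructs the frame once (the sample profiles are made up):

import pandas as pd

# Sample data in the shape the loops above produce: one list of
# (time, text) pairs per tracked box.
temporal_profiles = [[(0.0, "STOP"), (1.5, "STOP")], [(0.0, "ONE WAY")]]

rows = []
for i, profile in enumerate(temporal_profiles):
    for t, text in profile:
        rows.append({"Box": f"Box {i+1}", "Time (s)": t, "Text": text})

# Building the frame once also avoids the per-row copy that append incurred.
df = pd.DataFrame(rows, columns=["Box", "Time (s)", "Text"])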
@@ -104,7 +147,7 @@ gr.Interface(
     ],
     [
         gr.outputs.Video(label='Output Video'),
-
+        gr.outputs.Dataframe(headers=['Box', 'Time (s)', 'Text'])
     ],
     title=title,
     description=description,
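gr.outputs is Gradio's legacy component namespace (deprecated in the 3.x line and removed in 4.x). On a current Gradio the same interface would be declared roughly as below; the input components are placeholders, since the app's real inputs sit outside the visible part of this diff:

import gradio as gr

gr.Interface(
    fn=inference,  # returns (video_path, dataframe), as in this commit
    inputs=[
        gr.Video(label='Input Video'),               # placeholder
        gr.CheckboxGroup(['en'], label='Language'),  # placeholder
        gr.Number(value=1.0, label='Time Step (s)'), # placeholder
    ],
    outputs=[
        gr.Video(label='Output Video'),
        gr.Dataframe(headers=['Box', 'Time (s)', 'Text']),
    ],
    title=title,
    description=description,
).launch()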