stupidog04 committed on
Commit
d3af935
1 Parent(s): 58c7220

add flag for full scan

Browse files
Files changed (1) hide show
  1. app.py +31 -18
app.py CHANGED
@@ -40,7 +40,7 @@ def box_position(box):
40
  return (box[0][0][0] + box[0][2][0]) / 2, (box[0][0][1] + box[0][2][1]) / 2
41
 
42
 
43
- def inference(video, lang, time_step):
44
  output = 'results.mp4'
45
  reader = easyocr.Reader(lang)
46
  bounds = []
@@ -63,23 +63,35 @@ def inference(video, lang, time_step):
63
  # Match bboxes to position and store the text read by OCR
64
  while success:
65
  if count % (int(frame_rate * time_step)) == 0:
66
- for i, box in enumerate(largest_boxes):
67
- x1, y1 = box[0][0]
68
- x2, y2 = box[0][2]
69
- box_width = x2 - x1
70
- box_height = y2 - y1
71
- ratio = 0.2
72
- x1 = max(0, int(x1 - ratio * box_width))
73
- x2 = min(frame.shape[1], int(x2 + ratio * box_width))
74
- y1 = max(0, int(y1 - ratio * box_height))
75
- y2 = min(frame.shape[0], int(y2 + ratio * box_height))
76
- cropped_frame = frame[y1:y2, x1:x2]
77
- text = reader.readtext(cropped_frame)
78
- if text:
79
- temporal_profiles[i].append((count / frame_rate, text[0][1]))
 
 
 
 
 
 
 
 
 
 
 
80
  im = PIL.Image.fromarray(frame)
81
  im_with_boxes = draw_boxes(im, bounds)
82
  output_frames.append(np.array(im_with_boxes))
 
83
  success, frame = vidcap.read()
84
  count += 1
85
 
@@ -130,11 +142,11 @@ def inference(video, lang, time_step):
130
 
131
 
132
  title = '🖼️Video to Multilingual OCR👁️Gradio'
133
- description = 'Multilingual OCR which works conveniently on all devices in multiple languages.'
134
  article = "<p style='text-align: center'></p>"
135
 
136
  examples = [
137
- ['test.mp4',['en'],10]
138
  ]
139
 
140
  css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
@@ -155,7 +167,8 @@ gr.Interface(
155
  [
156
  gr.inputs.Video(label='Input Video'),
157
  gr.inputs.CheckboxGroup(choices, type="value", default=['en'], label='Language'),
158
- gr.inputs.Number(label='Time Step (in seconds)', default=1.0)
 
159
  ],
160
  [
161
  gr.outputs.Video(label='Output Video'),
 
40
  return (box[0][0][0] + box[0][2][0]) / 2, (box[0][0][1] + box[0][2][1]) / 2
41
 
42
 
43
+ def inference(video, lang, time_step, full_scan=False):
44
  output = 'results.mp4'
45
  reader = easyocr.Reader(lang)
46
  bounds = []
 
63
  # Match bboxes to position and store the text read by OCR
64
  while success:
65
  if count % (int(frame_rate * time_step)) == 0:
66
+ if full_scan:
67
+ bounds = reader.readtext(frame)
68
+ for box in bounds:
69
+ bbox_pos = box_position(box)
70
+ for i, position in enumerate(positions):
71
+ distance = np.linalg.norm(np.array(bbox_pos) - np.array(position))
72
+ if distance < 50:
73
+ temporal_profiles[i].append((count / frame_rate, box[1]))
74
+ break
75
+ else:
76
+ for i, box in enumerate(largest_boxes):
77
+ x1, y1 = box[0][0]
78
+ x2, y2 = box[0][2]
79
+ box_width = x2 - x1
80
+ box_height = y2 - y1
81
+ ratio = 0.2
82
+ x1 = max(0, int(x1 - ratio * box_width))
83
+ x2 = min(frame.shape[1], int(x2 + ratio * box_width))
84
+ y1 = max(0, int(y1 - ratio * box_height))
85
+ y2 = min(frame.shape[0], int(y2 + ratio * box_height))
86
+ cropped_frame = frame[y1:y2, x1:x2]
87
+ text = reader.readtext(cropped_frame)
88
+ if text:
89
+ temporal_profiles[i].append((count / frame_rate, text[0][1]))
90
+
91
  im = PIL.Image.fromarray(frame)
92
  im_with_boxes = draw_boxes(im, bounds)
93
  output_frames.append(np.array(im_with_boxes))
94
+
95
  success, frame = vidcap.read()
96
  count += 1
97
 
 
142
 
143
 
144
  title = '🖼️Video to Multilingual OCR👁️Gradio'
145
+ description = 'Multilingual OCR which works conveniently on all devices in multiple languages. Adjust time-step for inference and the scan mode according to your requirement. For `Full Scan`, model scan the whole image if flag is ture, while scan only the box detected at the first video frame; this save computation cost; noting that the box is fixed in this case.'
146
  article = "<p style='text-align: center'></p>"
147
 
148
  examples = [
149
+ ['test.mp4',['en'],10,False]
150
  ]
151
 
152
  css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
 
167
  [
168
  gr.inputs.Video(label='Input Video'),
169
  gr.inputs.CheckboxGroup(choices, type="value", default=['en'], label='Language'),
170
+ gr.inputs.Number(label='Time Step (in seconds)', default=1.0),
171
+ gr.inputs.Dropdown(['True', 'False'], label='Full Scan', default='False')
172
  ],
173
  [
174
  gr.outputs.Video(label='Output Video'),