stupidog04 committed on
Commit
1211632
1 Parent(s): 64387f8

accelerate easyocr for bbox

Browse files
Files changed (1) hide show
  1. app.py +28 -20
app.py CHANGED
@@ -50,32 +50,40 @@ def inference(video, lang, time_step):
50
  frame_rate = vidcap.get(cv2.CAP_PROP_FPS)
51
  output_frames = []
52
  temporal_profiles = []
53
- max_boxes = 6
54
- compress_mp4 = True
55
 
56
  # Get the positions of the largest boxes in the first frame
57
- while success and not bounds:
58
- if count == 0:
59
- bounds = reader.readtext(frame)
60
- im = PIL.Image.fromarray(frame)
61
- im_with_boxes = draw_boxes(im, bounds)
62
- largest_boxes = sorted(bounds, key=lambda x: box_size(x), reverse=True)[:max_boxes]
63
- positions = [box_position(b) for b in largest_boxes]
64
- temporal_profiles = [[] for _ in range(len(largest_boxes))]
65
- success, frame = vidcap.read()
66
- count += 1
67
 
68
  # Match bboxes to position and store the text read by OCR
69
  while success:
70
  if count % (int(frame_rate * time_step)) == 0:
71
- bounds = reader.readtext(frame)
72
- for box in bounds:
 
 
 
 
73
  bbox_pos = box_position(box)
74
- for i, position in enumerate(positions):
75
- distance = np.linalg.norm(np.array(bbox_pos) - np.array(position))
76
- if distance < 50:
77
- temporal_profiles[i].append((count / frame_rate, box[1]))
78
- break
 
 
 
 
 
 
 
 
 
79
  im = PIL.Image.fromarray(frame)
80
  im_with_boxes = draw_boxes(im, bounds)
81
  output_frames.append(np.array(im_with_boxes))
@@ -129,7 +137,7 @@ def inference(video, lang, time_step):
129
 
130
 
131
  title = '🖼️Video to Multilingual OCR👁️Gradio'
132
- description = 'Multilingual OCR which works conveniently on all devices in multiple languages. The maximal #boxes is 6 for detection'
133
  article = "<p style='text-align: center'></p>"
134
 
135
  examples = [
 
50
  frame_rate = vidcap.get(cv2.CAP_PROP_FPS)
51
  output_frames = []
52
  temporal_profiles = []
53
+ compress_mp4 = False
 
54
 
55
  # Get the positions of the largest boxes in the first frame
56
+ bounds = reader.readtext(frame)
57
+ im = PIL.Image.fromarray(frame)
58
+ im_with_boxes = draw_boxes(im, bounds)
59
+ largest_boxes = sorted(bounds, key=lambda x: box_size(x), reverse=True)
60
+ positions = [box_position(b) for b in largest_boxes]
61
+ temporal_profiles = [[] for _ in range(len(largest_boxes))]
 
 
 
 
62
 
63
  # Match bboxes to position and store the text read by OCR
64
  while success:
65
  if count % (int(frame_rate * time_step)) == 0:
66
+ if count % (int(frame_rate * time_step) * 30) == 0:
67
+ # re-detect the boxes on every 30th sampled frame (i.e. every 30 * int(frame_rate * time_step) frames)
68
+ bounds = reader.readtext(frame)
69
+ largest_boxes = sorted(bounds, key=lambda x: box_size(x), reverse=True)
70
+ positions = [box_position(b) for b in largest_boxes]
71
+ for i, box in enumerate(largest_boxes):
72
  bbox_pos = box_position(box)
73
+ if np.linalg.norm(np.array(bbox_pos) - np.array(positions[i])) < 50:
74
+ x1, y1 = box[0][0]
75
+ x2, y2 = box[0][2]
76
+ box_width = x2 - x1
77
+ box_height = y2 - y1
78
+ ratio = 0.2
79
+ x1 = max(0, int(x1 - ratio * box_width))
80
+ x2 = min(frame.shape[1], int(x2 + ratio * box_width))
81
+ y1 = max(0, int(y1 - ratio * box_height))
82
+ y2 = min(frame.shape[0], int(y2 + ratio * box_height))
83
+ cropped_frame = frame[y1:y2, x1:x2]
84
+ text = reader.readtext(cropped_frame)
85
+ if text:
86
+ temporal_profiles[i].append((count / frame_rate, text[0][1]))
87
  im = PIL.Image.fromarray(frame)
88
  im_with_boxes = draw_boxes(im, bounds)
89
  output_frames.append(np.array(im_with_boxes))
 
137
 
138
 
139
  title = '🖼️Video to Multilingual OCR👁️Gradio'
140
+ description = 'Multilingual OCR which works conveniently on all devices in multiple languages.'
141
  article = "<p style='text-align: center'></p>"
142
 
143
  examples = [