shaoyent committed on
Commit ec2fb01
1 Parent(s): b345be7
Files changed (2)
  1. app.py +25 -14
  2. bridgetower_custom.py +2 -2
app.py CHANGED
@@ -87,7 +87,7 @@ def time_to_frame(time, fps):
     '''
     convert time in seconds into frame number
     '''
-    return time * fps - 1
+    return int(time * fps - 1)
 
 def str2time(strtime):
     strtime = strtime.strip('"')
@@ -105,7 +105,7 @@ def collate_fn(batch_list):
     batch['pixel_mask'] = torch.cat([encoding['pixel_mask'] for encoding in batch_list], dim=0)
     return batch
 
-def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=False, batch_size=2):
+def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=False, batch_size=2, progress=gr.Progress()):
     if os.path.exists(os.path.join(output, 'embeddings.pkl')):
         return
 
@@ -123,7 +123,7 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
     # Get the total numer of frames in the video.
     frame_count = vidcap.get(cv2.CAP_PROP_FRAME_COUNT)
 
-    print(fps, frame_count)
+    # print(fps, frame_count)
 
     frame_number = 0
 
@@ -132,8 +132,9 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
 
     embeddings = []
     batch_list = []
+    vtt = webvtt.read(subtitles)
 
-    for idx, caption in enumerate(webvtt.read(subtitles)):
+    for idx, caption in progress.tqdm(enumerate(vtt), total=vtt.total_length, desc="Generating embeddings"):
         st_time = str2time(caption.start)
         ed_time = str2time(caption.end)
 
@@ -144,9 +145,10 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
             raise NotImplementedError
 
         frame_no = time_to_frame(mid_time, fps)
-
+        mid_time_ms = mid_time * 1000
+        # vidcap.set(1, frame_no) # added this line
+        vidcap.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
         print('Read a new frame: ', idx, mid_time, frame_no, text)
-        vidcap.set(1, frame_no) # added this line
         success, frame = vidcap.read()
         if success:
             frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
@@ -161,7 +163,7 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
                 'image_id': idx,
                 'img_fname': img_fname,
                 'caption': text,
-                'time': mid_time,
+                'time': mid_time_ms,
                 'frame_no': frame_no
             })
 
@@ -169,6 +171,7 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
         encoding['text'] = text
         encoding['image_filepath'] = img_fpath
         encoding['start_time'] = caption.start
+        encoding['time'] = mid_time_ms
 
         batch_list.append(encoding)
 
@@ -186,7 +189,7 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
                     'text': batch_list[i]['text'],
                     'image_filepath': batch_list[i]['image_filepath'],
                     'start_time': batch_list[i]['start_time'],
-                    'frame_no': frame_no,
+                    'time': batch_list[i]['time'],
                 })
             batch_list = []
 
@@ -201,9 +204,11 @@ def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=
                 'text': batch_list[i]['text'],
                 'image_filepath': batch_list[i]['image_filepath'],
                 'start_time': batch_list[i]['start_time'],
-                'frame_no': frame_no,
+                'time': batch_list[i]['time'],
             })
 
+    batch_list = []
+
     with open(os.path.join(output, 'annotations.json'), 'w') as fh:
         json.dump(anno, fh)
 
@@ -240,10 +245,14 @@ def run_query(video_path, text_query, path='/tmp'):
     clip_images = []
     transcripts = []
     for idx in I[0]:
-        frame_no = embeddings[idx]['frame_no']
-        vidcap.set(1, frame_no) # added this line
+        # frame_no = embeddings[idx]['frame_no']
+        # vidcap.set(1, frame_no) # added this line
+        frame_timestamp = embeddings[idx]['time']
+        vidcap.set(cv2.CAP_PROP_POS_MSEC, frame_timestamp)
+
         success, frame = vidcap.read()
         if success:
+            frame = maintain_aspect_ratio_resize(frame, height=400)
             frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             frame = Image.fromarray(frame)
             clip_images.append(frame)
@@ -277,7 +286,7 @@ def get_video_id_from_url(video_url):
     return None
 
 
-def process(video_url, text_query):
+def process(video_url, text_query, progress=gr.Progress()):
     tmp_dir = os.environ.get('TMPDIR', '/tmp')
     video_id = get_video_id_from_url(video_url)
     output_dir = os.path.join(tmp_dir, video_id)
@@ -289,6 +298,7 @@ def process(video_url, text_query):
         output=output_dir,
         expanded=False,
         batch_size=8,
+        progress=gr.Progress(),
     )
     frame_paths, transcripts = run_query(video_file, text_query, path=output_dir)
     return video_file, [(image, caption) for image, caption in zip(frame_paths, transcripts)]
@@ -311,8 +321,8 @@ with gr.Blocks() as demo:
     gr.Examples(
         examples=[
             ['https://www.youtube.com/watch?v=CvjoXdC-WkM','wedding'],
-            ['https://www.youtube.com/watch?v=fWs2dWcNGu0', 'cheesecake on floor'],
-            ['https://www.youtube.com/watch?v=rmPpNsx4yAk', 'cat woman'],
+            ['https://www.youtube.com/watch?v=fWs2dWcNGu0', 'cheesecake'],
+            ['https://www.youtube.com/watch?v=rmPpNsx4yAk', 'bunny'],
             ['https://www.youtube.com/watch?v=KCFYf4TJdN0' ,'sandwich'],
         ],
         inputs=[video_url, text_query],
@@ -324,6 +334,7 @@ with gr.Blocks() as demo:
     )
 
 try:
+    demo.queue(concurrency_count=3)
     demo.launch(share=True)
 except:
     demo.launch()
 
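Note on the seeking change: app.py now locates frames by timestamp (cv2.CAP_PROP_POS_MSEC) instead of by a computed frame index, and it stores the caption midpoint in milliseconds so run_query can seek the same way. A minimal sketch of the pattern (read_frame_at and the video path are illustrative, not from the commit):

import cv2

def read_frame_at(video_path, time_s):
    # Seek by timestamp in milliseconds before reading; this is more
    # robust than frame-index seeking when fps metadata is inaccurate
    # or the video has a variable frame rate.
    vidcap = cv2.VideoCapture(video_path)
    vidcap.set(cv2.CAP_PROP_POS_MSEC, time_s * 1000)
    success, frame = vidcap.read()
    vidcap.release()
    return frame if success else None

frame = read_frame_at('video.mp4', 12.5)  # frame nearest 12.5 s, or None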
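Note on the progress bar: extract_images_and_embeds now takes a gr.Progress tracker and wraps the caption loop in progress.tqdm, and demo.queue(concurrency_count=3) is enabled, which Gradio requires for progress tracking to work. A minimal sketch of the wiring, assuming Gradio 3.x (do_work and the sleep are illustrative):

import time
import gradio as gr

def do_work(progress=gr.Progress()):
    # progress.tqdm mirrors tqdm's API and drives the bar in the UI
    for _ in progress.tqdm(range(10), desc="Generating embeddings"):
        time.sleep(0.1)  # stand-in for the real embedding step
    return "done"

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    out = gr.Textbox()
    btn.click(do_work, outputs=out)

demo.queue()  # progress tracking requires the queue to be enabled
demo.launch()

Two caveats worth noting: webvtt-py's total_length is the total caption duration in seconds, not the caption count, so len(vtt.captions) would match the loop length better; and process passes a fresh gr.Progress() to the helper, where forwarding its own injected tracker (progress=progress) is the documented pattern.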
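The gallery change calls maintain_aspect_ratio_resize(frame, height=400), whose definition is not part of this diff. A typical implementation of such a helper looks like the following sketch (the body here is an assumption, not the commit's code):

import cv2

def maintain_aspect_ratio_resize(image, width=None, height=None,
                                 inter=cv2.INTER_AREA):
    # Resize to the requested width or height and scale the other
    # dimension so the aspect ratio is preserved.
    h, w = image.shape[:2]
    if width is None and height is None:
        return image
    if width is None:
        r = height / float(h)
        dim = (int(w * r), height)
    else:
        r = width / float(w)
        dim = (width, int(h * r))
    return cv2.resize(image, dim, interpolation=inter)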
bridgetower_custom.py CHANGED
@@ -96,8 +96,8 @@ class BridgeTowerTextFeatureExtractor(BridgeTowerPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
     ):
 
-        outputs = self.bridgetower(input_ids=input_ids, attention_mask=attention_mask)
-        final_hidden_cls = outputs.last_hidden_state[:,0,:]
+        outputs = self.bridgetower(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
+        final_hidden_cls = outputs.hidden_states[-1][:,0,:]
         final_hidden_cls = F.normalize(self.itc_text_head(final_hidden_cls), dim=-1, p=2)
 
         return final_hidden_cls
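Note on the bridgetower_custom.py change: the text feature extractor now asks the backbone for all hidden states (output_hidden_states=True) and takes the CLS vector from hidden_states[-1], making explicit which representation feeds the itc_text_head projection. A minimal sketch of the indexing, using roberta-base as a stand-in for the BridgeTower text tower:

import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

tok = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")

inputs = tok("a text query", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True)

# hidden_states is a tuple: the embedding output followed by one entry
# per layer; [-1] is the final layer and [:, 0, :] selects the CLS token.
cls = out.hidden_states[-1][:, 0, :]
cls = F.normalize(cls, dim=-1, p=2)  # unit-norm, as the ITC head expects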