updating for gpu
app.py CHANGED

```diff
@@ -53,6 +53,7 @@ def main():
         "configs/ego_mcq/svitt.yml",
         sample_videos,
     )
+
     def predict(text):
         idx = sample_text_dict[text]
         ft_action, gt_action = svitt.predict(idx, text)
```
demo.py CHANGED

```diff
@@ -24,9 +24,13 @@ class VideoModel(nn.Module):
         Parameters:
             config: config file
         """
-        super(
+        super().__init__()
         self.cfg = load_cfg(config)
         self.model = self.build_model()
+        use_gpu = torch.cuda.is_available()
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if use_gpu:
+            self.model = self.model.to(self.device)
         self.templates = ['{}']
         self.dataset = self.cfg['data']['dataset']
         self.eval()
@@ -74,7 +78,7 @@ class VideoCLSModel(VideoModel):
 class VideoCLSModel(VideoModel):
     """ Video model for video classification tasks (Charades-Ego, EGTEA). """
     def __init__(self, config, sample_videos):
-        super(
+        super().__init__(config)
         self.sample_videos = sample_videos
         self.video_transform = self.init_video_transform()
 
@@ -125,7 +129,7 @@ class VideoCLSModel(VideoModel):
             truncation=True,
             max_length=self.model_cfg.max_txt_l.video,
             return_tensors="pt",
-        )
+        ).to(self.device)
         _, class_embeddings = self.model.encode_text(embeddings)
         return class_embeddings
 
@@ -143,7 +147,7 @@ class VideoCLSModel(VideoModel):
         pooled_image_feat_all = []
         for i in range(clips.shape[0]):
 
-            images = clips[i,:].unsqueeze(0)
+            images = clips[i,:].unsqueeze(0).to(self.device)
             bsz = images.shape[0]
 
             _, pooled_image_feat, *outputs = self.model.encode_image(images)
@@ -161,5 +165,5 @@ class VideoCLSModel(VideoModel):
     @torch.no_grad()
     def predict(self, idx, text=None):
         output, target = self.forward(idx, text)
-        return output.numpy(), target
+        return output.cpu().numpy(), target
 
```
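For context, the demo.py hunks apply the standard PyTorch device-placement pattern: resolve the device once, move the model's weights onto it, move each input batch to the same device before the forward pass, and copy results back to the CPU before converting to NumPy (calling `.numpy()` on a CUDA tensor raises a `TypeError`, which is why `predict` now returns `output.cpu().numpy()`). A minimal sketch of that pattern follows; `TinyEncoder` and its `proj` layer are illustrative stand-ins, not the SViTT model that `build_model()` actually constructs:

```python
import torch
import torch.nn as nn


class TinyEncoder(nn.Module):
    """Illustrative stand-in for the demo's VideoModel (not the actual code)."""

    def __init__(self):
        super().__init__()
        # Resolve the device once and reuse it for weights, inputs, and outputs.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.proj = nn.Linear(4, 2)
        self.to(self.device)  # moves all parameters and buffers to the device

    @torch.no_grad()
    def predict(self, x):
        x = x.to(self.device)  # inputs must live on the same device as the weights
        out = self.proj(x)
        return out.cpu().numpy()  # back to host memory before the NumPy conversion


model = TinyEncoder()
print(model.predict(torch.randn(1, 4)).shape)  # (1, 2)
```

The `).to(self.device)` added in the tokenizer hunk relies on the tokenizer returning a `BatchEncoding` when called with `return_tensors="pt"`; for Hugging Face tokenizers, `BatchEncoding.to(device)` moves every tensor in the encoding (`input_ids`, `attention_mask`, ...) in one call, which a plain dict would not support. A quick check of that behavior, assuming a standard Hugging Face tokenizer (the checkpoint name is illustrative, not the one the demo uses):

```python
from transformers import AutoTokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint
enc = tok(["open the fridge"], return_tensors="pt").to(device)
print(enc["input_ids"].device)  # cuda:0 when a GPU is available, else cpu
```

Note that the `if use_gpu:` guard in `VideoModel.__init__` is belt-and-braces: `self.model.to(self.device)` is already a no-op when the device resolves to `cpu`.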