Spaces:

adirik
/

OWL-ViT

Runtime error

ceyda committed on Aug 9, 2022

Commit

5a3f926

1 Parent(s): 12d1976

Don't resize output image

Resizing the input before passing it to the feature extractor produces correctly sized outputs.
I'm not sure whether there is a problem with the post-processing,
or whether the pre-processor was simply not resizing input images correctly.

Files changed (1) hide show

app.py +7 -4

app.py CHANGED Viewed

@@ -19,19 +19,22 @@ processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
 def query_image(img, text_queries, score_threshold):
     text_queries = text_queries
     text_queries = text_queries.split(",")
-    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)
     with torch.no_grad():
         outputs = model(**inputs)
-    target_sizes = torch.Tensor([[768, 768]])
     outputs.logits = outputs.logits.cpu()
     outputs.pred_boxes = outputs.pred_boxes.cpu()
     results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
-    img = cv2.resize(img, (768, 768), interpolation = cv2.INTER_AREA)
     font = cv2.FONT_HERSHEY_SIMPLEX
     for box, score, label in zip(boxes, scores, labels):
@@ -61,7 +64,7 @@ can also use the score threshold slider to set a threshold to filter out low pro
 """
 demo = gr.Interface(
     query_image,
-    inputs=[gr.Image(shape=(768, 768)), "text", gr.Slider(0, 1, value=0.1)],
     outputs="image",
     title="Zero-Shot Object Detection with OWL-ViT",
     description=description,

 def query_image(img, text_queries, score_threshold):
     text_queries = text_queries
     text_queries = text_queries.split(",")
+    target_sizes = torch.Tensor([img.shape[:2]])
+    img_input = cv2.resize(img, (768, 768), interpolation = cv2.INTER_AREA)
+    inputs = processor(text=text_queries, images=img_input, return_tensors="pt").to(device)
     with torch.no_grad():
         outputs = model(**inputs)
     outputs.logits = outputs.logits.cpu()
     outputs.pred_boxes = outputs.pred_boxes.cpu()
     results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
     font = cv2.FONT_HERSHEY_SIMPLEX
     for box, score, label in zip(boxes, scores, labels):
 """
 demo = gr.Interface(
     query_image,
+    inputs=[gr.Image(), "text", gr.Slider(0, 1, value=0.1)],
     outputs="image",
     title="Zero-Shot Object Detection with OWL-ViT",
     description=description,