google
/

owlv2-base-patch16-ensemble

@@ -34,29 +34,46 @@ The model uses a CLIP backbone with a ViT-B/16 Transformer architecture as an im
 ```python
 import requests
 from PIL import Image
 import torch
-from transformers import Owlv2Processor, Owlv2ForObjectDetection
-processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
 model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
 texts = [["a photo of a cat", "a photo of a dog"]]
 inputs = processor(text=texts, images=image, return_tensors="pt")
-outputs = model(**inputs)
-# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
-target_sizes = torch.Tensor([image.size[::-1]])
-# Convert outputs (bounding boxes and class logits) to COCO API
-results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)
 i = 0  # Retrieve predictions for the first image for the corresponding text queries
 text = texts[i]
 boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
-# Print detected objects and rescaled box coordinates
 for box, score, label in zip(boxes, scores, labels):
     box = [round(i, 2) for i in box.tolist()]
     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")

 ```python
 import requests
 from PIL import Image
+import numpy as np
 import torch
+from transformers import AutoProcessor, Owlv2ForObjectDetection
+from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
 model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
 texts = [["a photo of a cat", "a photo of a dog"]]
 inputs = processor(text=texts, images=image, return_tensors="pt")
+# forward pass
+with torch.no_grad():
+    outputs = model(**inputs)
+# Note: boxes need to be visualized on the padded, unnormalized image
+# hence we'll set the target image sizes (height, width) based on that
+def get_preprocessed_image(pixel_values):
+    pixel_values = pixel_values.squeeze().numpy()
+    unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
+    unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
+    unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
+    unnormalized_image = Image.fromarray(unnormalized_image)
+    return unnormalized_image
+unnormalized_image = get_preprocessed_image(inputs.pixel_values)
+target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
+# Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
+results = processor.post_process_object_detection(
+    outputs=outputs, threshold=0.2, target_sizes=target_sizes
+)
 i = 0  # Retrieve predictions for the first image for the corresponding text queries
 text = texts[i]
 boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
 for box, score, label in zip(boxes, scores, labels):
     box = [round(i, 2) for i in box.tolist()]
     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")