google
/

owlv2-large-patch14-ensemble

@@ -34,12 +34,11 @@ The model uses a CLIP backbone with a ViT-L/14 Transformer architecture as an im
 ```python
 import requests
 from PIL import Image
-import numpy as np
 import torch
-from transformers import AutoProcessor, Owlv2ForObjectDetection
-from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
-processor = AutoProcessor.from_pretrained("google/owlv2-large-patch14-ensemble")
 model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-ensemble")
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
@@ -47,33 +46,16 @@ image = Image.open(requests.get(url, stream=True).raw)
 texts = [["a photo of a cat", "a photo of a dog"]]
 inputs = processor(text=texts, images=image, return_tensors="pt")
-# forward pass
 with torch.no_grad():
-    outputs = model(**inputs)
-# Note: boxes need to be visualized on the padded, unnormalized image
-# hence we'll set the target image sizes (height, width) based on that
-def get_preprocessed_image(pixel_values):
-    pixel_values = pixel_values.squeeze().numpy()
-    unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
-    unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
-    unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
-    unnormalized_image = Image.fromarray(unnormalized_image)
-    return unnormalized_image
-unnormalized_image = get_preprocessed_image(inputs.pixel_values)
-target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
-# Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
-results = processor.post_process_object_detection(
-    outputs=outputs, threshold=0.2, target_sizes=target_sizes
-)
 i = 0  # Retrieve predictions for the first image for the corresponding text queries
 text = texts[i]
 boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
 for box, score, label in zip(boxes, scores, labels):
     box = [round(i, 2) for i in box.tolist()]
     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")

 ```python
 import requests
 from PIL import Image
 import torch
+from transformers import Owlv2Processor, Owlv2ForObjectDetection
+processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-ensemble")
 model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-ensemble")
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
 texts = [["a photo of a cat", "a photo of a dog"]]
 inputs = processor(text=texts, images=image, return_tensors="pt")
 with torch.no_grad():
+    outputs = model(**inputs)
+# Target image sizes (height, width) used to rescale the box predictions; shape [batch_size, 2]
+target_sizes = torch.Tensor([image.size[::-1]])
+# Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
 i = 0  # Retrieve predictions for the first image for the corresponding text queries
 text = texts[i]
 boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
 for box, score, label in zip(boxes, scores, labels):
     box = [round(i, 2) for i in box.tolist()]
     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")