baseplate
/

vit-gpt2-image-captioning

vision-encoder-decoder

image-text-to-text

image-captioning

Inference Endpoints

Model card Files Files and versions Community

Andrew Luo committed on Apr 5, 2023

Commit

865f97a

•

1 Parent(s): a31db03

handler

Files changed (2) hide show

handler.py +52 -0
requirements.txt +3 -0

handler.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+import torch
+from PIL import Image
+from typing import Dict, List, Any
+class EndpointHandler():
+    """Custom inference handler that captions images with a ViT-GPT2 model.
+
+    The model, image processor and tokenizer are loaded once in ``__init__``
+    and reused by every ``__call__``.
+    """
+
+    # Checkpoint used for the model, the image processor and the tokenizer.
+    MODEL_ID = "nlpconnect/vit-gpt2-image-captioning"
+    # Generation settings forwarded to ``model.generate`` on every request.
+    MAX_LENGTH = 128
+    NUM_BEAMS = 4
+
+    def __init__(self, path=""):
+        """Load the captioning components and move the model to the device.
+
+        Args:
+            path: Accepted for interface compatibility but not used; the
+                weights are always loaded from ``MODEL_ID``.
+        """
+        # Resolved once here so ``__call__`` puts its tensors on the same
+        # device as the model instead of re-detecting it per request.
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")
+        self.model = VisionEncoderDecoderModel.from_pretrained(self.MODEL_ID)
+        self.model.to(self.device)
+        self.feature_extractor = ViTImageProcessor.from_pretrained(
+            self.MODEL_ID)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
+
+    def __call__(self, data: Dict[str, Any]) -> List[str]:
+        """Generate one caption per image referenced in the request.
+
+        Args:
+            data: Request payload. The images are taken from the
+                ``"image_paths"`` key or, when that key is absent, from
+                ``"inputs"``. The value is a single path string or an
+                iterable of items that ``PIL.Image.open`` accepts.
+
+        Returns:
+            A list of stripped caption strings, one per image, in input
+            order. An empty input yields an empty list.
+
+        Raises:
+            ValueError: If the payload has neither ``"image_paths"`` nor
+                ``"inputs"``.
+        """
+        if "image_paths" in data:
+            image_paths = data.pop("image_paths")
+        elif "inputs" in data:
+            image_paths = data.pop("inputs")
+        else:
+            # Previously the payload dict itself was used as the fallback,
+            # which made its keys get opened as if they were file paths.
+            raise ValueError(
+                "request must contain an 'image_paths' or 'inputs' entry")
+
+        # A bare string would otherwise be iterated character by character.
+        if isinstance(image_paths, str):
+            image_paths = [image_paths]
+
+        images = []
+        for image_path in image_paths:
+            # The context manager closes the underlying file; convert()
+            # returns a fully loaded RGB copy that outlives the handle.
+            with Image.open(image_path) as i_image:
+                images.append(i_image.convert(mode="RGB"))
+
+        # Nothing to caption; skip the processor and the model entirely.
+        if not images:
+            return []
+
+        pixel_values = self.feature_extractor(
+            images=images, return_tensors="pt").pixel_values
+        pixel_values = pixel_values.to(self.device)
+        output_ids = self.model.generate(
+            pixel_values,
+            max_length=self.MAX_LENGTH,
+            num_beams=self.NUM_BEAMS,
+        )
+        preds = self.tokenizer.batch_decode(
+            output_ids, skip_special_tokens=True)
+        return [pred.strip() for pred in preds]

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+torch
+transformers
+Pillow