baseplate
/

vit-gpt2-image-captioning

vision-encoder-decoder

image-text-to-text

image-captioning

Inference Endpoints

Model card Files Files and versions Community

vit-gpt2-image-captioning / handler.py

Andrew Luo

handler

865f97a over 1 year ago

1.91 kB

	from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
	import torch
	from PIL import Image
	from typing import Dict, List, Any

	class EndpointHandler:
	    """Custom inference handler that captions images with a ViT + GPT-2 model.

	    The model, image processor and tokenizer are loaded once in ``__init__``
	    and reused for every request handled by ``__call__``.
	    """

	    # Checkpoint used for the model, the image processor and the tokenizer.
	    MODEL_ID = "nlpconnect/vit-gpt2-image-captioning"
	    # Generation settings passed to ``model.generate``.
	    MAX_LENGTH = 128
	    NUM_BEAMS = 4

	    def __init__(self, path=""):
	        """Load the captioning model and its pre/post-processors.

	        Args:
	            path: Accepted for interface compatibility; it is not used here.
	                NOTE(review): weights are always loaded from ``MODEL_ID``,
	                exactly as before -- confirm whether ``path`` should be
	                honoured instead.
	        """
	        # Pick the device once so that every request uses the same one.
	        self.device = torch.device(
	            "cuda" if torch.cuda.is_available() else "cpu")
	        self.model = VisionEncoderDecoderModel.from_pretrained(
	            self.MODEL_ID).to(self.device)
	        self.feature_extractor = ViTImageProcessor.from_pretrained(
	            self.MODEL_ID)
	        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)

	    @staticmethod
	    def _extract_image_paths(data: Dict[str, Any]) -> List[Any]:
	        """Return the image sources found in the request payload as a list.

	        Looks for ``"image_paths"`` first, then ``"inputs"``. A single string
	        is wrapped in a list so it is not iterated character by character.

	        Raises:
	            ValueError: if neither key is present in ``data``.
	        """
	        for key in ("image_paths", "inputs"):
	            if key in data:
	                value = data[key]
	                break
	        else:
	            raise ValueError(
	                "request payload must contain 'image_paths' or 'inputs'")

	        if value is None:
	            return []
	        if isinstance(value, str):
	            return [value]
	        return list(value)

	    def __call__(self, data: Dict[str, Any]) -> List[str]:
	        """Generate one caption per image referenced in the request.

	        Args:
	            data: Request payload. ``data["image_paths"]`` (or, failing that,
	                ``data["inputs"]``) holds a single image source or an
	                iterable of them; each one is handed to ``Image.open``.
	                The payload is not modified.

	        Returns:
	            A list of caption strings, one per image, in input order.
	            An empty input yields an empty list.

	        Raises:
	            ValueError: if the payload contains neither expected key.
	        """
	        image_paths = self._extract_image_paths(data)
	        if not image_paths:
	            # Nothing to caption; skip the processor and the model entirely.
	            return []

	        images = []
	        for image_path in image_paths:
	            # ``convert`` returns a new, fully loaded image, so the opened
	            # file can be closed as soon as the ``with`` block exits.
	            with Image.open(image_path) as source:
	                images.append(source.convert("RGB"))

	        pixel_values = self.feature_extractor(
	            images=images, return_tensors="pt").pixel_values
	        pixel_values = pixel_values.to(self.device)

	        output_ids = self.model.generate(
	            pixel_values,
	            max_length=self.MAX_LENGTH,
	            num_beams=self.NUM_BEAMS,
	        )

	        captions = self.tokenizer.batch_decode(
	            output_ids, skip_special_tokens=True)
	        return [caption.strip() for caption in captions]