sheraz179
/

blip2-flan-t5-xxl

visual-question-answering

image-captioning

Inference Endpoints

Model card Files Files and versions Community

blip2-flan-t5-xxl / handler.py

sheraz179's picture

Update handler.py

f6447a8 over 1 year ago

history blame contribute delete

1.83 kB

	from typing import Dict, List, Any
	from PIL import Image
	import torch
	import os, base64
	from io import BytesIO
	from transformers import Blip2ForConditionalGeneration, Blip2Processor

	# Target device for inference: the GPU when CUDA is usable, the CPU otherwise.
	device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


	class EndpointHandler():
	    """Custom inference handler that captions images with BLIP-2 (Flan-T5 XXL).

	    The processor and model are loaded once in ``__init__``; each call decodes
	    a batch of base64-encoded images and returns one generated caption per
	    image.
	    """

	    def __init__(self, path=""):
	        """Load the BLIP-2 processor and model.

	        Args:
	            path: Accepted for interface compatibility; it is not used. The
	                weights are always fetched by the hub id
	                ``Salesforce/blip2-flan-t5-xxl``.
	        """
	        model_id = "Salesforce/blip2-flan-t5-xxl"
	        self.processor = Blip2Processor.from_pretrained(model_id)

	        if device.type == "cuda":
	            # 8-bit (bitsandbytes) weights are placed on the GPU by the loader
	            # itself; calling ``.to()`` on such a model raises, so we must not
	            # move it afterwards. Non-quantized modules are kept in fp16.
	            self.dtype = torch.float16
	            self.model = Blip2ForConditionalGeneration.from_pretrained(
	                model_id,
	                load_in_8bit=True,
	                device_map="auto",
	                torch_dtype=self.dtype,
	            )
	        else:
	            # 8-bit loading is only attempted on CUDA; on CPU fall back to a
	            # regular full-precision model that can be moved normally.
	            self.dtype = torch.float32
	            self.model = Blip2ForConditionalGeneration.from_pretrained(
	                model_id
	            ).to(device)

	        self.model.eval()

	    def __call__(self, data: Any) -> Dict[str, Any]:
	        """Generate captions for a batch of base64-encoded images.

	        Args:
	            data: Request payload. ``data["inputs"]`` is a base64-encoded image
	                or a list of them; the optional ``data["parameters"]`` dict is
	                forwarded as keyword arguments to ``model.generate``. Both keys
	                are popped from ``data``.

	        Returns:
	            A dict of the form ``{"captions": ["A hugging face at the office"]}``
	            with one caption string per input image (an empty list when no
	            images were supplied).
	        """
	        inputs = data.pop("inputs", data)
	        parameters = data.pop("parameters", None) or {}

	        # A single encoded image must not be iterated character by character.
	        if isinstance(inputs, (str, bytes)):
	            inputs = [inputs]

	        # Nothing to caption: answer without touching the processor or model.
	        if not inputs:
	            return {"captions": []}

	        # Normalise to RGB so RGBA / grayscale / palette images are accepted.
	        raw_images = [
	            Image.open(BytesIO(base64.b64decode(encoded))).convert("RGB")
	            for encoded in inputs
	        ]

	        model_inputs = self.processor(images=raw_images, return_tensors="pt")
	        # Match the model's device and floating-point dtype (fp16 when 8-bit).
	        model_inputs["pixel_values"] = model_inputs["pixel_values"].to(
	            device, self.dtype
	        )
	        # Merge the tensors with the user's generation options into one kwargs
	        # dict (the original built a set literal here, which raised TypeError).
	        generate_kwargs = {**model_inputs, **parameters}

	        with torch.no_grad():
	            out = self.model.generate(**generate_kwargs)

	        # Post-process: token ids -> plain caption strings.
	        captions = self.processor.batch_decode(out, skip_special_tokens=True)
	        return {"captions": captions}