vl-1.3

Paused

App Files Files Community

vl-1.3 / deepseek_vl /models /image_processing_vlm.py

Nealeon

chore: rebase commits

79cf446 9 months ago

raw

history blame contribute delete

6.8 kB

	# Copyright (c) 2023-2024 DeepSeek.
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy of
	# this software and associated documentation files (the "Software"), to deal in
	# the Software without restriction, including without limitation the rights to
	# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
	# the Software, and to permit persons to whom the Software is furnished to do so,
	# subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
	# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
	# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
	# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

	from typing import List, Tuple, Union

	import numpy as np
	import torch
	import torchvision
	import torchvision.transforms.functional
	from PIL import Image
	from transformers import AutoImageProcessor, PretrainedConfig
	from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
	from transformers.image_utils import to_numpy_array
	from transformers.utils import logging

	# Module-level logger obtained through the transformers logging utilities.
	logger = logging.get_logger(__name__)

	# Alias for the image container types; it is not referenced elsewhere in the
	# visible part of this file.
	ImageType = Union[np.ndarray, torch.Tensor, Image.Image]
	# Per-channel mean/std; the same values are the default ``image_mean`` /
	# ``image_std`` of VLMImageProcessorConfig and VLMImageProcessor below.
	# NOTE(review): despite the IMAGENET_ prefix these look like the OpenAI CLIP
	# normalization statistics, not the classic ImageNet ones — confirm before
	# relying on the name.
	IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
	IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
	# Mean/std of 0.5 per channel; used by the ``__main__`` example at the
	# bottom of this file.
	IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)
	IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)


	def expand2square(pil_img, background_color):
	    """Pad ``pil_img`` to a square canvas, keeping the image centered.

	    Args:
	        pil_img: image exposing ``size`` as ``(width, height)`` and ``mode``.
	        background_color: fill color handed to ``Image.new`` for the padding.

	    Returns:
	        ``pil_img`` itself when it is already square; otherwise a new image
	        whose side equals the longer edge, with ``pil_img`` pasted in the
	        middle of the shorter axis.
	    """
	    w, h = pil_img.size
	    if w == h:
	        # Already square: hand back the very same object, no copy.
	        return pil_img

	    side = max(w, h)
	    # Only the shorter axis gets a non-zero offset; floor division puts the
	    # odd leftover pixel on the bottom/right edge.
	    offset = ((side - w) // 2, (side - h) // 2)

	    canvas = Image.new(pil_img.mode, (side, side), background_color)
	    canvas.paste(pil_img, offset)
	    return canvas


	class VLMImageProcessorConfig(PretrainedConfig):
	    """Configuration object carrying the image preprocessing settings.

	    It stores the same parameter set that ``VLMImageProcessor.__init__`` in
	    this file accepts, and is the config class registered with
	    ``AutoImageProcessor`` at the bottom of the module.
	    """

	    model_type = "deepseek_vlm"

	    # Declared attribute types; the values are assigned in ``__init__``.
	    image_size: int
	    min_size: int
	    image_mean: Union[Tuple[float, float, float], List[float]]
	    image_std: Union[Tuple[float, float, float], List[float]]
	    rescale_factor: float
	    do_normalize: bool

	    def __init__(
	        self,
	        image_size: int,
	        min_size: int = 14,
	        image_mean: Union[Tuple[float, float, float], List[float]] = (0.48145466, 0.4578275, 0.40821073),
	        image_std: Union[Tuple[float, float, float], List[float]] = (0.26862954, 0.26130258, 0.27577711),
	        rescale_factor: float = 1.0 / 255.0,
	        do_normalize: bool = True,
	        **kwargs,
	    ):
	        """Store the preprocessing settings, then initialize the base config.

	        Args:
	            image_size: required; no default is provided.
	            min_size: defaults to 14.
	            image_mean: per-channel mean (three floats).
	            image_std: per-channel standard deviation (three floats).
	            rescale_factor: defaults to ``1 / 255``.
	            do_normalize: defaults to ``True``.
	            **kwargs: forwarded unchanged to ``PretrainedConfig.__init__``.
	        """
	        # The fields are set before delegating to the base class, in the
	        # same order as the original implementation.
	        self.image_size, self.min_size = image_size, min_size
	        self.image_mean, self.image_std = image_mean, image_std
	        self.rescale_factor = rescale_factor
	        self.do_normalize = do_normalize

	        super().__init__(**kwargs)


	class VLMImageProcessor(BaseImageProcessor):
	    """Image processor that turns PIL images into model-ready pixel arrays.

	    ``preprocess`` applies, per image: an aspect-preserving bicubic resize so
	    the longer edge becomes ``image_size``, padding to a square with a
	    background color derived from ``image_mean``, conversion to a
	    channels-first array, rescaling by ``rescale_factor`` and, when
	    ``do_normalize`` is set, normalization with ``image_mean``/``image_std``.
	    """

	    model_input_names = ["pixel_values"]

	    def __init__(
	        self,
	        image_size: int,
	        min_size: int = 14,
	        image_mean: Union[Tuple[float, float, float], List[float]] = (
	            0.48145466,
	            0.4578275,
	            0.40821073,
	        ),
	        image_std: Union[Tuple[float, float, float], List[float]] = (
	            0.26862954,
	            0.26130258,
	            0.27577711,
	        ),
	        rescale_factor: float = 1.0 / 255.0,
	        do_normalize: bool = True,
	        **kwargs,
	    ):
	        """Store the preprocessing settings.

	        Args:
	            image_size: target length of the longer image edge after resizing.
	            min_size: lower bound applied to each resized edge.
	            image_mean: per-channel mean used for normalization; also the
	                source of the padding color. ``None`` selects a mid-gray
	                padding color.
	            image_std: per-channel standard deviation used for normalization.
	            rescale_factor: multiplier applied to pixel values before
	                normalization.
	            do_normalize: whether ``preprocess`` normalizes the images.
	            **kwargs: forwarded unchanged to ``BaseImageProcessor.__init__``.
	        """
	        super().__init__(**kwargs)

	        self.image_size = image_size
	        self.rescale_factor = rescale_factor
	        self.image_mean = image_mean
	        self.image_std = image_std
	        self.min_size = min_size
	        self.do_normalize = do_normalize

	        if image_mean is None:
	            self.background_color = (127, 127, 127)
	        else:
	            # Map the mean onto the 0-255 pixel scale for the padding color.
	            self.background_color = tuple(int(x * 255) for x in image_mean)

	    def resize(self, pil_img: Image.Image) -> np.ndarray:
	        """Resize ``pil_img`` keeping its aspect ratio, then pad it to a square.

	        Args:
	            pil_img (PIL.Image): [H, W, 3] in PIL.Image in RGB

	        Returns:
	            x (np.ndarray): channels-first array; [3, self.image_size,
	            self.image_size] for an RGB input when
	            ``self.min_size <= self.image_size``.

	        Raises:
	            ValueError: if the input has a non-positive width or height, or if
	                the computed target size is non-positive.
	        """
	        width, height = pil_img.size

	        # Validate before dividing: a 0x0 image would otherwise fail with a
	        # ZeroDivisionError instead of the intended ValueError.
	        if width <= 0 or height <= 0:
	            raise ValueError(f"Invalid size! orig size = {pil_img.size}")

	        max_size = max(width, height)

	        # (height, width) order, as expected by torchvision's resize.
	        size = [
	            max(int(height / max_size * self.image_size), self.min_size),
	            max(int(width / max_size * self.image_size), self.min_size),
	        ]

	        # Still reachable when min_size <= 0 and an edge rounds down to zero.
	        if size[0] <= 0 or size[1] <= 0:
	            raise ValueError(
	                f"Invalid size! orig size = {pil_img.size}, new size = {size}"
	            )

	        pil_img = torchvision.transforms.functional.resize(
	            pil_img,
	            size,
	            interpolation=torchvision.transforms.functional.InterpolationMode.BICUBIC,
	            antialias=True,
	        )

	        pil_img = expand2square(pil_img, self.background_color)
	        x = to_numpy_array(pil_img)

	        # [H, W, 3] -> [3, H, W]
	        x = np.transpose(x, (2, 0, 1))

	        return x

	    def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeature:
	        """Resize, pad, rescale and optionally normalize a batch of images.

	        Args:
	            images: an iterable of PIL images, or a single PIL image (treated
	                as a batch of one).
	            return_tensors: tensor type passed to ``BatchFeature``.
	            **kwargs: accepted for interface compatibility; unused here.

	        Returns:
	            BatchFeature with a single ``"pixel_values"`` entry.
	        """
	        # A lone PIL image is not iterable; wrap it so callers may pass either
	        # a single image or a list.
	        if isinstance(images, Image.Image):
	            images = [images]

	        # resize and pad to [self.image_size, self.image_size]
	        # then convert from [H, W, 3] to [3, H, W]
	        arrays: List[np.ndarray] = [self.resize(image) for image in images]

	        # rescale from [0, 255] -> [0, 1]
	        arrays = [
	            self.rescale(
	                image=image,
	                scale=self.rescale_factor,
	                input_data_format="channels_first",
	            )
	            for image in arrays
	        ]

	        # normalize
	        if self.do_normalize:
	            arrays = [
	                self.normalize(
	                    image=image,
	                    mean=self.image_mean,
	                    std=self.image_std,
	                    input_data_format="channels_first",
	                )
	                for image in arrays
	            ]

	        data = {"pixel_values": arrays}
	        return BatchFeature(data=data, tensor_type=return_tensors)

	    @property
	    def default_shape(self):
	        """Shape ``[3, image_size, image_size]`` of one processed image."""
	        return [3, self.image_size, self.image_size]


	# Associate the config class with its image processor in AutoImageProcessor.
	AutoImageProcessor.register(VLMImageProcessorConfig, VLMImageProcessor)


	if __name__ == "__main__":
	    # Ad-hoc construction check when the module is run as a script: build a
	    # processor for 1024-pixel images using the 0.5 mean/std constants.
	    image_processor = VLMImageProcessor(
	        1024,
	        do_normalize=True,
	        image_std=IMAGENET_INCEPTION_STD,
	        image_mean=IMAGENET_INCEPTION_MEAN,
	    )