Spaces:

yslan
/

ObjCtrl-2.5D

Running on Zero

ObjCtrl-2.5D / ZoeDepth /zoedepth /models /depth_model.py

wzhouxiff

init

38e3f9b 21 days ago

7.36 kB

	# MIT License

	# Copyright (c) 2022 Intelligent Systems Lab Org

	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.

	# File author: Shariq Farooq Bhat

	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torchvision import transforms
	import PIL.Image
	from PIL import Image
	from typing import Union


	class DepthModel(nn.Module):
	    """Base class for depth models exposing a common inference interface.

	    Subclasses implement ``forward`` and return a mapping that contains a
	    ``'metric_depth'`` entry; the ``infer*`` helpers build padding and
	    horizontal-flip test-time augmentation on top of that.
	    """

	    def __init__(self):
	        super().__init__()
	        # Tracked manually so infer_pil knows where to move its input.
	        self.device = 'cpu'

	    def to(self, device) -> nn.Module:
	        """Move the module to ``device`` and remember it in ``self.device``."""
	        self.device = device
	        return super().to(device)

	    def forward(self, x, *args, **kwargs):
	        """Compute the model output for ``x``; must be overridden by subclasses.

	        Raises:
	            NotImplementedError: always, in this base class.
	        """
	        raise NotImplementedError

	    def _infer(self, x: torch.Tensor):
	        """
	        Inference interface for the model
	        Args:
	            x (torch.Tensor): input tensor of shape (b, c, h, w)
	        Returns:
	            torch.Tensor: output tensor of shape (b, 1, h, w)
	        """
	        return self(x)['metric_depth']

	    def _infer_with_pad_aug(self, x: torch.Tensor, pad_input: bool=True, fh: float=3, fw: float=3, upsampling_mode: str='bicubic', padding_mode="reflect", **kwargs) -> torch.Tensor:
	        """
	        Inference interface for the model with padding augmentation
	        Padding augmentation fixes the boundary artifacts in the output depth map.
	        Boundary artifacts are sometimes caused by the fact that the model is trained on NYU raw dataset which has a black or white border around the image.
	        This augmentation pads the input image and crops the prediction back to the original size / view.

	        Note: This augmentation is not required for the models trained with 'avoid_boundary'=True.
	        Args:
	            x (torch.Tensor): input tensor of shape (b, c, h, w)
	            pad_input (bool, optional): whether to pad the input or not. Defaults to True.
	            fh (float, optional): height padding factor. The padding is calculated as sqrt(h/2) * fh. Defaults to 3.
	            fw (float, optional): width padding factor. The padding is calculated as sqrt(w/2) * fw. Defaults to 3.
	            upsampling_mode (str, optional): upsampling mode. Defaults to 'bicubic'.
	            padding_mode (str, optional): padding mode. Defaults to "reflect".
	            **kwargs: forwarded to ``torch.nn.functional.pad``.
	        Returns:
	            torch.Tensor: output tensor of shape (b, 1, h, w)
	        Raises:
	            AssertionError: if x is not a 4D tensor with 3 channels, or if
	                pad_input is set and neither fh nor fw is positive.
	        """
	        # assert x is nchw and c = 3
	        assert x.dim() == 4, "x must be 4 dimensional, got {}".format(x.dim())
	        assert x.shape[1] == 3, "x must have 3 channels, got {}".format(x.shape[1])

	        if pad_input:
	            assert fh > 0 or fw > 0, "atlease one of fh and fw must be greater than 0"
	            pad_h = int(np.sqrt(x.shape[2]/2) * fh)
	            pad_w = int(np.sqrt(x.shape[3]/2) * fw)
	            # Always give F.pad all four values (left, right, top, bottom):
	            # non-constant modes such as "reflect" reject a 2-value padding on
	            # a 4D tensor, which used to break the fh == 0 case. Height padding
	            # is only applied when positive, as before.
	            pad_tb = max(pad_h, 0)
	            padding = [pad_w, pad_w, pad_tb, pad_tb]

	            x = F.pad(x, padding, mode=padding_mode, **kwargs)
	        out = self._infer(x)
	        # Bring the prediction back to the (padded) input resolution if needed.
	        if out.shape[-2:] != x.shape[-2:]:
	            out = F.interpolate(out, size=(x.shape[2], x.shape[3]), mode=upsampling_mode, align_corners=False)
	        if pad_input:
	            # crop to the original size, handling the case where pad_h and pad_w is 0
	            if pad_h > 0:
	                out = out[:, :, pad_h:-pad_h, :]
	            if pad_w > 0:
	                out = out[:, :, :, pad_w:-pad_w]
	        return out

	    def infer_with_flip_aug(self, x, pad_input: bool=True, **kwargs) -> torch.Tensor:
	        """
	        Inference interface for the model with horizontal flip augmentation
	        Horizontal flip augmentation improves the accuracy of the model by averaging the output of the model with and without horizontal flip.
	        Args:
	            x (torch.Tensor): input tensor of shape (b, c, h, w)
	            pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
	            **kwargs: forwarded to ``_infer_with_pad_aug``.
	        Returns:
	            torch.Tensor: output tensor of shape (b, 1, h, w)
	        """
	        # infer with horizontal flip and average
	        out = self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs)
	        out_flip = self._infer_with_pad_aug(torch.flip(x, dims=[3]), pad_input=pad_input, **kwargs)
	        # Un-flip the second prediction so both are aligned before averaging.
	        out = (out + torch.flip(out_flip, dims=[3])) / 2
	        return out

	    def infer(self, x, pad_input: bool=True, with_flip_aug: bool=True, **kwargs) -> torch.Tensor:
	        """
	        Inference interface for the model
	        Args:
	            x (torch.Tensor): input tensor of shape (b, c, h, w)
	            pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
	            with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True.
	            **kwargs: forwarded to the selected inference helper.
	        Returns:
	            torch.Tensor: output tensor of shape (b, 1, h, w)
	        """
	        if with_flip_aug:
	            return self.infer_with_flip_aug(x, pad_input=pad_input, **kwargs)
	        else:
	            return self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs)

	    @torch.no_grad()
	    def infer_pil(self, pil_img, pad_input: bool=True, with_flip_aug: bool=True, output_type: str="numpy", **kwargs) -> "Union[np.ndarray, PIL.Image.Image, torch.Tensor]":
	        """
	        Inference interface for the model for PIL image
	        Args:
	            pil_img (PIL.Image.Image): input PIL image
	            pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
	            with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True.
	            output_type (str, optional): output type. Supported values are 'numpy', 'pil' and 'tensor'. Defaults to "numpy".
	            **kwargs: forwarded to ``infer``.
	        Returns:
	            The squeezed prediction as a numpy array, a 16-bit PIL image
	            (values multiplied by 256) or a CPU tensor, per ``output_type``.
	        Raises:
	            ValueError: if ``output_type`` is not one of the supported values.
	        """
	        # The padded-inference helper asserts exactly 3 channels, so RGBA /
	        # grayscale / palette images are converted to RGB up front.
	        if isinstance(pil_img, Image.Image) and pil_img.mode != "RGB":
	            pil_img = pil_img.convert("RGB")
	        x = transforms.ToTensor()(pil_img).unsqueeze(0).to(self.device)
	        out_tensor = self.infer(x, pad_input=pad_input, with_flip_aug=with_flip_aug, **kwargs)
	        if output_type == "numpy":
	            return out_tensor.squeeze().cpu().numpy()
	        elif output_type == "pil":
	            # uint16 is required for depth pil image
	            out_16bit_numpy = (out_tensor.squeeze().cpu().numpy()*256).astype(np.uint16)
	            return Image.fromarray(out_16bit_numpy)
	        elif output_type == "tensor":
	            return out_tensor.squeeze().cpu()
	        else:
	            raise ValueError(f"output_type {output_type} not supported. Supported values are 'numpy', 'pil' and 'tensor'")