# Spaces: Running on L40S  (page banner captured during extraction; not part of this module)
# Copyright (c) Facebook, Inc. and its affiliates.
from typing import Any, List

import torch
from torch.nn import functional as F

from detectron2.config import CfgNode
from detectron2.structures import Instances

from .mask_or_segm import MaskOrSegmentationLoss
from .registry import DENSEPOSE_LOSS_REGISTRY
from .utils import (
    BilinearInterpolationHelper,
    ChartBasedAnnotationsAccumulator,
    LossDict,
    extract_packed_annotations_from_matches,
)
# NOTE(review): `DENSEPOSE_LOSS_REGISTRY` is imported at the top of this file but
# nothing in the visible code uses it -- confirm whether a registration decorator
# for this class was lost when the file was extracted.
class DensePoseChartLoss:
    """
    DensePose loss for chart-based training. A mesh is split into charts,
    each chart is given a label (I) and parametrized by 2 coordinates referred to
    as U and V. Ground truth consists of a number of points annotated with
    I, U and V values and coarse segmentation S defined for all pixels of the
    object bounding box. In some cases (see `COARSE_SEGM_TRAINED_BY_MASKS`),
    semantic segmentation annotations can be used as ground truth inputs as well.

    Estimated values are tensors:
     * U coordinates, tensor of shape [N, C, S, S]
     * V coordinates, tensor of shape [N, C, S, S]
     * fine segmentation estimates, tensor of shape [N, C, S, S] with raw unnormalized
       scores for each fine segmentation label at each location
     * coarse segmentation estimates, tensor of shape [N, D, S, S] with raw unnormalized
       scores for each coarse segmentation label at each location
    where N is the number of detections, C is the number of fine segmentation
    labels, S is the estimate size ( = width = height) and D is the number of
    coarse segmentation channels.

    The losses are:
     * regression (smooth L1) loss for U and V coordinates, scaled by `w_points`
     * cross entropy loss for fine (I) segmentation, scaled by `w_part`
     * coarse (S) segmentation loss delegated to `MaskOrSegmentationLoss`,
       scaled by `w_segm`
    """

    def __init__(self, cfg: CfgNode):
        """
        Initialize chart-based loss from configuration options

        Args:
            cfg (CfgNode): configuration options; all values are read from
                `cfg.MODEL.ROI_DENSEPOSE_HEAD`
        """
        # `heatmap_size`, `n_segm_chan` and `segm_trained_by_masks` are stored but
        # not read anywhere in this class; they are kept as public attributes.
        # fmt: off
        self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
        self.w_points     = cfg.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS
        self.w_part       = cfg.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS
        self.w_segm       = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS
        self.n_segm_chan  = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
        # fmt: on
        self.segm_trained_by_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
        self.segm_loss = MaskOrSegmentationLoss(cfg)

    def __call__(
        self, proposals_with_gt: List[Instances], densepose_predictor_outputs: Any, **kwargs
    ) -> LossDict:
        """
        Produce chart-based DensePose losses

        Args:
            proposals_with_gt (list of Instances): detections with associated ground truth data
            densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
                with estimated values; assumed to have the following attributes:
                * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
                * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
                * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
                * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
            where N is the number of detections, C is the number of fine segmentation
            labels, S is the estimate size ( = width = height) and D is the number of
            coarse segmentation channels.
            **kwargs: accepted for interface compatibility; not used by this method
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_U`: smooth L1 loss for U coordinate estimates
             * `loss_densepose_V`: smooth L1 loss for V coordinate estimates
             * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine
                 segmentation estimates given ground truth labels;
             * `loss_densepose_S`: cross entropy for raw unnormalized scores for coarse
                 segmentation estimates given ground truth labels;
            When there is nothing to train on (no proposals, no packed annotations,
            or no valid foreground point) the same keys are returned with fake
            losses, see `produce_fake_densepose_losses`.
        """
        # densepose outputs are computed for all images and all bounding boxes;
        # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
        # the outputs will have size(0) == 3+1+2+1 == 7

        if not proposals_with_gt:
            return self.produce_fake_densepose_losses(densepose_predictor_outputs)

        accumulator = ChartBasedAnnotationsAccumulator()
        packed_annotations = extract_packed_annotations_from_matches(proposals_with_gt, accumulator)

        # NOTE: we need to keep the same computation graph on all the GPUs to
        # perform reduction properly. Hence even if we have no data on one
        # of the GPUs, we still need to generate the computation graph.
        # Add fake (zero) loss in the form Tensor.sum() * 0
        if packed_annotations is None:
            return self.produce_fake_densepose_losses(densepose_predictor_outputs)

        # spatial size of the estimates, taken from the trailing dims of `u`
        h, w = densepose_predictor_outputs.u.shape[2:]
        interpolator = BilinearInterpolationHelper.from_matches(
            packed_annotations,
            (h, w),
        )

        # points the interpolator flags as valid (`j_valid`) that also carry a
        # fine segmentation label > 0
        j_valid_fg = interpolator.j_valid * (  # pyre-ignore[16]
            packed_annotations.fine_segm_labels_gt > 0
        )
        # pyre-fixme[6]: For 1st param expected `Tensor` but got `int`.
        if not torch.any(j_valid_fg):
            return self.produce_fake_densepose_losses(densepose_predictor_outputs)

        losses_uv = self.produce_densepose_losses_uv(
            proposals_with_gt,
            densepose_predictor_outputs,
            packed_annotations,
            interpolator,
            j_valid_fg,  # pyre-ignore[6]
        )

        losses_segm = self.produce_densepose_losses_segm(
            proposals_with_gt,
            densepose_predictor_outputs,
            packed_annotations,
            interpolator,
            j_valid_fg,  # pyre-ignore[6]
        )

        return {**losses_uv, **losses_segm}

    def produce_fake_densepose_losses(self, densepose_predictor_outputs: Any) -> LossDict:
        """
        Fake losses for fine segmentation and U/V coordinates. These are used when
        no suitable ground truth data was found in a batch. The loss has a value 0
        and is primarily used to construct the computation graph, so that
        `DistributedDataParallel` has similar graphs on all GPUs and can perform
        reduction properly.

        Args:
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the following attributes:
             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
             * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
             * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
                The whole object is also handed to `self.segm_loss.fake_value`
                (presumably it reads `coarse_segm` -- confirm in `MaskOrSegmentationLoss`).
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_U`: has value 0
             * `loss_densepose_V`: has value 0
             * `loss_densepose_I`: has value 0
             * `loss_densepose_S`: value returned by `self.segm_loss.fake_value`
        """
        losses_uv = self.produce_fake_densepose_losses_uv(densepose_predictor_outputs)
        losses_segm = self.produce_fake_densepose_losses_segm(densepose_predictor_outputs)
        return {**losses_uv, **losses_segm}

    def produce_fake_densepose_losses_uv(self, densepose_predictor_outputs: Any) -> LossDict:
        """
        Fake losses for U/V coordinates. These are used when no suitable ground
        truth data was found in a batch. The loss has a value 0
        and is primarily used to construct the computation graph, so that
        `DistributedDataParallel` has similar graphs on all GPUs and can perform
        reduction properly.

        Args:
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the following attributes:
             * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
             * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_U`: has value 0
             * `loss_densepose_V`: has value 0
        """
        # `sum() * 0` keeps the estimates in the autograd graph while contributing 0
        return {
            "loss_densepose_U": densepose_predictor_outputs.u.sum() * 0,
            "loss_densepose_V": densepose_predictor_outputs.v.sum() * 0,
        }

    def produce_fake_densepose_losses_segm(self, densepose_predictor_outputs: Any) -> LossDict:
        """
        Fake losses for fine / coarse segmentation. These are used when
        no suitable ground truth data was found in a batch. The loss has a value 0
        and is primarily used to construct the computation graph, so that
        `DistributedDataParallel` has similar graphs on all GPUs and can perform
        reduction properly.

        Args:
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the following attributes:
             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
             * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_I`: has value 0
             * `loss_densepose_S`: value returned by `self.segm_loss.fake_value`;
                 this entry is always present, regardless of `segm_trained_by_masks`
        """
        losses = {
            "loss_densepose_I": densepose_predictor_outputs.fine_segm.sum() * 0,
            "loss_densepose_S": self.segm_loss.fake_value(densepose_predictor_outputs),
        }
        return losses

    def produce_densepose_losses_uv(
        self,
        proposals_with_gt: List[Instances],
        densepose_predictor_outputs: Any,
        packed_annotations: Any,
        interpolator: BilinearInterpolationHelper,
        j_valid_fg: torch.Tensor,
    ) -> LossDict:
        """
        Compute losses for U/V coordinates: smooth L1 loss between
        estimated coordinates and the ground truth.

        Args:
            proposals_with_gt (list of Instances): detections with associated ground truth data
                (not used by this method)
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the following attributes:
             * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
             * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
            packed_annotations: packed ground truth; `u_gt` and `v_gt` are read
            interpolator (BilinearInterpolationHelper): its `extract_at_points` is
                applied to the `u` and `v` estimates
            j_valid_fg (tensor): mask used to index both the ground truth and the
                values returned by `extract_at_points`
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_U`: smooth L1 loss for U coordinate estimates
             * `loss_densepose_V`: smooth L1 loss for V coordinate estimates
            Both are summed over the selected points and multiplied by `w_points`.
        """
        u_gt = packed_annotations.u_gt[j_valid_fg]
        u_est = interpolator.extract_at_points(densepose_predictor_outputs.u)[j_valid_fg]
        v_gt = packed_annotations.v_gt[j_valid_fg]
        v_est = interpolator.extract_at_points(densepose_predictor_outputs.v)[j_valid_fg]
        return {
            "loss_densepose_U": F.smooth_l1_loss(u_est, u_gt, reduction="sum") * self.w_points,
            "loss_densepose_V": F.smooth_l1_loss(v_est, v_gt, reduction="sum") * self.w_points,
        }

    def produce_densepose_losses_segm(
        self,
        proposals_with_gt: List[Instances],
        densepose_predictor_outputs: Any,
        packed_annotations: Any,
        interpolator: BilinearInterpolationHelper,
        j_valid_fg: torch.Tensor,
    ) -> LossDict:
        """
        Losses for fine / coarse segmentation: cross-entropy
        for segmentation unnormalized scores given ground truth labels at
        annotated points for fine segmentation and dense mask annotations
        for coarse segmentation.

        Args:
            proposals_with_gt (list of Instances): detections with associated ground truth data
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the following attributes:
             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
             * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
            packed_annotations: packed ground truth; `fine_segm_labels_gt` is read
                here and the whole object is forwarded to `self.segm_loss`
            interpolator (BilinearInterpolationHelper): provides `j_valid`, the
                four `w_*` weights and `extract_at_points`
            j_valid_fg (tensor): not used by this method; the fine segmentation
                loss selects points with `interpolator.j_valid` instead
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine
                 segmentation estimates given ground truth labels, multiplied by `w_part`
             * `loss_densepose_S`: value returned by `self.segm_loss` multiplied by
                 `w_segm`; this entry is always present, regardless of
                 `segm_trained_by_masks`
        """
        fine_segm_gt = packed_annotations.fine_segm_labels_gt[
            interpolator.j_valid  # pyre-ignore[16]
        ]
        # keep every fine segmentation channel (`slice(None)`); the weights get a
        # trailing axis (`[:, None]`) so they can broadcast against those channels
        fine_segm_est = interpolator.extract_at_points(
            densepose_predictor_outputs.fine_segm,
            slice_fine_segm=slice(None),
            w_ylo_xlo=interpolator.w_ylo_xlo[:, None],  # pyre-ignore[16]
            w_ylo_xhi=interpolator.w_ylo_xhi[:, None],  # pyre-ignore[16]
            w_yhi_xlo=interpolator.w_yhi_xlo[:, None],  # pyre-ignore[16]
            w_yhi_xhi=interpolator.w_yhi_xhi[:, None],  # pyre-ignore[16]
        )[interpolator.j_valid, :]
        return {
            "loss_densepose_I": F.cross_entropy(fine_segm_est, fine_segm_gt.long()) * self.w_part,
            "loss_densepose_S": self.segm_loss(
                proposals_with_gt, densepose_predictor_outputs, packed_annotations
            )
            * self.w_segm,
        }