Spaces:
Runtime error
Runtime error
# Copyright (c) Facebook, Inc. and its affiliates. | |
import inspect | |
import logging | |
import numpy as np | |
from typing import Dict, List, Optional, Tuple | |
import torch | |
from torch import nn | |
from detectron2.config import configurable | |
from detectron2.layers import ShapeSpec, nonzero_tuple | |
from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou | |
from detectron2.utils.events import get_event_storage | |
from detectron2.utils.registry import Registry | |
from ..backbone.resnet import BottleneckBlock, ResNet | |
from ..matcher import Matcher | |
from ..poolers import ROIPooler | |
from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals | |
from ..sampling import subsample_labels | |
from .box_head import build_box_head | |
from .fast_rcnn import FastRCNNOutputLayers | |
from .keypoint_head import build_keypoint_head | |
from .mask_head import build_mask_head | |
from .roi_heads import ROI_HEADS_REGISTRY, select_foreground_proposals, ROIHeads | |
@ROI_HEADS_REGISTRY.register()
class CLIPRes5ROIHeads(ROIHeads):
    """
    ROI heads for a CLIP ResNet backbone, in the "C4" R-CNN style.

    This head does not own a res5 block: ``self.res5`` is ``None`` and the
    per-region feature extractor is handed to :meth:`forward` by the caller
    (the last ResNet stage of the backbone). The box and mask heads share the
    cropping and the per-region feature computation.
    See :paper:`ResNet` Appendix A.
    """

    @configurable
    def __init__(
        self,
        *,
        in_features: List[str],
        pooler: ROIPooler,
        res5: Optional[nn.Module],
        box_predictor: nn.Module,
        mask_head: Optional[nn.Module] = None,
        **kwargs,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            in_features (list[str]): list of backbone feature map names to use for
                feature extraction
            pooler (ROIPooler): pooler to extract region features from backbone
            res5 (nn.Module or None): stored as ``self.res5`` but not used by this
                class; ``from_config`` always passes ``None`` because the res5
                module is supplied to ``forward`` instead.
            box_predictor (nn.Module): make box predictions from the feature.
                Should have the same interface as :class:`FastRCNNOutputLayers`.
            mask_head (nn.Module): transform features to make mask predictions
        """
        super().__init__(**kwargs)
        self.in_features = in_features
        self.pooler = pooler
        self.res5 = res5  # None, this head uses the res5 from backbone
        self.box_predictor = box_predictor
        self.mask_on = mask_head is not None
        if self.mask_on:
            self.mask_head = mask_head

    @classmethod
    def from_config(cls, cfg, input_shape):
        """
        Build the keyword arguments for ``__init__`` from a config node.

        Args:
            cfg: the config node (reads ``MODEL.ROI_HEADS``, ``MODEL.ROI_BOX_HEAD``,
                ``MODEL.RESNETS``, ``MODEL.MASK_ON`` and ``MODEL.KEYPOINT_ON``).
            input_shape: mapping from feature name to an object with a ``stride``
                attribute.

        Returns:
            dict: keyword arguments for ``__init__``.
        """
        # fmt: off
        ret = super().from_config(cfg)
        in_features = ret["in_features"] = cfg.MODEL.ROI_HEADS.IN_FEATURES
        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        pooler_scales     = (1.0 / input_shape[in_features[0]].stride, )
        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        mask_on           = cfg.MODEL.MASK_ON
        # fmt: on
        assert not cfg.MODEL.KEYPOINT_ON
        assert len(in_features) == 1

        ret["pooler"] = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )

        # No res5 block is built here (the former `cls._build_res5_block(cfg)`
        # path is disabled); only its output channel count is derived from the
        # config so that the predictor and mask head can be sized.
        ret["res5"] = None
        out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * 8
        ret["box_predictor"] = FastRCNNOutputLayers(
            cfg, ShapeSpec(channels=out_channels, height=1, width=1)
        )

        if mask_on:
            ret["mask_head"] = build_mask_head(
                cfg,
                ShapeSpec(channels=out_channels, width=pooler_resolution, height=pooler_resolution),
            )
        return ret

    def _shared_roi_transform(self, features, boxes, backbone_res5):
        """Pool ``features`` over ``boxes`` and run the result through ``backbone_res5``."""
        x = self.pooler(features, boxes)
        return backbone_res5(x)

    def forward(self, images, features, proposals, queries, targets=None,
                res5=None, ds=None, norm=None, vision_projection=None, attnpool=None):
        """
        See :meth:`ROIHeads.forward`.

        Args:
            images: unused; deleted immediately.
            features: mapping from feature name to feature map.
            proposals (list[Instances]): per-image proposals with ``proposal_boxes``.
            queries: forwarded to the box predictor on the attention-pool path.
            targets: ground truth; required in training.
            res5: callable applied to the pooled region features.
            ds, norm, vision_projection: accepted for signature parity with
                :class:`CLIPSwinROIHeads`; not used here.
            attnpool: optional callable applied to the region features instead
                of spatial mean pooling.

        Returns:
            In training ``([], losses)``; in inference ``(pred_instances, {})``.
        """
        del images

        if self.training:
            assert targets, "'targets' argument is required during training"
            proposals = self.label_and_sample_proposals(proposals, targets)
        del targets

        proposal_boxes = [x.proposal_boxes for x in proposals]
        box_features = self._shared_roi_transform(
            [features[f] for f in self.in_features], proposal_boxes, res5
        )
        if attnpool:  # att pooling
            att_feats = attnpool(box_features)
            predictions = self.box_predictor(att_feats, queries)
        else:  # mean pooling
            # NOTE(review): `queries` is not forwarded on this path, unlike the
            # attention-pool path above -- confirm against the box predictor.
            predictions = self.box_predictor(box_features.mean(dim=[2, 3]))

        if self.training:
            del features
            losses = self.box_predictor.losses(predictions, proposals)
            if self.mask_on:
                proposals, fg_selection_masks = select_foreground_proposals(
                    proposals, self.num_classes
                )
                # Since the ROI feature transform is shared between boxes and masks,
                # we don't need to recompute features. The mask loss is only defined
                # on foreground proposals, so we need to select out the foreground
                # features.
                mask_features = box_features[torch.cat(fg_selection_masks, dim=0)]
                del box_features
                losses.update(self.mask_head(mask_features, proposals))
            return [], losses
        else:
            pred_instances, _ = self.box_predictor.inference(predictions, proposals)
            pred_instances = self.forward_with_given_boxes(features, pred_instances, res5)
            return pred_instances, {}

    def forward_with_given_boxes(self, features, instances, res5=None):
        """
        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.

        Args:
            features: same as in `forward()`
            instances (list[Instances]): instances to predict other outputs. Expect the keys
                "pred_boxes" and "pred_classes" to exist.
            res5: callable applied to the pooled region features; only used when
                the mask head is enabled.

        Returns:
            instances (Instances):
                the same `Instances` object, with extra
                fields such as `pred_masks` or `pred_keypoints`.
        """
        assert not self.training
        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")

        if self.mask_on:
            features = [features[f] for f in self.in_features]
            x = self._shared_roi_transform(features, [x.pred_boxes for x in instances], res5)
            return self.mask_head(x, instances)
        else:
            return instances
@ROI_HEADS_REGISTRY.register()
class CLIPSwinROIHeads(ROIHeads):
    """
    Variant of the CLIP "C4"-style ROI heads that uses the last backbone stage
    passed in by the caller, plus an optional downsampling module (``ds``).

    Judging by the class name and by the token-sequence layout built in
    :meth:`_shared_roi_transform`, this targets a Swin-style backbone
    (NOTE(review): confirm). As in the ResNet variant, ``self.res5`` is
    ``None``: the per-region feature extractor is supplied to :meth:`forward`.
    """

    @configurable
    def __init__(
        self,
        *,
        in_features: List[str],
        pooler: ROIPooler,
        res5: Optional[nn.Module],
        box_predictor: nn.Module,
        mask_head: Optional[nn.Module] = None,
        **kwargs,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            in_features (list[str]): list of backbone feature map names to use for
                feature extraction
            pooler (ROIPooler): pooler to extract region features from backbone
            res5 (nn.Module or None): stored as ``self.res5`` but not used by this
                class; ``from_config`` always passes ``None`` because the module
                is supplied to ``forward`` instead.
            box_predictor (nn.Module): make box predictions from the feature.
                Should have the same interface as :class:`FastRCNNOutputLayers`.
            mask_head (nn.Module): transform features to make mask predictions
        """
        super().__init__(**kwargs)
        self.in_features = in_features
        self.pooler = pooler
        self.res5 = res5  # None, this head uses the last stage from backbone
        self.box_predictor = box_predictor
        self.mask_on = mask_head is not None
        if self.mask_on:
            self.mask_head = mask_head

    @classmethod
    def from_config(cls, cfg, input_shape):
        """
        Build the keyword arguments for ``__init__`` from a config node.

        Args:
            cfg: the config node (reads ``MODEL.ROI_HEADS``, ``MODEL.ROI_BOX_HEAD``,
                ``MODEL.RESNETS``, ``MODEL.MASK_ON`` and ``MODEL.KEYPOINT_ON``).
            input_shape: mapping from feature name to an object with a ``stride``
                attribute.

        Returns:
            dict: keyword arguments for ``__init__``.
        """
        # fmt: off
        ret = super().from_config(cfg)
        in_features = ret["in_features"] = cfg.MODEL.ROI_HEADS.IN_FEATURES
        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        pooler_scales     = (1.0 / input_shape[in_features[0]].stride, )
        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        mask_on           = cfg.MODEL.MASK_ON
        # fmt: on
        assert not cfg.MODEL.KEYPOINT_ON
        assert len(in_features) == 1

        ret["pooler"] = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )

        # No per-region module is built here (the former
        # `cls._build_res5_block(cfg)` path is disabled); only a channel count is
        # derived from the config to size the predictor and mask head.
        ret["res5"] = None
        out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * 8
        ret["box_predictor"] = FastRCNNOutputLayers(
            cfg, ShapeSpec(channels=out_channels, height=1, width=1)
        )

        if mask_on:
            ret["mask_head"] = build_mask_head(
                cfg,
                ShapeSpec(channels=out_channels, width=pooler_resolution, height=pooler_resolution),
            )
        return ret

    def _shared_roi_transform(self, features, boxes, backbone_res5, backbone_ds=None):
        """
        Pool ``features`` over ``boxes`` and run the last backbone stage on them.

        Args:
            features: list of feature maps handed to the pooler.
            boxes: per-image boxes handed to the pooler.
            backbone_res5: callable producing the per-region features.
            backbone_ds: optional callable applied before ``backbone_res5``.
                Defaults to ``None`` so that three-argument callers (such as
                ``forward_with_given_boxes``) keep working.

        Returns:
            Whatever ``backbone_res5`` returns.
        """
        x = self.pooler(features, boxes)
        if backbone_ds:
            # Flatten dims 2.. into one axis and swap it with dim 1 before
            # handing the sequence, together with the pooled height/width, to
            # the downsampling module.
            x_flattened = x.flatten(2).transpose(1, 2)
            x_ds = backbone_ds(x_flattened, x.shape[2], x.shape[3])
            # The last stage is told the height/width are halved
            # (presumably `backbone_ds` downsamples by 2 -- TODO confirm).
            return backbone_res5(x_ds, x.shape[2] // 2, x.shape[3] // 2)
        else:
            return backbone_res5(x)

    def forward(self, images, features, proposals, queries, targets=None,
                res5=None, ds=None, norm=None, vision_projection=None, attnpool=None):
        """
        See :meth:`ROIHeads.forward`.

        Args:
            images: unused; deleted immediately.
            features: mapping from feature name to feature map.
            proposals (list[Instances]): per-image proposals with ``proposal_boxes``.
            queries: forwarded to the box predictor on the non-attention-pool path.
            targets: ground truth; required in training.
            res5: callable applied to the pooled region features.
            ds: optional callable applied before ``res5``.
            norm, vision_projection: used only when the region transform returns
                a tuple: ``norm`` is applied to its first element and the
                averaged result is multiplied by ``vision_projection``.
            attnpool: optional callable applied to the region features.

        Returns:
            In training ``([], losses)``; in inference ``(pred_instances, {})``.
        """
        del images

        if self.training:
            assert targets, "'targets' argument is required during training"
            proposals = self.label_and_sample_proposals(proposals, targets)
        del targets

        proposal_boxes = [x.proposal_boxes for x in proposals]
        box_features = self._shared_roi_transform(
            [features[f] for f in self.in_features], proposal_boxes, res5, ds,
        )
        if isinstance(box_features, tuple):
            # Normalise the first output, average over dim 1, project, then
            # L2-normalise along the last dim.
            box_features = norm(box_features[0]).mean(1)
            box_features = box_features @ vision_projection
            box_features = box_features / box_features.norm(dim=-1, keepdim=True)

        if attnpool:  # att pooling
            # NOTE(review): `queries` is not forwarded on this path, unlike the
            # branch below -- confirm against the box predictor.
            att_feats = attnpool(box_features)
            predictions = self.box_predictor(att_feats)
        else:  # mean pooling
            predictions = self.box_predictor(box_features, queries)

        if self.training:
            del features
            losses = self.box_predictor.losses(predictions, proposals)
            if self.mask_on:
                proposals, fg_selection_masks = select_foreground_proposals(
                    proposals, self.num_classes
                )
                # Since the ROI feature transform is shared between boxes and masks,
                # we don't need to recompute features. The mask loss is only defined
                # on foreground proposals, so we need to select out the foreground
                # features.
                mask_features = box_features[torch.cat(fg_selection_masks, dim=0)]
                del box_features
                losses.update(self.mask_head(mask_features, proposals))
            return [], losses
        else:
            # Mask prediction via `forward_with_given_boxes` is intentionally
            # not run here; only box predictions are returned.
            pred_instances, _ = self.box_predictor.inference(predictions, proposals)
            return pred_instances, {}

    def forward_with_given_boxes(self, features, instances, res5=None, ds=None):
        """
        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.

        Args:
            features: same as in `forward()`
            instances (list[Instances]): instances to predict other outputs. Expect the keys
                "pred_boxes" and "pred_classes" to exist.
            res5: callable applied to the pooled region features; only used when
                the mask head is enabled.
            ds: optional callable applied before ``res5``.

        Returns:
            instances (Instances):
                the same `Instances` object, with extra
                fields such as `pred_masks` or `pred_keypoints`.
        """
        assert not self.training
        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")

        if self.mask_on:
            features = [features[f] for f in self.in_features]
            x = self._shared_roi_transform(
                features, [x.pred_boxes for x in instances], res5, ds
            )
            return self.mask_head(x, instances)
        else:
            return instances
@ROI_HEADS_REGISTRY.register()
class PretrainRes5ROIHeads(ROIHeads):
    """
    ROI heads used for pretraining a CLIP ResNet: region features only, with
    no box predictor and no mask head.

    As in :class:`CLIPRes5ROIHeads`, ``self.res5`` is ``None`` and the last
    backbone stage is supplied to :meth:`forward` by the caller.
    See :paper:`ResNet` Appendix A.
    """

    @configurable
    def __init__(
        self,
        *,
        in_features: List[str],
        pooler: ROIPooler,
        res5: Optional[nn.Module],
        box_predictor: Optional[nn.Module] = None,
        mask_head: Optional[nn.Module] = None,
        **kwargs,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            in_features (list[str]): list of backbone feature map names to use for
                feature extraction
            pooler (ROIPooler): pooler to extract region features from backbone
            res5 (nn.Module or None): stored as ``self.res5`` but not used by this
                class; ``from_config`` always passes ``None``.
            box_predictor: accepted for interface parity and ignored;
                ``self.box_predictor`` is always ``None``.
            mask_head: accepted for interface parity and ignored;
                ``self.mask_on`` is always ``None``.
        """
        super().__init__(**kwargs)
        self.in_features = in_features
        self.pooler = pooler
        self.res5 = res5  # None, this head uses the res5 from backbone
        self.box_predictor = None
        self.mask_on = None

    @classmethod
    def from_config(cls, cfg, input_shape):
        """
        Build the keyword arguments for ``__init__`` from a config node.

        Args:
            cfg: the config node (reads ``MODEL.ROI_HEADS``, ``MODEL.ROI_BOX_HEAD``
                and ``MODEL.KEYPOINT_ON``).
            input_shape: mapping from feature name to an object with a ``stride``
                attribute.

        Returns:
            dict: keyword arguments for ``__init__``; ``res5``, ``box_predictor``
            and ``mask_head`` are all ``None``.
        """
        # fmt: off
        ret = super().from_config(cfg)
        in_features = ret["in_features"] = cfg.MODEL.ROI_HEADS.IN_FEATURES
        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        pooler_scales     = (1.0 / input_shape[in_features[0]].stride, )
        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        # fmt: on
        assert not cfg.MODEL.KEYPOINT_ON
        assert len(in_features) == 1

        ret["pooler"] = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )

        # Nothing downstream of the pooler is built for pretraining.
        ret["res5"] = None
        ret["box_predictor"] = None
        ret["mask_head"] = None
        return ret

    def _shared_roi_transform(self, features, boxes, backbone_res5, backbone_ds=None):
        """
        Pool ``features`` over ``boxes`` and run the last backbone stage on them.

        Args:
            features: list of feature maps handed to the pooler.
            boxes: per-image boxes handed to the pooler.
            backbone_res5: callable producing the per-region features.
            backbone_ds: optional callable applied to the pooled features
                before ``backbone_res5``.

        Returns:
            Whatever ``backbone_res5`` returns.
        """
        x = self.pooler(features, boxes)
        if backbone_ds:
            return backbone_res5(backbone_ds(x))
        else:
            return backbone_res5(x)

    def forward(self, images, features, proposals, targets=None, res5=None, ds=None, attnpool=None):
        """
        Compute one feature vector per region; no predictions or losses.

        Args:
            images: unused.
            features: mapping from feature name to feature map.
            proposals: either a list of ``Boxes`` (grid boxes) or a list of
                objects exposing ``proposal_boxes`` (object proposals).
            targets: unused; proposals are not labelled or sampled here.
            res5: callable applied to the pooled region features.
            ds: optional callable applied before ``res5``.
            attnpool: optional callable applied to the region features instead
                of spatial mean pooling.

        Returns:
            The region features: ``attnpool(box_features)`` when ``attnpool`` is
            given, otherwise ``box_features`` averaged over dims 2 and 3.
        """
        if isinstance(proposals[0], Boxes):  # grid boxes
            proposal_boxes = proposals
        else:  # object proposals
            proposal_boxes = [x.proposal_boxes for x in proposals]

        # `ds` must be forwarded: the region transform takes the optional
        # downsampling module as its fourth argument.
        box_features = self._shared_roi_transform(
            [features[f] for f in self.in_features], proposal_boxes, res5, ds
        )
        if attnpool:  # att pooling
            region_feats = attnpool(box_features)
        else:  # mean pooling
            region_feats = box_features.mean(dim=[2, 3])

        return region_feats

    def forward_with_given_boxes(self, features, instances, res5=None):
        """
        Validate ``instances`` and return them unchanged (there are no extra heads).

        Args:
            features: unused.
            instances (list[Instances]): instances to predict other outputs. Expect the keys
                "pred_boxes" and "pred_classes" to exist.
            res5: unused.

        Returns:
            instances (Instances): the same `Instances` object, unmodified.
        """
        assert not self.training
        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")
        return instances
@ROI_HEADS_REGISTRY.register()
class CLIPStandardROIHeads(ROIHeads):
    """
    Created for CLIP ResNet. This head uses the attention pool layers from backbone.
    It's "standard" in a sense that there is no ROI transform sharing
    or feature sharing between tasks.
    Each head independently processes the input features by each head's
    own pooler and head.

    This class is used by most models, such as FPN and C5.
    To implement more models, you can subclass it and implement a different
    :meth:`forward()` or a head.
    """

    @configurable
    def __init__(
        self,
        *,
        box_in_features: List[str],
        box_pooler: ROIPooler,
        box_head: nn.Module,
        box_predictor: nn.Module,
        mask_in_features: Optional[List[str]] = None,
        mask_pooler: Optional[ROIPooler] = None,
        mask_head: Optional[nn.Module] = None,
        train_on_pred_boxes: bool = False,
        **kwargs,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            box_in_features (list[str]): list of feature names to use for the box head.
            box_pooler (ROIPooler): pooler to extract region features for box head
            box_head (nn.Module): transform features to make box predictions.
                May be None (see ``_init_box_head``), in which case an
                ``attnpool`` must be passed to ``forward``.
            box_predictor (nn.Module): make box predictions from the feature.
                Should have the same interface as :class:`FastRCNNOutputLayers`.
            mask_in_features (list[str]): list of feature names to use for the mask
                pooler or mask head. None if not using mask head.
            mask_pooler (ROIPooler): pooler to extract region features from image features.
                The mask head will then take region features to make predictions.
                If None, the mask head will directly take the dict of image features
                defined by `mask_in_features`
            mask_head (nn.Module): transform features to make mask predictions
            train_on_pred_boxes (bool): whether to use proposal boxes or
                predicted boxes from the box head to train other heads.
        """
        super().__init__(**kwargs)
        # keep self.in_features for backward compatibility
        self.in_features = self.box_in_features = box_in_features
        self.box_pooler = box_pooler
        self.box_head = box_head
        self.box_predictor = box_predictor

        self.mask_on = mask_in_features is not None
        if self.mask_on:
            self.mask_in_features = mask_in_features
            self.mask_pooler = mask_pooler
            self.mask_head = mask_head

        self.train_on_pred_boxes = train_on_pred_boxes

    @classmethod
    def from_config(cls, cfg, input_shape):
        """
        Build the keyword arguments for ``__init__`` from a config node.

        Returns:
            dict: the base-class arguments plus ``train_on_pred_boxes`` and the
            entries produced by ``_init_box_head`` / ``_init_mask_head``.
        """
        ret = super().from_config(cfg)
        ret["train_on_pred_boxes"] = cfg.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES
        # Subclasses that have not been updated to use from_config style construction
        # may have overridden _init_*_head methods. In this case, those overridden methods
        # will not be classmethods and we need to avoid trying to call them here.
        # We test for this with ismethod which only returns True for bound methods of cls.
        # Such subclasses will need to handle calling their overridden _init_*_head methods.
        if inspect.ismethod(cls._init_box_head):
            ret.update(cls._init_box_head(cfg, input_shape))
        if inspect.ismethod(cls._init_mask_head):
            ret.update(cls._init_mask_head(cfg, input_shape))
        return ret

    @classmethod
    def _init_box_head(cls, cfg, input_shape):
        """
        Build the box pooler, box head and box predictor from the config.

        Returns:
            dict: ``box_in_features``, ``box_pooler``, ``box_head`` and
            ``box_predictor`` entries for ``__init__``.
        """
        # fmt: off
        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        # fmt: on

        # If StandardROIHeads is applied on multiple feature maps (as in FPN),
        # then we share the same predictors and therefore the channel counts must be the same
        in_channels = [input_shape[f].channels for f in in_features]
        # Check all channel counts are equal
        assert len(set(in_channels)) == 1, in_channels
        in_channels = in_channels[0]

        box_pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )
        # Here we split "box head" and "box predictor", which is mainly due to historical reasons.
        # They are used together so the "box predictor" layers should be part of the "box head".
        # New subclasses of ROIHeads do not need "box predictor"s.
        # No box head is built when the text-embedding classifier is enabled.
        box_head = None if cfg.MODEL.CLIP.USE_TEXT_EMB_CLASSIFIER else build_box_head(
            cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution)
        )
        # The predictor input size is fixed rather than taken from the box head
        # (presumably the CLIP attention-pool output width -- TODO confirm).
        box_head_output_shape = 1024
        box_predictor = FastRCNNOutputLayers(cfg, box_head_output_shape)
        return {
            "box_in_features": in_features,
            "box_pooler": box_pooler,
            "box_head": box_head,
            "box_predictor": box_predictor,
        }

    @classmethod
    def _init_mask_head(cls, cfg, input_shape):
        """
        Build the mask pooler and mask head from the config.

        Returns:
            dict: empty when ``cfg.MODEL.MASK_ON`` is false; otherwise
            ``mask_in_features``, ``mask_pooler`` (None when no pooler type is
            configured) and ``mask_head``.
        """
        if not cfg.MODEL.MASK_ON:
            return {}
        # fmt: off
        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
        pooler_resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
        sampling_ratio    = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
        pooler_type       = cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE
        # fmt: on

        in_channels = [input_shape[f].channels for f in in_features][0]

        ret = {"mask_in_features": in_features}
        ret["mask_pooler"] = (
            ROIPooler(
                output_size=pooler_resolution,
                scales=pooler_scales,
                sampling_ratio=sampling_ratio,
                pooler_type=pooler_type,
            )
            if pooler_type
            else None
        )
        if pooler_type:
            shape = ShapeSpec(
                channels=in_channels, width=pooler_resolution, height=pooler_resolution
            )
        else:
            # Without a pooler the mask head consumes the raw feature maps.
            shape = {f: input_shape[f] for f in in_features}
        ret["mask_head"] = build_mask_head(cfg, shape)
        return ret

    def forward(
        self,
        images: ImageList,
        features: Dict[str, torch.Tensor],
        proposals: List[Instances],
        targets: Optional[List[Instances]] = None,
        attnpool=None,
    ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]:
        """
        See :class:`ROIHeads.forward`.

        Args:
            attnpool: optional callable applied to the pooled box features in
                place of ``self.box_head``.
        """
        del images
        if self.training:
            assert targets, "'targets' argument is required during training"
            proposals = self.label_and_sample_proposals(proposals, targets)
        del targets

        if self.training:
            losses = self._forward_box(features, proposals, attnpool=attnpool)
            # Usually the original proposals used by the box head are used by the mask, keypoint
            # heads. But when `self.train_on_pred_boxes is True`, proposals will contain boxes
            # predicted by the box head.
            losses.update(self._forward_mask(features, proposals))
            return proposals, losses
        else:
            pred_instances = self._forward_box(features, proposals, attnpool=attnpool)
            # During inference cascaded prediction is used: the mask and keypoints heads are only
            # applied to the top scoring box detections.
            pred_instances = self.forward_with_given_boxes(features, pred_instances)
            return pred_instances, {}

    def forward_with_given_boxes(
        self, features: Dict[str, torch.Tensor], instances: List[Instances]
    ) -> List[Instances]:
        """
        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.

        This is useful for downstream tasks where a box is known, but need to obtain
        other attributes (outputs of other heads).
        Test-time augmentation also uses this.

        Args:
            features: same as in `forward()`
            instances (list[Instances]): instances to predict other outputs. Expect the keys
                "pred_boxes" and "pred_classes" to exist.

        Returns:
            list[Instances]:
                the same `Instances` objects, with extra
                fields such as `pred_masks` or `pred_keypoints`.
        """
        assert not self.training
        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")

        instances = self._forward_mask(features, instances)
        return instances

    def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances], attnpool=None):
        """
        Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`,
            the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument.

        Args:
            features (dict[str, Tensor]): mapping from feature map names to tensor.
                Same as in :meth:`ROIHeads.forward`.
            proposals (list[Instances]): the per-image object proposals with
                their matching ground truth.
                Each has fields "proposal_boxes", and "objectness_logits",
                "gt_classes", "gt_boxes".
            attnpool: optional callable applied to the pooled box features in
                place of ``self.box_head``.

        Returns:
            In training, a dict of losses.
            In inference, a list of `Instances`, the predicted instances.
        """
        features = [features[f] for f in self.box_in_features]
        box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
        if attnpool:  # att pooling
            box_features = attnpool(box_features)
        else:  # default FPN pooling (FastRCNNConvFCHead)
            # NOTE(review): `self.box_head` is None when the text-embedding
            # classifier is configured, so this path requires a box head.
            box_features = self.box_head(box_features)
        predictions = self.box_predictor(box_features)
        del box_features

        if self.training:
            losses = self.box_predictor.losses(predictions, proposals)
            # proposals is modified in-place below, so losses must be computed first.
            if self.train_on_pred_boxes:
                with torch.no_grad():
                    pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
                        predictions, proposals
                    )
                    for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
                        proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image)
            return losses
        else:
            pred_instances, _ = self.box_predictor.inference(predictions, proposals)
            return pred_instances

    def _forward_mask(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
        """
        Forward logic of the mask prediction branch.

        Args:
            features (dict[str, Tensor]): mapping from feature map names to tensor.
                Same as in :meth:`ROIHeads.forward`.
            instances (list[Instances]): the per-image instances to train/predict masks.
                In training, they can be the proposals.
                In inference, they can be the boxes predicted by R-CNN box head.

        Returns:
            In training, a dict of losses.
            In inference, update `instances` with new fields "pred_masks" and return it.
        """
        if not self.mask_on:
            return {} if self.training else instances

        if self.training:
            # head is only trained on positive proposals.
            instances, _ = select_foreground_proposals(instances, self.num_classes)

        if self.mask_pooler is not None:
            features = [features[f] for f in self.mask_in_features]
            boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in instances]
            features = self.mask_pooler(features, boxes)
        else:
            features = {f: features[f] for f in self.mask_in_features}
        return self.mask_head(features, instances)