Source code for monai.apps.detection.networks.retinanet_network

# Copyright (c) MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# =========================================================================
# Adapted from https://github.com/pytorch/vision/blob/main/torchvision/models/detection/retinanet.py
# which has the following license...
# https://github.com/pytorch/vision/blob/main/LICENSE

# BSD 3-Clause License

# Copyright (c) Soumith Chintala 2016,
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.

# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.

# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
"""
Part of this script is adapted from
https://github.com/pytorch/vision/blob/main/torchvision/models/detection/retinanet.py
"""

from __future__ import annotations

import math
import warnings
from collections.abc import Callable, Sequence
from typing import Any, Dict

import torch
from torch import Tensor, nn

from monai.networks.blocks.backbone_fpn_utils import BackboneWithFPN, _resnet_fpn_extractor
from monai.networks.layers.factories import Conv
from monai.networks.nets import resnet
from monai.utils import ensure_tuple_rep, look_up_option, optional_import

_validate_trainable_layers, _ = optional_import(
    "torchvision.models.detection.backbone_utils", name="_validate_trainable_layers"
)


class RetinaNetClassificationHead(nn.Module):
    """
    A classification head for use in RetinaNet.

    This head takes a list of feature maps as inputs, and outputs a list of classification maps.
    Each output map has the same spatial size as the corresponding input feature map,
    and the number of output channels is num_anchors * num_classes.

    Args:
        in_channels: number of channels of the input feature
        num_anchors: number of anchors to be predicted
        num_classes: number of classes to be predicted
        spatial_dims: spatial dimension of the network, should be 2 or 3.
        prior_probability: prior probability to initialize classification convolutional layers.
    """

    def __init__(
        self, in_channels: int, num_anchors: int, num_classes: int, spatial_dims: int, prior_probability: float = 0.01
    ):
        super().__init__()

        conv_type: Callable = Conv[Conv.CONV, spatial_dims]
        conv = []
        for _ in range(4):
            conv.append(conv_type(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
            conv.append(nn.GroupNorm(num_groups=8, num_channels=in_channels))
            conv.append(nn.ReLU())
        self.conv = nn.Sequential(*conv)

        for layer in self.conv.children():
            if isinstance(layer, conv_type):  # type: ignore
                torch.nn.init.normal_(layer.weight, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)

        self.cls_logits = conv_type(in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)
        torch.nn.init.normal_(self.cls_logits.weight, std=0.01)
        torch.nn.init.constant_(self.cls_logits.bias, -math.log((1 - prior_probability) / prior_probability))

        self.num_classes = num_classes
        self.num_anchors = num_anchors

    def forward(self, x: list[Tensor]) -> list[Tensor]:
        """
        It takes a list of feature maps as inputs, and outputs a list of classification maps.
        Each output classification map has the same spatial size as the corresponding input feature map,
        and the number of output channels is num_anchors * num_classes.

        Args:
            x: list of feature maps, x[i] is a (B, in_channels, H_i, W_i) or (B, in_channels, H_i, W_i, D_i) Tensor.

        Return:
            cls_logits_maps, list of classification maps.
            cls_logits_maps[i] is a (B, num_anchors * num_classes, H_i, W_i) or
            (B, num_anchors * num_classes, H_i, W_i, D_i) Tensor.

        """
        cls_logits_maps = []

        if isinstance(x, Tensor):
            feature_maps = [x]
        else:
            feature_maps = x

        for features in feature_maps:
            cls_logits = self.conv(features)
            cls_logits = self.cls_logits(cls_logits)

            cls_logits_maps.append(cls_logits)

            if torch.isnan(cls_logits).any() or torch.isinf(cls_logits).any():
                if torch.is_grad_enabled():
                    raise ValueError("cls_logits is NaN or Inf.")
                else:
                    warnings.warn("cls_logits is NaN or Inf.")

        return cls_logits_maps
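
# Usage sketch for the classification head. The in_channels, num_anchors, num_classes and
# feature map sizes below are hypothetical, chosen only for illustration:
#
#     cls_head = RetinaNetClassificationHead(in_channels=32, num_anchors=3, num_classes=5, spatial_dims=3)
#     feature_maps = [torch.rand(2, 32, 16, 16, 8), torch.rand(2, 32, 8, 8, 4)]
#     cls_logits_maps = cls_head(feature_maps)
#     # each output keeps the spatial size of its input and has num_anchors * num_classes = 15 channels,
#     # e.g. cls_logits_maps[0].shape == (2, 15, 16, 16, 8)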


class RetinaNetRegressionHead(nn.Module):
    """
    A regression head for use in RetinaNet.

    This head takes a list of feature maps as inputs, and outputs a list of box regression maps.
    Each output box regression map has the same spatial size as the corresponding input feature map,
    and the number of output channels is num_anchors * 2 * spatial_dims.

    Args:
        in_channels: number of channels of the input feature
        num_anchors: number of anchors to be predicted
        spatial_dims: spatial dimension of the network, should be 2 or 3.
    """

    def __init__(self, in_channels: int, num_anchors: int, spatial_dims: int):
        super().__init__()

        conv_type: Callable = Conv[Conv.CONV, spatial_dims]

        conv = []
        for _ in range(4):
            conv.append(conv_type(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
            conv.append(nn.GroupNorm(num_groups=8, num_channels=in_channels))
            conv.append(nn.ReLU())
        self.conv = nn.Sequential(*conv)

        self.bbox_reg = conv_type(in_channels, num_anchors * 2 * spatial_dims, kernel_size=3, stride=1, padding=1)
        torch.nn.init.normal_(self.bbox_reg.weight, std=0.01)
        torch.nn.init.zeros_(self.bbox_reg.bias)

        for layer in self.conv.children():
            if isinstance(layer, conv_type):  # type: ignore
                torch.nn.init.normal_(layer.weight, std=0.01)
                torch.nn.init.zeros_(layer.bias)

    def forward(self, x: list[Tensor]) -> list[Tensor]:
        """
        It takes a list of feature maps as inputs, and outputs a list of box regression maps.
        Each output box regression map has the same spatial size as the corresponding input feature map,
        and the number of output channels is num_anchors * 2 * spatial_dims.

        Args:
            x: list of feature maps, x[i] is a (B, in_channels, H_i, W_i) or (B, in_channels, H_i, W_i, D_i) Tensor.

        Return:
            box_regression_maps, list of box regression maps.
            box_regression_maps[i] is a (B, num_anchors * 2 * spatial_dims, H_i, W_i) or
            (B, num_anchors * 2 * spatial_dims, H_i, W_i, D_i) Tensor.

        """
        box_regression_maps = []

        if isinstance(x, Tensor):
            feature_maps = [x]
        else:
            feature_maps = x

        for features in feature_maps:
            box_regression = self.conv(features)
            box_regression = self.bbox_reg(box_regression)

            box_regression_maps.append(box_regression)

            if torch.isnan(box_regression).any() or torch.isinf(box_regression).any():
                if torch.is_grad_enabled():
                    raise ValueError("box_regression is NaN or Inf.")
                else:
                    warnings.warn("box_regression is NaN or Inf.")

        return box_regression_maps
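
# Usage sketch for the regression head, with the same hypothetical sizes as above:
#
#     reg_head = RetinaNetRegressionHead(in_channels=32, num_anchors=3, spatial_dims=3)
#     feature_maps = [torch.rand(2, 32, 16, 16, 8), torch.rand(2, 32, 8, 8, 4)]
#     box_regression_maps = reg_head(feature_maps)
#     # each output keeps the spatial size of its input and has num_anchors * 2 * spatial_dims = 18 channels,
#     # e.g. box_regression_maps[0].shape == (2, 18, 16, 16, 8)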


class RetinaNet(nn.Module):
    """
    The network used in RetinaNet.

    It takes an image tensor as inputs, and outputs either
    1) a dictionary ``head_outputs``, where ``head_outputs[self.cls_key]`` is the predicted classification maps,
    a list of Tensor, and ``head_outputs[self.box_reg_key]`` is the predicted box regression maps, a list of Tensor;
    or 2) a list of 2N tensors ``head_outputs``, with first N tensors being the predicted classification maps
    and second N tensors being the predicted box regression maps.

    Args:
        spatial_dims: number of spatial dimensions of the images. We support both 2D and 3D images.
        num_classes: number of output classes of the model (excluding the background).
        num_anchors: number of anchors at each location.
        feature_extractor: a network that outputs feature maps from the input images,
            each feature map corresponds to a different resolution.
            Its output can have a format of Tensor, Dict[Any, Tensor], or Sequence[Tensor].
            It can be the output of ``resnet_fpn_feature_extractor(*args, **kwargs)``.
        size_divisible: the spatial size of the network input should be divisible by size_divisible,
            decided by the feature_extractor.
        use_list_output: default False. If False, the network outputs a dictionary ``head_outputs``,
            ``head_outputs[self.cls_key]`` is the predicted classification maps, a list of Tensor.
            ``head_outputs[self.box_reg_key]`` is the predicted box regression maps, a list of Tensor.
            If True, the network outputs a list of 2N tensors ``head_outputs``, with first N tensors being
            the predicted classification maps and second N tensors being the predicted box regression maps.

    Example:

        .. code-block:: python

            from monai.networks.nets import resnet

            spatial_dims = 3  # 3D network
            conv1_t_stride = (2, 2, 1)  # stride of first convolutional layer in backbone
            backbone = resnet.ResNet(
                spatial_dims=spatial_dims,
                block=resnet.ResNetBottleneck,
                layers=[3, 4, 6, 3],
                block_inplanes=resnet.get_inplanes(),
                n_input_channels=1,
                conv1_t_stride=conv1_t_stride,
                conv1_t_size=(7, 7, 7),
            )

            # This feature_extractor outputs 4-level feature maps.
            # number of output feature maps is len(returned_layers)+1
            returned_layers = [1, 2, 3]  # returned layers from feature pyramid network
            feature_extractor = resnet_fpn_feature_extractor(
                backbone=backbone,
                spatial_dims=spatial_dims,
                pretrained_backbone=False,
                trainable_backbone_layers=None,
                returned_layers=returned_layers,
            )

            # This feature_extractor requires input image spatial size
            # to be divisible by (32, 32, 16).
            size_divisible = tuple(2 * s * 2 ** max(returned_layers) for s in conv1_t_stride)
            model = RetinaNet(
                spatial_dims=spatial_dims,
                num_classes=5,
                num_anchors=6,
                feature_extractor=feature_extractor,
                size_divisible=size_divisible,
            ).to(device)
            result = model(torch.rand(2, 1, 128, 128, 128))
            cls_logits_maps = result["classification"]  # a list of len(returned_layers)+1 Tensor
            box_regression_maps = result["box_regression"]  # a list of len(returned_layers)+1 Tensor
    """

    def __init__(
        self,
        spatial_dims: int,
        num_classes: int,
        num_anchors: int,
        feature_extractor: nn.Module,
        size_divisible: Sequence[int] | int = 1,
        use_list_output: bool = False,
    ):
        super().__init__()

        self.spatial_dims = look_up_option(spatial_dims, supported=[1, 2, 3])
        self.num_classes = num_classes
        self.size_divisible = ensure_tuple_rep(size_divisible, self.spatial_dims)
        self.use_list_output = use_list_output

        if not hasattr(feature_extractor, "out_channels"):
            raise ValueError(
                "feature_extractor should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)"
            )
        self.feature_extractor = feature_extractor

        self.feature_map_channels: int = self.feature_extractor.out_channels
        self.num_anchors = num_anchors
        self.classification_head = RetinaNetClassificationHead(
            self.feature_map_channels, self.num_anchors, self.num_classes, spatial_dims=self.spatial_dims
        )
        self.regression_head = RetinaNetRegressionHead(
            self.feature_map_channels, self.num_anchors, spatial_dims=self.spatial_dims
        )

        self.cls_key: str = "classification"
        self.box_reg_key: str = "box_regression"

    def forward(self, images: Tensor) -> Any:
        """
        It takes an image tensor as input, and outputs predicted classification maps
        and predicted box regression maps in ``head_outputs``.

        Args:
            images: input images, sized (B, img_channels, H, W) or (B, img_channels, H, W, D).

        Return:
            1) If self.use_list_output is False, output a dictionary ``head_outputs`` with
            keys including self.cls_key and self.box_reg_key.
            ``head_outputs[self.cls_key]`` is the predicted classification maps, a list of Tensor.
            ``head_outputs[self.box_reg_key]`` is the predicted box regression maps, a list of Tensor.
            2) If self.use_list_output is True, outputs a list of 2N tensors ``head_outputs``,
            with first N tensors being the predicted classification maps and second N tensors
            being the predicted box regression maps.

        """
        # compute feature maps list from the input images.
        features = self.feature_extractor(images)
        if isinstance(features, Tensor):
            feature_maps = [features]
        elif torch.jit.isinstance(features, Dict[str, Tensor]):
            feature_maps = list(features.values())
        else:
            feature_maps = list(features)

        if not isinstance(feature_maps[0], Tensor):
            raise ValueError("feature_extractor output format must be Tensor, Dict[str, Tensor], or Sequence[Tensor].")

        # compute classification and box regression maps from the feature maps
        # expandable for mask prediction in the future
        if not self.use_list_output:
            # output dict
            head_outputs = {self.cls_key: self.classification_head(feature_maps)}
            head_outputs[self.box_reg_key] = self.regression_head(feature_maps)
            return head_outputs
        else:
            # output list of tensors, first half is classification, second half is box regression
            head_outputs_sequence = self.classification_head(feature_maps) + self.regression_head(feature_maps)
            return head_outputs_sequence
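
# Usage sketch for the two output formats (assumes a ``model`` built as in the class docstring above):
#
#     head_outputs = model(torch.rand(2, 1, 128, 128, 128))
#     cls_logits_maps = head_outputs[model.cls_key]          # list of N classification maps
#     box_regression_maps = head_outputs[model.box_reg_key]  # list of N box regression maps
#
#     # with use_list_output=True, the same maps come back as a flat list of 2N tensors:
#     # the first N entries are classification maps and the last N are box regression maps.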


def resnet_fpn_feature_extractor(
    backbone: resnet.ResNet,
    spatial_dims: int,
    pretrained_backbone: bool = False,
    returned_layers: Sequence[int] = (1, 2, 3),
    trainable_backbone_layers: int | None = None,
) -> BackboneWithFPN:
    """
    Constructs a feature extractor network with a ResNet-FPN backbone, used as feature_extractor in RetinaNet.

    Reference: `"Focal Loss for Dense Object Detection" <https://arxiv.org/abs/1708.02002>`_.

    The returned feature_extractor network takes an image tensor as inputs,
    and outputs a dictionary that maps string to the extracted feature maps (Tensor).

    The input to the returned feature_extractor is expected to be a list of tensors,
    each of shape ``[C, H, W]`` or ``[C, H, W, D]``, one for each image.
    Different images can have different sizes.

    Args:
        backbone: a ResNet model, used as backbone.
        spatial_dims: number of spatial dimensions of the images. We support both 2D and 3D images.
        pretrained_backbone: whether the backbone has been pre-trained.
        returned_layers: returned layers to extract feature maps. Each returned layer should be in the range [1,4].
            len(returned_layers)+1 will be the number of extracted feature maps.
            There is an extra maxpooling layer LastLevelMaxPool() appended.
        trainable_backbone_layers: number of trainable (not frozen) resnet layers starting from final block.
            Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
            When pretrained_backbone is False, this value is set to be 5.
            When pretrained_backbone is True, if ``None`` is passed (the default) this value is set to 3.

    Example:

        .. code-block:: python

            from monai.networks.nets import resnet

            spatial_dims = 3  # 3D network
            backbone = resnet.ResNet(
                spatial_dims=spatial_dims,
                block=resnet.ResNetBottleneck,
                layers=[3, 4, 6, 3],
                block_inplanes=resnet.get_inplanes(),
                n_input_channels=1,
                conv1_t_stride=(2, 2, 1),
                conv1_t_size=(7, 7, 7),
            )

            # This feature_extractor outputs 4-level feature maps.
            # number of output feature maps is len(returned_layers)+1
            feature_extractor = resnet_fpn_feature_extractor(
                backbone=backbone,
                spatial_dims=spatial_dims,
                pretrained_backbone=False,
                trainable_backbone_layers=None,
                returned_layers=[1, 2, 3],
            )
            model = RetinaNet(
                spatial_dims=spatial_dims,
                num_classes=5,
                num_anchors=6,
                feature_extractor=feature_extractor,
                size_divisible=32,
            ).to(device)
    """
    # If pretrained_backbone is False, valid_trainable_backbone_layers = 5.
    # If pretrained_backbone is True, valid_trainable_backbone_layers = trainable_backbone_layers, or 3 if None.
    valid_trainable_backbone_layers: int = _validate_trainable_layers(
        pretrained_backbone, trainable_backbone_layers, max_value=5, default_value=3
    )

    feature_extractor = _resnet_fpn_extractor(
        backbone,
        spatial_dims,
        valid_trainable_backbone_layers,
        returned_layers=list(returned_layers),
        extra_blocks=None,
    )
    return feature_extractor