# Source code for monai.apps.detection.utils.anchor_utils
# Copyright (c) MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========================================================================
# Adapted from https://github.com/pytorch/vision/blob/release/0.12/torchvision/models/detection/anchor_utils.py
# which has the following license...
# https://github.com/pytorch/vision/blob/main/LICENSE
# BSD 3-Clause License
# Copyright (c) Soumith Chintala 2016,
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
"""
This script is adapted from
https://github.com/pytorch/vision/blob/release/0.12/torchvision/models/detection/anchor_utils.py
"""
from __future__ import annotations
from typing import List, Sequence
import torch
from torch import Tensor, nn
from monai.utils import ensure_tuple
from monai.utils.misc import issequenceiterable
from monai.utils.module import look_up_option
class AnchorGenerator(nn.Module):
    """
    This module is modified from torchvision to support both 2D and 3D images.

    Module that generates anchors for a set of feature maps and image sizes.

    The module supports computing anchors at multiple sizes and aspect ratios
    per feature map. ``sizes`` and ``aspect_ratios`` should have the same number
    of elements, and it should correspond to the number of feature maps.

    ``sizes[i]`` and ``aspect_ratios[i]`` can have an arbitrary number of elements.
    For 2D images, anchor width and height w:h = 1:aspect_ratios[i,j].
    For 3D images, anchor width, height, and depth w:h:d = 1:aspect_ratios[i,j,0]:aspect_ratios[i,j,1].

    AnchorGenerator will output a set of ``sizes[i] * aspect_ratios[i]`` anchors
    per spatial location for feature map ``i``.

    Args:
        sizes: base size of each anchor.
            ``len(sizes)`` is the number of feature maps, i.e., the number of output levels for
            the feature pyramid network (FPN).
            Each element of ``sizes`` is a Sequence which represents several anchor sizes for each feature map.
        aspect_ratios: the aspect ratios of anchors. ``len(aspect_ratios) = len(sizes)``.
            For 2D images, each element of ``aspect_ratios[i]`` is a Sequence of float.
            For 3D images, each element of ``aspect_ratios[i]`` is a Sequence of 2 value Sequence.
        indexing: choose from {``'ij'``, ``'xy'``}, optional,
            Matrix (``'ij'``, default and recommended) or Cartesian (``'xy'``) indexing of output.

            - Matrix (``'ij'``, default and recommended) indexing keeps the original axis not changed.
            - To use other monai detection components, please set ``indexing = 'ij'``.
            - Cartesian (``'xy'``) indexing swaps axis 0 and 1.
            - For 2D cases, monai ``AnchorGenerator(sizes, aspect_ratios, indexing='xy')`` and
              ``torchvision.models.detection.anchor_utils.AnchorGenerator(sizes, aspect_ratios)`` are equivalent.

    Reference:
        https://github.com/pytorch/vision/blob/release/0.12/torchvision/models/detection/anchor_utils.py

    Example:

        .. code-block:: python

            # 2D example inputs for a 2-level feature maps
            sizes = ((10,12,14,16), (20,24,28,32))
            base_aspect_ratios = (1., 0.5, 2.)
            aspect_ratios = (base_aspect_ratios, base_aspect_ratios)
            anchor_generator = AnchorGenerator(sizes, aspect_ratios)

            # 3D example inputs for a 2-level feature maps
            sizes = ((10,12,14,16), (20,24,28,32))
            base_aspect_ratios = ((1., 1.), (1., 0.5), (0.5, 1.), (2., 2.))
            aspect_ratios = (base_aspect_ratios, base_aspect_ratios)
            anchor_generator = AnchorGenerator(sizes, aspect_ratios)
    """

    # torchscript-friendly annotation for the list of per-level anchor tensors
    __annotations__ = {"cell_anchors": List[torch.Tensor]}

    def __init__(
        self,
        sizes: Sequence[Sequence[int]] = ((20, 30, 40),),
        aspect_ratios: Sequence = (((0.5, 1), (1, 0.5)),),
        indexing: str = "ij",
    ) -> None:
        super().__init__()

        # normalize ``sizes`` to a tuple of per-level sequences
        if not issequenceiterable(sizes[0]):
            self.sizes = tuple((s,) for s in sizes)
        else:
            self.sizes = ensure_tuple(sizes)
        # a flat aspect_ratios sequence is shared across all feature-map levels
        if not issequenceiterable(aspect_ratios[0]):
            aspect_ratios = (aspect_ratios,) * len(self.sizes)

        if len(self.sizes) != len(aspect_ratios):
            raise ValueError(
                "len(sizes) and len(aspect_ratios) should be equal. It represents the number of feature maps."
            )

        # infer spatial dims from the aspect ratio element: a scalar means 2D,
        # a 2-value sequence means 3D; look_up_option restricts to {2, 3}
        spatial_dims = len(ensure_tuple(aspect_ratios[0][0])) + 1
        spatial_dims = look_up_option(spatial_dims, [2, 3])
        self.spatial_dims = spatial_dims

        self.indexing = look_up_option(indexing, ["ij", "xy"])
        self.aspect_ratios = aspect_ratios
        # one zero-centered anchor tensor per feature-map level
        self.cell_anchors = [
            self.generate_anchors(size, aspect_ratio) for size, aspect_ratio in zip(self.sizes, aspect_ratios)
        ]

    # This comment comes from torchvision.
    # TODO: https://github.com/pytorch/pytorch/issues/26792
    # For every (aspect_ratios, scales) combination, output a zero-centered anchor with those values.
    # (scales, aspect_ratios) are usually an element of zip(self.scales, self.aspect_ratios)
    # This method assumes aspect ratio = height / width for an anchor.
    def generate_anchors(
        self,
        scales: Sequence,
        aspect_ratios: Sequence,
        dtype: torch.dtype = torch.float32,
        device: torch.device | None = None,
    ) -> torch.Tensor:
        """
        Compute cell anchor shapes at multiple sizes and aspect ratios for the current feature map.

        Args:
            scales: a sequence which represents several anchor sizes for the current feature map.
            aspect_ratios: a sequence which represents several aspect_ratios for the current feature map.
                For 2D images, it is a Sequence of float aspect_ratios[j],
                anchor width and height w:h = 1:aspect_ratios[j].
                For 3D images, it is a Sequence of 2 value Sequence aspect_ratios[j,0] and aspect_ratios[j,1],
                anchor width, height, and depth w:h:d = 1:aspect_ratios[j,0]:aspect_ratios[j,1]
            dtype: target data type of the output Tensor.
            device: target device to put the output Tensor data.

        Returns:
            For each s in scales, returns [s, s*aspect_ratios[j]] for 2D images,
            and [s, s*aspect_ratios[j,0], s*aspect_ratios[j,1]] for 3D images.

        Raises:
            ValueError: when ``self.spatial_dims`` is 3 and ``aspect_ratios`` is not a
                sequence of 2-value sequences, i.e., not convertible to a (M, 2) tensor.
        """
        scales_t = torch.as_tensor(scales, dtype=dtype, device=device)  # sized (N,)
        aspect_ratios_t = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)  # sized (M,) or (M,2)
        if (self.spatial_dims >= 3) and (len(aspect_ratios_t.shape) != 2):
            # the 3D case requires one (height, depth) ratio pair per anchor
            raise ValueError(
                f"In {self.spatial_dims}-D image, aspect_ratios for each level should be 2-D. "
                f"But got aspect_ratios with shape {aspect_ratios_t.shape}."
            )
        if (self.spatial_dims >= 3) and (aspect_ratios_t.shape[1] != self.spatial_dims - 1):
            raise ValueError(
                f"In {self.spatial_dims}-D image, aspect_ratios for each level should have "
                f"shape (_,{self.spatial_dims - 1}). But got aspect_ratios with shape {aspect_ratios_t.shape}."
            )

        # normalize each anchor so its volume/area is preserved while w:h(:d) matches the ratio
        # if 2d, w:h = 1:aspect_ratios
        if self.spatial_dims == 2:
            area_scale = torch.sqrt(aspect_ratios_t)
            w_ratios = 1 / area_scale
            h_ratios = area_scale
        # if 3d, w:h:d = 1:aspect_ratios[:,0]:aspect_ratios[:,1]
        elif self.spatial_dims == 3:
            area_scale = torch.pow(aspect_ratios_t[:, 0] * aspect_ratios_t[:, 1], 1 / 3.0)
            w_ratios = 1 / area_scale
            h_ratios = aspect_ratios_t[:, 0] / area_scale
            d_ratios = aspect_ratios_t[:, 1] / area_scale

        # broadcast ratios (M,) against scales (N,) and flatten to (M*N,)
        ws = (w_ratios[:, None] * scales_t[None, :]).view(-1)
        hs = (h_ratios[:, None] * scales_t[None, :]).view(-1)
        if self.spatial_dims == 2:
            base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2.0
        elif self.spatial_dims == 3:
            ds = (d_ratios[:, None] * scales_t[None, :]).view(-1)
            base_anchors = torch.stack([-ws, -hs, -ds, ws, hs, ds], dim=1) / 2.0

        return base_anchors.round()

    def set_cell_anchors(self, dtype: torch.dtype, device: torch.device) -> None:
        """
        Convert each element in self.cell_anchors to ``dtype`` and send to ``device``.
        """
        self.cell_anchors = [cell_anchor.to(dtype=dtype, device=device) for cell_anchor in self.cell_anchors]

    def num_anchors_per_location(self):
        """
        Return number of anchor shapes for each feature map.
        """
        return [c.shape[0] for c in self.cell_anchors]

    def grid_anchors(self, grid_sizes: list[list[int]], strides: list[list[Tensor]]) -> list[Tensor]:
        """
        Every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:spatial_dims)
        corresponds to a feature map.
        It outputs g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a.

        Args:
            grid_sizes: spatial size of the feature maps
            strides: strides of the feature maps regarding to the original image

        Example:

            .. code-block:: python

                grid_sizes = [[100,100],[50,50]]
                strides = [[torch.tensor(2),torch.tensor(2)], [torch.tensor(4),torch.tensor(4)]]
        """
        anchors = []
        cell_anchors = self.cell_anchors
        if cell_anchors is None:
            # kept for torchscript compatibility; cell_anchors is always built in __init__
            raise AssertionError
        if not (len(grid_sizes) == len(strides) == len(cell_anchors)):
            raise ValueError(
                "Anchors should be Tuple[Tuple[int]] because each feature "
                "map could potentially have different sizes and aspect ratios. "
                "There needs to be a match between the number of "
                "feature maps passed and the number of sizes / aspect ratios specified."
            )

        for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
            # for each feature map
            device = base_anchors.device

            # compute anchor centers regarding to the image.
            # shifts_centers is [x_center, y_center] or [x_center, y_center, z_center]
            shifts_centers = [
                torch.arange(0, size[axis], dtype=torch.int32, device=device) * stride[axis]
                for axis in range(self.spatial_dims)
            ]
            # to support torchscript, cannot directly use torch.meshgrid(shifts_centers).
            shifts_centers = list(torch.meshgrid(shifts_centers[: self.spatial_dims], indexing="ij"))
            for axis in range(self.spatial_dims):
                # each element of shifts_centers is sized (HW,) or (HWD,)
                shifts_centers[axis] = shifts_centers[axis].reshape(-1)

            # Expand to [x_center, y_center, x_center, y_center],
            # or [x_center, y_center, z_center, x_center, y_center, z_center]
            if self.indexing == "xy":
                # Cartesian ('xy') indexing swaps axis 0 and 1.
                shifts_centers[1], shifts_centers[0] = shifts_centers[0], shifts_centers[1]
            shifts = torch.stack(shifts_centers * 2, dim=1)  # sized (HW,4) or (HWD,6)

            # For every (base anchor, output anchor) pair,
            # offset each zero-centered base anchor by the center of the output anchor.
            anchors.append(
                (shifts.view(-1, 1, self.spatial_dims * 2) + base_anchors.view(1, -1, self.spatial_dims * 2)).reshape(
                    -1, self.spatial_dims * 2
                )  # each element sized (AHWD,4) or (AHWD,6)
            )

        return anchors

    def forward(self, images: Tensor, feature_maps: list[Tensor]) -> list[Tensor]:
        """
        Generate anchor boxes for each image.

        Args:
            images: sized (B, C, W, H) or (B, C, W, H, D)
            feature_maps: for FPN level i, feature_maps[i] is sized (B, C_i, W_i, H_i) or (B, C_i, W_i, H_i, D_i).
                This input argument does not have to be the actual feature maps.
                Any list variable with the same (C_i, W_i, H_i) or (C_i, W_i, H_i, D_i) as feature maps works.

        Return:
            A list with length of B. Each element represents the anchors for this image.
            The B elements are identical.

        Example:

            .. code-block:: python

                images = torch.zeros((3,1,128,128,128))
                feature_maps = [torch.zeros((3,6,64,64,32)), torch.zeros((3,6,32,32,16))]
                anchor_generator(images, feature_maps)
        """
        grid_sizes = [list(feature_map.shape[-self.spatial_dims :]) for feature_map in feature_maps]
        image_size = images.shape[-self.spatial_dims :]
        batchsize = images.shape[0]
        dtype, device = feature_maps[0].dtype, feature_maps[0].device
        # per-level stride = downsampling ratio of each feature map w.r.t. the image
        strides = [
            [
                torch.tensor(image_size[axis] // g[axis], dtype=torch.int64, device=device)
                for axis in range(self.spatial_dims)
            ]
            for g in grid_sizes
        ]
        self.set_cell_anchors(dtype, device)
        anchors_over_all_feature_maps = self.grid_anchors(grid_sizes, strides)
        anchors_per_image = torch.cat(list(anchors_over_all_feature_maps))
        # the anchors only depend on shapes, so all images in the batch share one copy
        return [anchors_per_image] * batchsize
class AnchorGeneratorWithAnchorShape(AnchorGenerator):
    """
    Module that generates anchors for a set of feature maps and image sizes,
    inherited from :py:class:`~monai.apps.detection.networks.utils.anchor_utils.AnchorGenerator`

    The module support computing anchors at multiple base anchor shapes
    per feature map.

    ``feature_map_scales`` should have the same number of elements with the number of feature maps.

    base_anchor_shapes can have an arbitrary number of elements.
    For 2D images, each element represents anchor width and height [w,h].
    For 3D images, each element represents anchor width, height, and depth [w,h,d].

    AnchorGenerator will output a set of ``len(base_anchor_shapes)`` anchors
    per spatial location for feature map ``i``.

    Args:
        feature_map_scales: scale of anchors for each feature map, i.e., each output level of
            the feature pyramid network (FPN). ``len(feature_map_scales)`` is the number of feature maps.
            ``scale[i]*base_anchor_shapes`` represents the anchor shapes for feature map ``i``.
        base_anchor_shapes: a sequence which represents several anchor shapes for one feature map.
            For N-D images, it is a Sequence of N value Sequence.
        indexing: choose from {'xy', 'ij'}, optional
            Cartesian ('xy') or matrix ('ij', default) indexing of output.
            Cartesian ('xy') indexing swaps axis 0 and 1, which is the setting inside torchvision.
            matrix ('ij', default) indexing keeps the original axis not changed.
            See also indexing in https://pytorch.org/docs/stable/generated/torch.meshgrid.html

    Example:

        .. code-block:: python

            # 2D example inputs for a 2-level feature maps
            feature_map_scales = (1, 2)
            base_anchor_shapes = ((10, 10), (6, 12), (12, 6))
            anchor_generator = AnchorGeneratorWithAnchorShape(feature_map_scales, base_anchor_shapes)

            # 3D example inputs for a 2-level feature maps
            feature_map_scales = (1, 2)
            base_anchor_shapes = ((10, 10, 10), (12, 12, 8), (10, 10, 6), (16, 16, 10))
            anchor_generator = AnchorGeneratorWithAnchorShape(feature_map_scales, base_anchor_shapes)
    """

    # torchscript-friendly annotation for the list of per-level anchor tensors
    __annotations__ = {"cell_anchors": List[torch.Tensor]}

    def __init__(
        self,
        feature_map_scales: Sequence[int] | Sequence[float] = (1, 2, 4, 8),
        base_anchor_shapes: Sequence[Sequence[int]] | Sequence[Sequence[float]] = (
            (32, 32, 32),
            (48, 20, 20),
            (20, 48, 20),
            (20, 20, 48),
        ),
        indexing: str = "ij",
    ) -> None:
        # deliberately skip AnchorGenerator.__init__: anchors here come from
        # explicit shapes rather than (sizes, aspect_ratios)
        nn.Module.__init__(self)

        # number of values per anchor shape determines the spatial dims; restricted to {2, 3}
        spatial_dims = len(base_anchor_shapes[0])
        spatial_dims = look_up_option(spatial_dims, [2, 3])
        self.spatial_dims = spatial_dims

        self.indexing = look_up_option(indexing, ["ij", "xy"])

        base_anchor_shapes_t = torch.Tensor(base_anchor_shapes)
        # one zero-centered anchor tensor per feature-map level, scaled per level
        self.cell_anchors = [self.generate_anchors_using_shape(s * base_anchor_shapes_t) for s in feature_map_scales]

    @staticmethod
    def generate_anchors_using_shape(
        anchor_shapes: torch.Tensor, dtype: torch.dtype = torch.float32, device: torch.device | None = None
    ) -> torch.Tensor:
        """
        Compute cell anchor shapes at multiple sizes and aspect ratios for the current feature map.

        Args:
            anchor_shapes: [w, h] or [w, h, d], sized (N, spatial_dims),
                represents N anchor shapes for the current feature map.
            dtype: target data type of the output Tensor.
            device: target device to put the output Tensor data.

        Returns:
            For 2D images, returns [-w/2, -h/2, w/2, h/2];
            For 3D images, returns [-w/2, -h/2, -d/2, w/2, h/2, d/2]
        """
        # build zero-centered boxes by concatenating the negative and positive half-extents
        half_anchor_shapes = anchor_shapes / 2.0
        base_anchors = torch.cat([-half_anchor_shapes, half_anchor_shapes], dim=1)
        return base_anchors.round().to(dtype=dtype, device=device)