# Source code for monai.visualize.occlusion_sensitivity

```
# Copyright (c) MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from collections.abc import Callable, Mapping, Sequence
from typing import Any
import numpy as np
import torch
import torch.nn as nn
from monai.data.meta_tensor import MetaTensor
from monai.networks.utils import eval_mode
from monai.transforms import Compose, GaussianSmooth, Lambda, ScaleIntensity, SpatialCrop
from monai.utils import ensure_tuple_rep
[docs]
class OcclusionSensitivity:
"""
This class computes the occlusion sensitivity for a model's prediction of a given image. By occlusion sensitivity,
we mean how the probability of a given prediction changes as the occluded section of an image changes. This can be
useful to understand why a network is making certain decisions.
As important parts of the image are occluded, the probability of classifying the image correctly will decrease.
Hence, more negative values imply the corresponding occluded volume was more important in the decision process.
Two ``torch.Tensor`` will be returned by the ``__call__`` method: an occlusion map and an image of the most probable
class. Both images will be cropped if a bounding box used, but voxel sizes will always match the input.
The occlusion map shows the inference probabilities when the corresponding part of the image is occluded. Hence,
more -ve values imply that region was important in the decision process. The map will have shape ``BCHW(D)N``,
where ``N`` is the number of classes to be inferred by the network. Hence, the occlusion for class ``i`` can
be seen with ``map[...,i]``.
The most probable class is an image of the probable class when the corresponding part of the image is occluded
(equivalent to ``occ_map.argmax(dim=-1)``).
See: R. R. Selvaraju et al. Grad-CAM: Visual Explanations from Deep Networks via
Gradient-based Localization. https://doi.org/10.1109/ICCV.2017.74.
Examples:
.. code-block:: python
# densenet 2d
from monai.networks.nets import DenseNet121
from monai.visualize import OcclusionSensitivity
import torch
model_2d = DenseNet121(spatial_dims=2, in_channels=1, out_channels=3)
occ_sens = OcclusionSensitivity(nn_module=model_2d)
occ_map, most_probable_class = occ_sens(x=torch.rand((1, 1, 48, 64)), b_box=[2, 40, 1, 62])
# densenet 3d
from monai.networks.nets import DenseNet
from monai.visualize import OcclusionSensitivity
model_3d = DenseNet(spatial_dims=3, in_channels=1, out_channels=3, init_features=2, growth_rate=2, block_config=(6,))
occ_sens = OcclusionSensitivity(nn_module=model_3d, n_batch=10)
occ_map, most_probable_class = occ_sens(torch.rand(1, 1, 6, 6, 6), b_box=[1, 3, -1, -1, -1, -1])
See Also:
- :py:class:`monai.visualize.occlusion_sensitivity.OcclusionSensitivity.`
"""
[docs]
def __init__(
self,
nn_module: nn.Module,
mask_size: int | Sequence = 16,
n_batch: int = 16,
verbose: bool = True,
mode: str | float | Callable = "gaussian",
overlap: float = 0.25,
activate: bool | Callable = True,
) -> None:
"""
Occlusion sensitivity constructor.
Args:
nn_module: Classification model to use for inference
mask_size: Size of box to be occluded, centred on the central voxel. If a single number
is given, this is used for all dimensions. If a sequence is given, this is used for each dimension
individually.
n_batch: Number of images in a batch for inference.
verbose: Use progress bar (if ``tqdm`` available).
mode: what should the occluded region be replaced with? If a float is given, that value will be used
throughout the occlusion. Else, ``gaussian``, ``mean_img`` and ``mean_patch`` can be supplied:
* ``gaussian``: occluded region is multiplied by 1 - gaussian kernel. In this fashion, the occlusion
will be 0 at the center and will be unchanged towards the edges, varying smoothly between. When
gaussian is used, a weighted average will be used to combine overlapping regions. This will be
done using the gaussian (not 1-gaussian) as occluded regions count more.
* ``mean_patch``: occluded region will be replaced with the mean of occluded region.
* ``mean_img``: occluded region will be replaced with the mean of the whole image.
overlap: overlap between inferred regions. Should be in range 0<=x<1.
activate: if ``True``, do softmax activation if num_channels > 1 else do ``sigmoid``. If ``False``, don't do any
activation. If ``callable``, use callable on inferred outputs.
"""
self.nn_module = nn_module
self.mask_size = mask_size
self.n_batch = n_batch
self.verbose = verbose
self.overlap = overlap
self.activate = activate
# mode
if isinstance(mode, str) and mode not in ("gaussian", "mean_patch", "mean_img"):
raise NotImplementedError
self.mode = mode
[docs]
@staticmethod
def constant_occlusion(x: torch.Tensor, val: float, mask_size: Sequence) -> tuple[float, torch.Tensor]:
"""Occlude with a constant occlusion. Multiplicative is zero, additive is constant value."""
ones = torch.ones((*x.shape[:2], *mask_size), device=x.device, dtype=x.dtype)
return 0, ones * val
[docs]
@staticmethod
def gaussian_occlusion(x: torch.Tensor, mask_size: Sequence, sigma: float = 0.25) -> tuple[torch.Tensor, float]:
"""
For Gaussian occlusion, Multiplicative is 1-Gaussian, additive is zero.
Default sigma of 0.25 empirically shown to give reasonable kernel, see here:
https://github.com/Project-MONAI/MONAI/pull/5230#discussion_r984520714.
"""
kernel = torch.zeros((x.shape[1], *mask_size), device=x.device, dtype=x.dtype)
spatial_shape = kernel.shape[1:]
# all channels (as occluded shape already takes into account per_channel), center in spatial dimensions
center = [slice(None)] + [slice(s // 2, s // 2 + 1) for s in spatial_shape]
# place value of 1 at center
kernel[center] = 1.0
# Smooth with sigma equal to quarter of image, flip +ve/-ve so largest values are at edge
# and smallest at center. Scale to [0, 1].
gaussian = Compose(
[GaussianSmooth(sigma=[b * sigma for b in spatial_shape]), Lambda(lambda x: -x), ScaleIntensity()]
)
# transform and add batch
mul: torch.Tensor = gaussian(kernel)[None]
return mul, 0
[docs]
@staticmethod
def predictor(
cropped_grid: torch.Tensor,
nn_module: nn.Module,
x: torch.Tensor,
mul: torch.Tensor | float,
add: torch.Tensor | float,
mask_size: Sequence,
occ_mode: str,
activate: bool | Callable,
module_kwargs: Mapping[str, Any],
) -> torch.Tensor:
"""
Predictor function to be passed to the sliding window inferer. Takes a cropped meshgrid,
referring to the coordinates in the input image. We use the index of the top-left corner
in combination ``mask_size`` to figure out which region of the image is to be occluded. The
occlusion is performed on the original image, ``x``, using ``cropped_region * mul + add``. ``mul``
and ``add`` are sometimes pre-computed (e.g., a constant Gaussian blur), or they are
sometimes calculated on the fly (e.g., the mean of the occluded patch). For this reason
``occ_mode`` is given. Lastly, ``activate`` is used to activate after each call of the model.
Args:
cropped_grid: subsection of the meshgrid, where each voxel refers to the coordinate of
the input image. The meshgrid is created by the ``OcclusionSensitivity`` class, and
the generation of the subset is determined by ``sliding_window_inference``.
nn_module: module to call on data.
x: the image that was originally passed into ``OcclusionSensitivity.__call__``.
mul: occluded region will be multiplied by this. Can be ``torch.Tensor`` or ``float``.
add: after multiplication, this is added to the occluded region. Can be ``torch.Tensor`` or ``float``.
mask_size: Size of box to be occluded, centred on the central voxel. Should be
a sequence, one value for each spatial dimension.
occ_mode: might be used to calculate ``mul`` and ``add`` on the fly.
activate: if ``True``, do softmax activation if num_channels > 1 else do ``sigmoid``. If ``False``, don't do any
activation. If ``callable``, use callable on inferred outputs.
module_kwargs: kwargs to be passed onto module when inferring
"""
n_batch = cropped_grid.shape[0]
sd = cropped_grid.ndim - 2
# start with copies of x to infer
im = torch.repeat_interleave(x, n_batch, 0)
# get coordinates of top left corner of occluded region (possible because we use meshgrid)
corner_coord_slices = [slice(None)] * 2 + [slice(1)] * sd
top_corners = cropped_grid[corner_coord_slices]
# replace occluded regions
for b, t in enumerate(top_corners):
# starting from corner, get the slices to extract the occluded region from the image
slices = [slice(b, b + 1), slice(None)] + [slice(int(j), int(j) + m) for j, m in zip(t, mask_size)]
to_occlude = im[slices]
if occ_mode == "mean_patch":
add, mul = OcclusionSensitivity.constant_occlusion(x, to_occlude.mean().item(), mask_size)
if callable(occ_mode):
to_occlude = occ_mode(x, to_occlude)
else:
to_occlude = to_occlude * mul + add
if add is None or mul is None:
raise RuntimeError("Shouldn't be here, something's gone wrong...")
im[slices] = to_occlude
# infer
out: torch.Tensor = nn_module(im, **module_kwargs)
# if activation is callable, call it
if callable(activate):
out = activate(out)
# else if True (should be boolean), sigmoid if n_chan == 1 else softmax
elif activate:
out = out.sigmoid() if x.shape[1] == 1 else out.softmax(1)
# the output will have shape [B,C] where C is number of channels output by model (inference classes)
# we need to return it to sliding window inference with shape [B,C,H,W,[D]], so add dims and repeat values
for m in mask_size:
out = torch.repeat_interleave(out.unsqueeze(-1), m, dim=-1)
return out
[docs]
@staticmethod
def crop_meshgrid(
grid: MetaTensor, b_box: Sequence, mask_size: Sequence
) -> tuple[MetaTensor, SpatialCrop, Sequence]:
"""Crop the meshgrid so we only perform occlusion sensitivity on a subsection of the image."""
# distance from center of mask to edge is -1 // 2.
mask_edge = [(m - 1) // 2 for m in mask_size]
bbox_min = [max(b - m, 0) for b, m in zip(b_box[::2], mask_edge)]
bbox_max = []
for b, m, s in zip(b_box[1::2], mask_edge, grid.shape[2:]):
# if bbox is -ve for that dimension, no cropping so use current image size
if b == -1:
bbox_max.append(s)
# else bounding box plus distance to mask edge. Make sure it's not bigger than the size of the image
else:
bbox_max.append(min(b + m, s))
# bbox_max = [min(b + m, s) if b >= 0 else s for b, m, s in zip(b_box[1::2], mask_edge, grid.shape[2:])]
# No need for batch and channel slices. Batch will be removed and added back in, and
# SpatialCrop doesn't act on the first dimension anyway.
slices = [slice(s, e) for s, e in zip(bbox_min, bbox_max)]
cropper = SpatialCrop(roi_slices=slices)
cropped: MetaTensor = cropper(grid[0])[None] # type: ignore
mask_size = list(mask_size)
for i, s in enumerate(cropped.shape[2:]):
mask_size[i] = min(s, mask_size[i])
return cropped, cropper, mask_size
def __call__(
self, x: torch.Tensor, b_box: Sequence | None = None, **kwargs: Any
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Args:
x: Image to use for inference. Should be a tensor consisting of 1 batch.
b_box: Bounding box on which to perform the analysis. The output image will be limited to this size.
There should be a minimum and maximum for all spatial dimensions: ``[min1, max1, min2, max2,...]``.
* By default, the whole image will be used. Decreasing the size will speed the analysis up, which might
be useful for larger images.
* Min and max are inclusive, so ``[0, 63, ...]`` will have size ``(64, ...)``.
* Use -ve to use ``min=0`` and ``max=im.shape[x]-1`` for xth dimension.
* N.B.: we add half of the mask size to the bounding box to ensure that the region of interest has a
sufficiently large area surrounding it.
kwargs: any extra arguments to be passed on to the module as part of its `__call__`.
Returns:
* Occlusion map:
* Shows the inference probabilities when the corresponding part of the image is occluded.
Hence, more -ve values imply that region was important in the decision process.
* The map will have shape ``BCHW(D)N``, where N is the number of classes to be inferred by the
network. Hence, the occlusion for class ``i`` can be seen with ``map[...,i]``.
* If `per_channel==False`, output ``C`` will equal 1: ``B1HW(D)N``
* Most probable class:
* The most probable class when the corresponding part of the image is occluded (``argmax(dim=-1)``).
Both images will be cropped if a bounding box used, but voxel sizes will always match the input.
"""
if x.shape[0] > 1:
raise ValueError("Expected batch size of 1.")
sd = x.ndim - 2
mask_size: Sequence = ensure_tuple_rep(self.mask_size, sd)
# get the meshgrid (so that sliding_window_inference can tell us which bit to occlude)
grid: MetaTensor = MetaTensor(
np.stack(np.meshgrid(*[np.arange(0, i) for i in x.shape[2:]], indexing="ij"))[None],
device=x.device,
dtype=x.dtype,
)
# if bounding box given, crop the grid to only infer subsections of the image
if b_box is not None:
grid, cropper, mask_size = self.crop_meshgrid(grid, b_box, mask_size)
# check that the grid is bigger than the mask size
if any(m > g for g, m in zip(grid.shape[2:], mask_size)):
raise ValueError(f"Image (spatial shape) {grid.shape[2:]} should be bigger than mask {mask_size}.")
# get additive and multiplicative factors if they are unchanged for all patches (i.e., not mean_patch)
add: float | torch.Tensor | None
mul: float | torch.Tensor | None
# multiply by 0, add value
if isinstance(self.mode, float):
mul, add = self.constant_occlusion(x, self.mode, mask_size)
# multiply by 0, add mean of image
elif self.mode == "mean_img":
mul, add = self.constant_occlusion(x, x.mean().item(), mask_size)
# for gaussian, additive = 0, multiplicative = gaussian
elif self.mode == "gaussian":
mul, add = self.gaussian_occlusion(x, mask_size)
# else will be determined on each patch individually so calculated later
else:
add, mul = None, None
with eval_mode(self.nn_module):
# needs to go here to avoid circular import
from monai.inferers import sliding_window_inference
sensitivity_im: MetaTensor = sliding_window_inference( # type: ignore
grid,
roi_size=mask_size,
sw_batch_size=self.n_batch,
predictor=OcclusionSensitivity.predictor,
overlap=self.overlap,
mode="gaussian" if self.mode == "gaussian" else "constant",
progress=self.verbose,
nn_module=self.nn_module,
x=x,
add=add,
mul=mul,
mask_size=mask_size,
occ_mode=self.mode,
activate=self.activate,
module_kwargs=kwargs,
)
if b_box is not None:
# undo the cropping that was applied to the meshgrid
sensitivity_im = cropper.inverse(sensitivity_im[0])[None] # type: ignore
# crop using the bounding box (ignoring the mask size this time)
bbox_min = [max(b, 0) for b in b_box[::2]]
bbox_max = [b if b > 0 else s for b, s in zip(b_box[1::2], x.shape[2:])]
cropper = SpatialCrop(roi_start=bbox_min, roi_end=bbox_max)
sensitivity_im = cropper(sensitivity_im[0])[None] # type: ignore
# The most probable class is the max in the classification dimension (1)
most_probable_class = sensitivity_im.argmax(dim=1, keepdim=True)
return sensitivity_im, most_probable_class
```