# Source code for monai.visualize.occlusion_sensitivity

# Copyright (c) MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Sequence
from functools import partial
from typing import Callable, Optional, Tuple, Union
import numpy as np
import torch
import torch.nn as nn
from monai.networks.utils import eval_mode
from monai.visualize.visualizer import default_upsampler
# Optional dependency: use tqdm's progress bar for the occlusion loop when it
# is available; otherwise fall back to the plain built-in ``range``.
try:
    from tqdm import trange

    # Pre-bind the progress-bar description so call sites can use ``trange``
    # exactly like ``range``.
    trange = partial(trange, desc="Computing occlusion sensitivity")
except (ImportError, AttributeError):
    trange = range

# For stride two (for example),
# if input array is: |0|1|2|3|4|5|6|7|
# downsampled output is: | 0 | 1 | 2 | 3 |
# So the upsampling should do it by the corners of the image, not their centres
default_upsampler = partial(default_upsampler, align_corners=True)
def _check_input_image(image):
"""Check that the input image is as expected."""
# Only accept batch size of 1
if image.shape[0] > 1:
raise RuntimeError("Expected batch size of 1.")
def _check_input_bounding_box(b_box, im_shape):
"""Check that the bounding box (if supplied) is as expected."""
# If no bounding box has been supplied, set min and max to None
if b_box is None:
b_box_min = b_box_max = None
# Bounding box has been supplied
else:
# Should be twice as many elements in `b_box` as `im_shape`
if len(b_box) != 2 * len(im_shape):
raise ValueError("Bounding box should contain upper and lower for all dimensions (except batch number)")
# If any min's or max's are -ve, set them to 0 and im_shape-1, respectively.
b_box_min = np.array(b_box[::2])
b_box_max = np.array(b_box[1::2])
b_box_min[b_box_min < 0] = 0
b_box_max[b_box_max < 0] = im_shape[b_box_max < 0] - 1
# Check all max's are < im_shape
if np.any(b_box_max >= im_shape):
raise ValueError("Max bounding box should be < image size for all values")
# Check all min's are <= max's
if np.any(b_box_min > b_box_max):
raise ValueError("Min bounding box should be <= max for all values")
return b_box_min, b_box_max
def _append_to_sensitivity_ims(model, batch_images, sensitivity_ims):
"""Infer given images. Append to previous evaluations. Store each class separately."""
batch_images = torch.cat(batch_images, dim=0)
scores = model(batch_images).detach()
for i in range(scores.shape[1]):
sensitivity_ims[i] = torch.cat((sensitivity_ims[i], scores[:, i]))
return sensitivity_ims
def _get_as_np_array(val, numel):
# If not a sequence, then convert scalar to numpy array
if not isinstance(val, Sequence):
out = np.full(numel, val, dtype=np.int32)
out[0] = 1 # mask_size and stride always 1 in channel dimension
else:
# Convert to numpy array and check dimensions match
out = np.array(val, dtype=np.int32)
# Add stride of 1 to the channel direction (since user input was only for spatial dimensions)
out = np.insert(out, 0, 1)
if out.size != numel:
raise ValueError(
"If supplying stride/mask_size as sequence, number of elements should match number of spatial dimensions."
)
return out
class OcclusionSensitivity:
    """
    This class computes the occlusion sensitivity for a model's prediction of a given image. By occlusion sensitivity,
    we mean how the probability of a given prediction changes as the occluded section of an image changes. This can be
    useful to understand why a network is making certain decisions.

    As important parts of the image are occluded, the probability of classifying the image correctly will decrease.
    Hence, more negative values imply the corresponding occluded volume was more important in the decision process.

    Two ``torch.Tensor`` will be returned by the ``__call__`` method: an occlusion map and an image of the most probable
    class. Both images will be cropped if a bounding box used, but voxel sizes will always match the input.

    The occlusion map shows the inference probabilities when the corresponding part of the image is occluded. Hence,
    more -ve values imply that region was important in the decision process. The map will have shape ``BCHW(D)N``,
    where ``N`` is the number of classes to be inferred by the network. Hence, the occlusion for class ``i`` can
    be seen with ``map[...,i]``.

    The most probable class is an image of the probable class when the corresponding part of the image is occluded
    (equivalent to ``occ_map.argmax(dim=-1)``).

    See: R. R. Selvaraju et al. Grad-CAM: Visual Explanations from Deep Networks via
    Gradient-based Localization. https://doi.org/10.1109/ICCV.2017.74.

    Examples:

    .. code-block:: python

        # densenet 2d
        from monai.networks.nets import DenseNet121
        from monai.visualize import OcclusionSensitivity

        model_2d = DenseNet121(spatial_dims=2, in_channels=1, out_channels=3)
        occ_sens = OcclusionSensitivity(nn_module=model_2d)
        occ_map, most_probable_class = occ_sens(x=torch.rand((1, 1, 48, 64)), b_box=[-1, -1, 2, 40, 1, 62])

        # densenet 3d
        from monai.networks.nets import DenseNet
        from monai.visualize import OcclusionSensitivity

        model_3d = DenseNet(spatial_dims=3, in_channels=1, out_channels=3, init_features=2, growth_rate=2, block_config=(6,))
        occ_sens = OcclusionSensitivity(nn_module=model_3d, n_batch=10, stride=3)
        occ_map, most_probable_class = occ_sens(torch.rand(1, 1, 6, 6, 6), b_box=[-1, -1, 1, 3, -1, -1, -1, -1])

    See Also:

        - :py:class:`monai.visualize.occlusion_sensitivity.OcclusionSensitivity.`
    """

    def __init__(
        self,
        nn_module: nn.Module,
        pad_val: Optional[float] = None,
        mask_size: Union[int, Sequence] = 15,
        n_batch: int = 128,
        stride: Union[int, Sequence] = 1,
        per_channel: bool = True,
        upsampler: Optional[Callable] = default_upsampler,
        verbose: bool = True,
    ) -> None:
        """Occlusion sensitivity constructor.

        Args:
            nn_module: Classification model to use for inference
            pad_val: When occluding part of the image, which values should we put
                in the image? If ``None`` is used, then the average of the image will be used.
            mask_size: Size of box to be occluded, centred on the central voxel. To ensure that the occluded area
                is correctly centred, ``mask_size`` and ``stride`` should both be odd or even.
            n_batch: Number of images in a batch for inference.
            stride: Stride in spatial directions for performing occlusions. Can be single
                value or sequence (for varying stride in the different directions).
                Should be >= 1. Striding in the channel direction depends on the `per_channel` argument.
            per_channel: If `True`, `mask_size` and `stride` both equal 1 in the channel dimension. If `False`,
                then both `mask_size` equals the number of channels in the image. If `True`, the output image will be:
                `[B, C, H, W, D, num_seg_classes]`. Else, will be `[B, 1, H, W, D, num_seg_classes]`
            upsampler: An upsampling method to upsample the output image. Default is
                N-dimensional linear (bilinear, trilinear, etc.) depending on num spatial
                dimensions of input.
            verbose: Use ``tqdm.trange`` output (if available).
        """
        # Store configuration; no validation is performed until __call__.
        self.nn_module = nn_module
        self.upsampler = upsampler
        self.pad_val = pad_val
        self.mask_size = mask_size
        self.n_batch = n_batch
        self.stride = stride
        self.per_channel = per_channel
        self.verbose = verbose

    def _compute_occlusion_sensitivity(self, x, b_box):
        """Slide an occluding patch over the image, infer each occluded copy
        in batches, and collect the per-class scores into downsampled maps.

        Returns a list with one tensor per inferred class (shaped like the
        downsampled image with a leading batch dim) and the output image
        shape (cropped to the bounding box if one was supplied).
        """
        # Get bounding box
        im_shape = np.array(x.shape[1:])
        b_box_min, b_box_max = _check_input_bounding_box(b_box, im_shape)

        # Get the number of prediction classes. Batch size is 1 (enforced in
        # __call__), so the number of output elements equals the class count.
        num_classes = self.nn_module(x).numel()

        # If pad val not supplied, get the mean of the image
        pad_val = x.mean() if self.pad_val is None else self.pad_val

        # List containing a batch of images to be inferred
        batch_images = []

        # List of sensitivity images, one for each inferred class
        sensitivity_ims = num_classes * [torch.empty(0, dtype=torch.float32, device=x.device)]

        # If no bounding box supplied, output shape is same as input shape.
        # If bounding box is present, shape is max - min + 1
        output_im_shape = im_shape if b_box is None else b_box_max - b_box_min + 1

        # Get the stride and mask_size as numpy arrays
        stride = _get_as_np_array(self.stride, len(im_shape))
        mask_size = _get_as_np_array(self.mask_size, len(im_shape))

        # If not doing it on a per-channel basis, then the output image will have 1 output channel
        # (since all will be occluded together)
        if not self.per_channel:
            output_im_shape[0] = 1
            stride[0] = x.shape[1]
            mask_size[0] = x.shape[1]

        # For each dimension, ...
        for o, s in zip(output_im_shape, stride):
            # if the size is > 1, then check that the stride is a factor of the output image shape
            if o > 1 and o % s != 0:
                raise ValueError(
                    "Stride should be a factor of the image shape. Im shape "
                    + f"(taking bounding box into account): {output_im_shape}, stride: {stride}"
                )

        # to ensure the occluded area is nicely centred if stride is even, ensure that so is the mask_size
        if np.any(mask_size % 2 != stride % 2):
            raise ValueError(
                "Stride and mask size should both be odd or even (element-wise). "
                + f"``stride={stride}``, ``mask_size={mask_size}``"
            )

        downsampled_im_shape = (output_im_shape / stride).astype(np.int32)
        downsampled_im_shape[downsampled_im_shape == 0] = 1  # make sure dimension sizes are >= 1
        num_required_predictions = np.prod(downsampled_im_shape)

        # Get bottom left and top right corners of occluded region
        lower_corner = (stride - mask_size) // 2
        upper_corner = (stride + mask_size) // 2

        # Loop 1D over image
        verbose_range = trange if self.verbose else range
        for i in verbose_range(num_required_predictions):
            # Get corresponding ND index
            idx = np.unravel_index(i, downsampled_im_shape)
            # Multiply by stride. NB: ``idx`` is a tuple, so ``tuple * ndarray``
            # broadcasts via the array's __rmul__, yielding an ndarray.
            idx *= stride
            # If a bounding box is being used, we need to add on
            # the min to shift to start of region of interest
            if b_box_min is not None:
                idx += b_box_min

            # Get min and max index of box to occlude (and make sure it's in bounds)
            min_idx = np.maximum(idx + lower_corner, 0)
            max_idx = np.minimum(idx + upper_corner, im_shape)

            # Clone and replace target area with `pad_val`
            occlu_im = x.detach().clone()
            occlu_im[(...,) + tuple(slice(i, j) for i, j in zip(min_idx, max_idx))] = pad_val

            # Add to list
            batch_images.append(occlu_im)

            # Once the batch is complete (or on last iteration)
            if len(batch_images) == self.n_batch or i == num_required_predictions - 1:
                # Do the predictions and append to sensitivity maps
                sensitivity_ims = _append_to_sensitivity_ims(self.nn_module, batch_images, sensitivity_ims)
                # Clear lists
                batch_images = []

        # Reshape to match downsampled image, and unsqueeze to add batch dimension back in
        for i in range(num_classes):
            sensitivity_ims[i] = sensitivity_ims[i].reshape(tuple(downsampled_im_shape)).unsqueeze(0)

        return sensitivity_ims, output_im_shape

    def __call__(self, x: torch.Tensor, b_box: Optional[Sequence] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            x: Image to use for inference. Should be a tensor consisting of 1 batch.
            b_box: Bounding box on which to perform the analysis. The output image will be limited to this size.
                There should be a minimum and maximum for all dimensions except batch: ``[min1, max1, min2, max2,...]``.

                * By default, the whole image will be used. Decreasing the size will speed the analysis up, which might
                  be useful for larger images.
                * Min and max are inclusive, so ``[0, 63, ...]`` will have size ``(64, ...)``.
                * Use -ve to use ``min=0`` and ``max=im.shape[x]-1`` for xth dimension.

        Returns:
            * Occlusion map:
                * Shows the inference probabilities when the corresponding part of the image is occluded.
                  Hence, more -ve values imply that region was important in the decision process.
                * The map will have shape ``BCHW(D)N``, where N is the number of classes to be inferred by the
                  network. Hence, the occlusion for class ``i`` can be seen with ``map[...,i]``.
                * If `per_channel==False`, output ``C`` will equal 1: ``B1HW(D)N``
            * Most probable class:
                * The most probable class when the corresponding part of the image is occluded (``argmax(dim=-1)``).

            Both images will be cropped if a bounding box used, but voxel sizes will always match the input.
        """
        # Run inference without gradients and with the network in eval mode.
        with eval_mode(self.nn_module):

            # Check input arguments
            _check_input_image(x)

            # Generate sensitivity images
            sensitivity_ims_list, output_im_shape = self._compute_occlusion_sensitivity(x, b_box)

            # Loop over image for each classification
            for i, sens_i in enumerate(sensitivity_ims_list):
                # upsample each per-class map back to the (cropped) input size
                if self.upsampler is not None:
                    if len(sens_i.shape) != len(x.shape):
                        raise AssertionError
                    if np.any(sens_i.shape != x.shape):
                        img_spatial = tuple(output_im_shape[1:])
                        sensitivity_ims_list[i] = self.upsampler(img_spatial)(sens_i)

            # Convert list of tensors to tensor
            sensitivity_ims = torch.stack(sensitivity_ims_list, dim=-1)

            # The most probable class is the max in the classification dimension (last)
            most_probable_class = sensitivity_ims.argmax(dim=-1)

            return sensitivity_ims, most_probable_class