Source code for monai.visualize.occlusion_sensitivity

# Copyright (c) MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from collections.abc import Callable, Mapping, Sequence
from typing import Any

import numpy as np
import torch
import torch.nn as nn

from monai.data.meta_tensor import MetaTensor
from monai.networks.utils import eval_mode
from monai.transforms import Compose, GaussianSmooth, Lambda, ScaleIntensity, SpatialCrop
from monai.utils import ensure_tuple_rep


class OcclusionSensitivity:
    """
    This class computes the occlusion sensitivity for a model's prediction of a given image. By occlusion
    sensitivity, we mean how the probability of a given prediction changes as the occluded section of an image
    changes. This can be useful to understand why a network is making certain decisions.

    As important parts of the image are occluded, the probability of classifying the image correctly will decrease.
    Hence, more negative values imply the corresponding occluded volume was more important in the decision process.

    Two ``torch.Tensor`` will be returned by the ``__call__`` method: an occlusion map and an image of the most
    probable class. Both images will be cropped if a bounding box is used, but voxel sizes will always match the
    input.

    The occlusion map shows the inference probabilities when the corresponding part of the image is occluded. Hence,
    more negative values imply that region was important in the decision process. The map will have shape
    ``BNHW(D)``, where ``N`` is the number of classes to be inferred by the network, stored in the channel
    dimension. Hence, the occlusion for class ``i`` can be seen with ``map[:, i]``.

    The most probable class is an image of the probable class when the corresponding part of the image is occluded
    (equivalent to ``occ_map.argmax(dim=1)``).

    See: R. R. Selvaraju et al. Grad-CAM: Visual Explanations from Deep Networks via
    Gradient-based Localization. https://doi.org/10.1109/ICCV.2017.74.

    Examples:

    .. code-block:: python

        # densenet 2d
        from monai.networks.nets import DenseNet121
        from monai.visualize import OcclusionSensitivity
        import torch

        model_2d = DenseNet121(spatial_dims=2, in_channels=1, out_channels=3)
        occ_sens = OcclusionSensitivity(nn_module=model_2d)
        occ_map, most_probable_class = occ_sens(x=torch.rand((1, 1, 48, 64)), b_box=[2, 40, 1, 62])

        # densenet 3d
        from monai.networks.nets import DenseNet
        from monai.visualize import OcclusionSensitivity

        model_3d = DenseNet(
            spatial_dims=3, in_channels=1, out_channels=3, init_features=2, growth_rate=2, block_config=(6,)
        )
        occ_sens = OcclusionSensitivity(nn_module=model_3d, n_batch=10)
        occ_map, most_probable_class = occ_sens(torch.rand(1, 1, 6, 6, 6), b_box=[1, 3, -1, -1, -1, -1])

    See Also:

        - :py:class:`monai.visualize.occlusion_sensitivity.OcclusionSensitivity`
    """
    def __init__(
        self,
        nn_module: nn.Module,
        mask_size: int | Sequence = 16,
        n_batch: int = 16,
        verbose: bool = True,
        mode: str | float | Callable = "gaussian",
        overlap: float = 0.25,
        activate: bool | Callable = True,
    ) -> None:
        """
        Occlusion sensitivity constructor.

        Args:
            nn_module: classification model to use for inference.
            mask_size: size of box to be occluded, centred on the central voxel. If a single number is given,
                this is used for all dimensions. If a sequence is given, this is used for each dimension
                individually.
            n_batch: number of images in a batch for inference.
            verbose: use progress bar (if ``tqdm`` available).
            mode: what should the occluded region be replaced with? If a float is given, that value will be used
                throughout the occlusion. Else, ``gaussian``, ``mean_img`` and ``mean_patch`` can be supplied:

                * ``gaussian``: occluded region is multiplied by 1 - gaussian kernel. In this fashion, the
                  occlusion will be 0 at the center and will be unchanged towards the edges, varying smoothly
                  between. When gaussian is used, a weighted average will be used to combine overlapping regions.
                  This will be done using the gaussian (not 1 - gaussian) as occluded regions count more.
                * ``mean_patch``: occluded region will be replaced with the mean of the occluded region.
                * ``mean_img``: occluded region will be replaced with the mean of the whole image.

            overlap: overlap between inferred regions. Should be in the range ``0 <= x < 1``.
            activate: if ``True``, do softmax activation if num_channels > 1 else do ``sigmoid``. If ``False``,
                don't do any activation. If ``callable``, use callable on inferred outputs.
        """
        self.nn_module = nn_module
        self.mask_size = mask_size
        self.n_batch = n_batch
        self.verbose = verbose
        self.overlap = overlap
        self.activate = activate
        # mode
        if isinstance(mode, str) and mode not in ("gaussian", "mean_patch", "mean_img"):
            raise NotImplementedError(f"unsupported mode: {mode}.")
        self.mode = mode
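    # Illustrative configurations (a hedged sketch; ``model`` and the parameter values
    # below are hypothetical, not taken from this module):
    #
    #     occ = OcclusionSensitivity(nn_module=model, mask_size=12, mode="mean_img")
    #     occ = OcclusionSensitivity(nn_module=model, mode=0.0, overlap=0.5)  # occlude with zeros
    #     occ = OcclusionSensitivity(nn_module=model, activate=lambda out: out.softmax(1))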
    @staticmethod
    def constant_occlusion(x: torch.Tensor, val: float, mask_size: Sequence) -> tuple[float, torch.Tensor]:
        """Occlude with a constant occlusion. Multiplicative is zero, additive is the constant value."""
        ones = torch.ones((*x.shape[:2], *mask_size), device=x.device, dtype=x.dtype)
        return 0, ones * val
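    # Example of the returned factors (hedged; shapes assume a single-channel 2D input):
    #
    #     x = torch.rand(1, 1, 32, 32)
    #     mul, add = OcclusionSensitivity.constant_occlusion(x, x.mean().item(), (4, 4))
    #     # mul == 0 and add.shape == (1, 1, 4, 4), so ``patch * mul + add`` replaces
    #     # the occluded patch with the image mean everywhere.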
    @staticmethod
    def gaussian_occlusion(x: torch.Tensor, mask_size: Sequence, sigma: float = 0.25) -> tuple[torch.Tensor, float]:
        """
        For Gaussian occlusion, multiplicative is 1 - Gaussian, additive is zero.
        Default sigma of 0.25 empirically shown to give a reasonable kernel, see here:
        https://github.com/Project-MONAI/MONAI/pull/5230#discussion_r984520714.
        """
        kernel = torch.zeros((x.shape[1], *mask_size), device=x.device, dtype=x.dtype)
        spatial_shape = kernel.shape[1:]
        # all channels (as occluded shape already takes into account per_channel), center in spatial dimensions
        center = [slice(None)] + [slice(s // 2, s // 2 + 1) for s in spatial_shape]
        # place value of 1 at center
        kernel[tuple(center)] = 1.0
        # smooth with sigma equal to a quarter of the mask size, flip +ve/-ve so largest values are at the edge
        # and smallest at the center. Scale to [0, 1].
        gaussian = Compose(
            [GaussianSmooth(sigma=[b * sigma for b in spatial_shape]), Lambda(lambda x: -x), ScaleIntensity()]
        )
        # transform and add batch dimension
        mul: torch.Tensor = gaussian(kernel)[None]
        return mul, 0
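    # Shape/values sketch (hedged): for a 2-channel input and ``mask_size=(5, 5)``, the
    # kernel built above is (2, 5, 5), smoothed with sigma=(1.25, 1.25) under the default
    # ``sigma=0.25``, and the returned multiplicative factor is (1, 2, 5, 5): 0 at the
    # mask centre, rising smoothly towards 1 at the edges, so ``patch * mul`` fades the
    # image out towards the centre of the occluder.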
    @staticmethod
    def predictor(
        cropped_grid: torch.Tensor,
        nn_module: nn.Module,
        x: torch.Tensor,
        mul: torch.Tensor | float | None,
        add: torch.Tensor | float | None,
        mask_size: Sequence,
        occ_mode: str,
        activate: bool | Callable,
        module_kwargs: Mapping[str, Any],
    ) -> torch.Tensor:
        """
        Predictor function to be passed to the sliding window inferer. Takes a cropped meshgrid,
        referring to the coordinates in the input image. We use the index of the top-left corner
        in combination with ``mask_size`` to figure out which region of the image is to be occluded. The
        occlusion is performed on the original image, ``x``, using ``cropped_region * mul + add``. ``mul`` and
        ``add`` are sometimes pre-computed (e.g., a constant Gaussian blur), or they are sometimes calculated
        on the fly (e.g., the mean of the occluded patch). For this reason ``occ_mode`` is given. Lastly,
        ``activate`` is used to activate after each call of the model.

        Args:
            cropped_grid: subsection of the meshgrid, where each voxel refers to the coordinate of
                the input image. The meshgrid is created by the ``OcclusionSensitivity`` class, and
                the generation of the subset is determined by ``sliding_window_inference``.
            nn_module: module to call on data.
            x: the image that was originally passed into ``OcclusionSensitivity.__call__``.
            mul: occluded region will be multiplied by this. Can be ``torch.Tensor`` or ``float``.
            add: after multiplication, this is added to the occluded region. Can be ``torch.Tensor`` or ``float``.
            mask_size: size of box to be occluded, centred on the central voxel. Should be
                a sequence, one value for each spatial dimension.
            occ_mode: might be used to calculate ``mul`` and ``add`` on the fly.
            activate: if ``True``, do softmax activation if num_channels > 1 else do ``sigmoid``. If
                ``False``, don't do any activation. If ``callable``, use callable on inferred outputs.
            module_kwargs: kwargs to be passed onto module when inferring.
        """
        n_batch = cropped_grid.shape[0]
        sd = cropped_grid.ndim - 2
        # start with copies of x to infer
        im = torch.repeat_interleave(x, n_batch, 0)
        # get coordinates of top left corner of occluded region (possible because we use meshgrid)
        corner_coord_slices = [slice(None)] * 2 + [slice(1)] * sd
        top_corners = cropped_grid[corner_coord_slices]
        # replace occluded regions
        for b, t in enumerate(top_corners):
            # starting from corner, get the slices to extract the occluded region from the image
            slices = [slice(b, b + 1), slice(None)] + [slice(int(j), int(j) + m) for j, m in zip(t, mask_size)]
            to_occlude = im[tuple(slices)]
            if occ_mode == "mean_patch":
                # replace the patch with its own mean: multiplicative is 0, additive is the patch mean
                mul, add = OcclusionSensitivity.constant_occlusion(x, to_occlude.mean().item(), mask_size)
            if callable(occ_mode):
                to_occlude = occ_mode(x, to_occlude)
            else:
                if add is None or mul is None:
                    raise RuntimeError("Shouldn't be here, something's gone wrong...")
                to_occlude = to_occlude * mul + add
            im[tuple(slices)] = to_occlude
        # infer
        out: torch.Tensor = nn_module(im, **module_kwargs)
        # if activation is callable, call it
        if callable(activate):
            out = activate(out)
        # else if True (should be boolean), sigmoid if n_chan == 1 else softmax
        elif activate:
            out = out.sigmoid() if x.shape[1] == 1 else out.softmax(1)
        # the output will have shape [B,C] where C is the number of channels output by the model (inference classes).
        # we need to return it to sliding window inference with shape [B,C,H,W,[D]], so add dims and repeat values
        for m in mask_size:
            out = torch.repeat_interleave(out.unsqueeze(-1), m, dim=-1)
        return out
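    # Shape walk-through (hedged example): in 2D the meshgrid has ``sd = 2`` channels, so
    # with a 3-class model, a meshgrid crop of shape (10, 2, 4, 4) and ``mask_size=(4, 4)``,
    # the model output (10, 3) is expanded to (10, 3, 4, 4) above so that
    # ``sliding_window_inference`` can stitch the per-patch class scores into a spatial map.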
    @staticmethod
    def crop_meshgrid(
        grid: MetaTensor, b_box: Sequence, mask_size: Sequence
    ) -> tuple[MetaTensor, SpatialCrop, Sequence]:
        """Crop the meshgrid so we only perform occlusion sensitivity on a subsection of the image."""
        # distance from center of mask to edge is (m - 1) // 2.
        mask_edge = [(m - 1) // 2 for m in mask_size]
        bbox_min = [max(b - m, 0) for b, m in zip(b_box[::2], mask_edge)]
        bbox_max = []
        for b, m, s in zip(b_box[1::2], mask_edge, grid.shape[2:]):
            # if bbox is -ve for that dimension, no cropping so use current image size
            if b == -1:
                bbox_max.append(s)
            # else bounding box plus distance to mask edge. Make sure it's not bigger than the size of the image
            else:
                bbox_max.append(min(b + m, s))
        # No need for batch and channel slices. Batch will be removed and added back in, and
        # SpatialCrop doesn't act on the first dimension anyway.
        slices = [slice(s, e) for s, e in zip(bbox_min, bbox_max)]
        cropper = SpatialCrop(roi_slices=slices)
        cropped: MetaTensor = cropper(grid[0])[None]  # type: ignore
        mask_size = list(mask_size)
        for i, s in enumerate(cropped.shape[2:]):
            mask_size[i] = min(s, mask_size[i])
        return cropped, cropper, mask_size
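    # Worked example of the padding arithmetic (hedged): in 1D with ``b_box=[10, 20]``
    # and ``mask_size=(5,)``, ``mask_edge`` is 2, so the grid is cropped with
    # ``slice(8, 22)``; the extra margin lets the occluder be centred on every voxel
    # of the requested region.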
    def __call__(
        self, x: torch.Tensor, b_box: Sequence | None = None, **kwargs: Any
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            x: Image to use for inference. Should be a tensor consisting of 1 batch.
            b_box: Bounding box on which to perform the analysis. The output image will be limited to this size.
                There should be a minimum and maximum for all spatial dimensions: ``[min1, max1, min2, max2,...]``.

                * By default, the whole image will be used. Decreasing the size will speed the analysis up, which
                  might be useful for larger images.
                * Min and max are inclusive, so ``[0, 63, ...]`` will have size ``(64, ...)``.
                * Use -ve to use ``min=0`` and ``max=im.shape[x]-1`` for the xth dimension.
                * N.B.: we add half of the mask size to the bounding box to ensure that the region of
                  interest has a sufficiently large area surrounding it.

            kwargs: any extra arguments to be passed on to the module as part of its ``__call__``.

        Returns:
            * Occlusion map:
                * Shows the inference probabilities when the corresponding part of the image is occluded.
                  Hence, more negative values imply that region was important in the decision process.
                * The map will have shape ``BNHW(D)``, where ``N`` is the number of classes to be inferred by
                  the network, stored in the channel dimension. Hence, the occlusion for class ``i`` can be
                  seen with ``map[:, i]``.
            * Most probable class:
                * The most probable class when the corresponding part of the image is occluded
                  (``argmax(dim=1)``).

            Both images will be cropped if a bounding box is used, but voxel sizes will always match the input.
        """
        if x.shape[0] > 1:
            raise ValueError("Expected batch size of 1.")

        sd = x.ndim - 2
        mask_size: Sequence = ensure_tuple_rep(self.mask_size, sd)

        # get the meshgrid (so that sliding_window_inference can tell us which bit to occlude)
        grid: MetaTensor = MetaTensor(
            np.stack(np.meshgrid(*[np.arange(0, i) for i in x.shape[2:]], indexing="ij"))[None],
            device=x.device,
            dtype=x.dtype,
        )
        # if a bounding box is given, crop the grid to only infer subsections of the image
        if b_box is not None:
            grid, cropper, mask_size = self.crop_meshgrid(grid, b_box, mask_size)

        # check that the grid is bigger than the mask size
        if any(m > g for g, m in zip(grid.shape[2:], mask_size)):
            raise ValueError(f"Image (spatial shape) {grid.shape[2:]} should be bigger than mask {mask_size}.")

        # get additive and multiplicative factors if they are unchanged for all patches (i.e., not mean_patch)
        add: float | torch.Tensor | None
        mul: float | torch.Tensor | None
        # multiply by 0, add value
        if isinstance(self.mode, float):
            mul, add = self.constant_occlusion(x, self.mode, mask_size)
        # multiply by 0, add mean of image
        elif self.mode == "mean_img":
            mul, add = self.constant_occlusion(x, x.mean().item(), mask_size)
        # for gaussian, additive is zero, multiplicative is 1 - gaussian
        elif self.mode == "gaussian":
            mul, add = self.gaussian_occlusion(x, mask_size)
        # else will be determined on each patch individually, so calculated later
        else:
            add, mul = None, None

        with eval_mode(self.nn_module):
            # needs to go here to avoid circular import
            from monai.inferers import sliding_window_inference

            sensitivity_im: MetaTensor = sliding_window_inference(  # type: ignore
                grid,
                roi_size=mask_size,
                sw_batch_size=self.n_batch,
                predictor=OcclusionSensitivity.predictor,
                overlap=self.overlap,
                mode="gaussian" if self.mode == "gaussian" else "constant",
                progress=self.verbose,
                nn_module=self.nn_module,
                x=x,
                add=add,
                mul=mul,
                mask_size=mask_size,
                occ_mode=self.mode,
                activate=self.activate,
                module_kwargs=kwargs,
            )

        if b_box is not None:
            # undo the cropping that was applied to the meshgrid
            sensitivity_im = cropper.inverse(sensitivity_im[0])[None]  # type: ignore
            # crop using the bounding box (ignoring the mask size this time)
            bbox_min = [max(b, 0) for b in b_box[::2]]
            bbox_max = [b if b > 0 else s for b, s in zip(b_box[1::2], x.shape[2:])]
            cropper = SpatialCrop(roi_start=bbox_min, roi_end=bbox_max)
            sensitivity_im = cropper(sensitivity_im[0])[None]  # type: ignore

        # the most probable class is the max in the classification dimension (1)
        most_probable_class = sensitivity_im.argmax(dim=1, keepdim=True)
        return sensitivity_im, most_probable_class
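
if __name__ == "__main__":
    # Minimal end-to-end sketch (hedged): the model choice, mask size, and input shape
    # below are illustrative assumptions for demonstration, not part of this module's API.
    from monai.networks.nets import DenseNet121

    model = DenseNet121(spatial_dims=2, in_channels=1, out_channels=3)
    occ_sens = OcclusionSensitivity(nn_module=model, mask_size=8, n_batch=4, verbose=False)
    occ_map, most_probable = occ_sens(torch.rand(1, 1, 48, 48))
    # class scores live in dim 1, so occ_map is (1, 3, 48, 48) and most_probable is (1, 1, 48, 48)
    print(occ_map.shape, most_probable.shape)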