Source code for monai.networks.nets.torchvision_fc

# Copyright (c) MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Any

from monai.networks.nets import NetAdapter
from monai.utils import optional_import

models, _ = optional_import("torchvision.models")

__all__ = ["TorchVisionFCModel"]



[docs]
class TorchVisionFCModel(NetAdapter):
    """
    Customize the fully connected layer of (pretrained) TorchVision model or replace it by convolutional layer.

    This class supports two primary use cases:

        - use ``pool=None`` to indicate no modification in the pooling layers. It should be used with ``fc_name``
          to locate the target FC layer to modify:
          In this case, the class will load a torchvision classification model,
          replace the last fully connected (FC) layer with a new FC layer with ``num_classes`` outputs,
          example input arguments: ``use_conv=False, pool=None, fc_name="heads.head"``.
          The ``heads.head`` specifies the target FC of the input model, could be found by ``model.named_modules()``,
          for example::

              from torchvision.models import vit_b_16
              print([name[0] for name in vit_b_16().named_modules()])

        - use ``pool=""`` or set it to a tuple of pooling parameters to indicate modifications of both
          the pooling and the FC layer. It should be used with ``node_name`` to locate the model feature outputs:
          In this case, the class will load a torchvision model, remove the existing pooling and FC layers, and

          - append an additional convolution layer:
            ``use_conv=True, pool="", node_name="permute"``
          - append an additional pooling and FC layers:
            ``use_conv=False, pool=("avg", {"kernel_size": 7, "stride": 1}), node_name="permute"``
          - append an additional pooling and convolution layers:
            ``use_conv=True, pool=("avg", {"kernel_size": 7, "stride": 1}), node_name="permute"``

          The ``permute`` in the example is the target feature extraction node of the input
          `model_name`, could be found by using the torchvision feature extraction utilities, for example::

              from torchvision.models.feature_extraction import get_graph_node_names
              from torchvision.models import swin_t
              print(get_graph_node_names(swin_t())[0])

    Args:
        model_name: name of any torchvision model with fully connected layer at the end.
            ``resnet18`` (default), ``resnet34``, ``resnet50``, ``resnet101``, ``resnet152``,
            ``resnext50_32x4d``, ``resnext101_32x8d``, ``wide_resnet50_2``, ``wide_resnet101_2``, ``inception_v3``.
            model details: https://pytorch.org/vision/stable/models.html.
        num_classes: number of classes for the last classification layer. Default to 1.
        dim: number of supported spatial dimensions in the specified model, depends on the model implementation.
            default to 2 as most Torchvision models are for 2D image processing.
        in_channels: number of the input channels of last layer. if None, get it from `in_features` of last layer.
        use_conv: whether to use convolutional layer to replace the last layer, default to False.
        pool: parameters for the pooling layer, when it's a tuple, the first item is name of the pooling layer,
            the second item is dictionary of the initialization args. If None, will not replace the `layers[-2]`.
            default to `("avg", {"kernel_size": 7, "stride": 1})`. ``""`` indicates not adding a pooling layer.
        bias: the bias value when replacing the last layer. if False, the layer will not learn an additive bias,
            default to True.
        pretrained: whether to use the imagenet pretrained weights. Default to False.
        fc_name: the corresponding layer attribute of the last fully connected layer. Defaults to ``"fc"``.
        node_name: the corresponding feature extractor node name of `model`. Defaults to "", not in use.
        weights: additional weights enum for the torchvision model.
        kwargs: additional parameters for the torchvision model.

    Example::

        import torch
        from torchvision.models.inception import Inception_V3_Weights

        from monai.networks.nets import TorchVisionFCModel

        model = TorchVisionFCModel(
            "inception_v3",
            num_classes=4,
            weights=Inception_V3_Weights.IMAGENET1K_V1,
            use_conv=False,
            pool=None,
        )
        # model = TorchVisionFCModel("vit_b_16", num_classes=4, pool=None, in_channels=768, fc_name="heads")
        output = model.forward(torch.randn(2, 3, 299, 299))
        print(output.shape)  # torch.Size([2, 4])

    """

    def __init__(
        self,
        model_name: str = "resnet18",
        num_classes: int = 1,
        dim: int = 2,
        in_channels: int | None = None,
        use_conv: bool = False,
        pool: tuple[str, dict[str, Any]] | None = ("avg", {"kernel_size": 7, "stride": 1}),
        bias: bool = True,
        pretrained: bool = False,
        fc_name: str = "fc",
        node_name: str = "",
        weights=None,
        **kwargs,
    ):
        if weights is not None:
            model = getattr(models, model_name)(weights=weights, **kwargs)
        elif pretrained:
            model = getattr(models, model_name)(weights="DEFAULT", **kwargs)
        else:
            model = getattr(models, model_name)(weights=None, **kwargs)

        super().__init__(
            model=model,
            num_classes=num_classes,
            dim=dim,
            in_channels=in_channels,
            use_conv=use_conv,
            pool=pool,
            bias=bias,
            fc_name=fc_name,
            node_name=node_name,
        )