Source code for mindvision.classification.models.vision_transform

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Vision Transformer."""

from typing import Optional
import ml_collections as collections

from mindspore import nn

from mindvision.classification.models.backbones.vit import ViT
from mindvision.classification.models.classifiers import BaseClassifier
from mindvision.classification.models.head import DenseHead, MultilayerDenseHead
from mindvision.classification.utils.model_urls import model_urls
from mindvision.utils.load_pretrained_model import LoadPretrainedModel

__all__ = [
    'vit_b_16',
    'vit_l_16',
    'vit_b_32',
    'vit_l_32',
]


def vit(image_size: int,
        input_channels: int,
        patch_size: int,
        embed_dim: int,
        num_layers: int,
        num_heads: int,
        num_classes: int,
        mlp_dim: int,
        dropout: float = 0.,
        attention_dropout: float = 0.,
        drop_path_dropout: float = 0.,
        activation: nn.Cell = nn.GELU,
        norm: nn.Cell = nn.LayerNorm,
        pool: str = 'cls',
        representation_size: Optional[int] = None,
        pretrained: bool = False,
        arch: Optional[str] = None) -> ViT:
    """Vision Transformer architecture."""
    backbone = ViT(image_size=image_size,
                   input_channels=input_channels,
                   patch_size=patch_size,
                   embed_dim=embed_dim,
                   num_layers=num_layers,
                   num_heads=num_heads,
                   mlp_dim=mlp_dim,
                   keep_prob=1.0 - dropout,
                   attention_keep_prob=1.0 - attention_dropout,
                   drop_path_keep_prob=1.0 - drop_path_dropout,
                   activation=activation,
                   norm=norm,
                   pool=pool)
    if representation_size:
        head = MultilayerDenseHead(input_channel=embed_dim,
                                   num_classes=num_classes,
                                   mid_channel=[representation_size],
                                   activation=['tanh', None],
                                   keep_prob=[1.0, 1.0])
    else:
        head = DenseHead(input_channel=embed_dim,
                         num_classes=num_classes)

    model = BaseClassifier(backbone=backbone, head=head)

    if pretrained:
        # Download the pre-trained checkpoint registered for `arch` and load it into the model.
        LoadPretrainedModel(model, model_urls[arch]).run()

    return model
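
# Usage sketch (illustrative, not part of the module): the generic ``vit`` factory above can
# also be called directly to assemble a non-standard Vision Transformer. The hyper-parameter
# values below are examples only; no pre-trained checkpoint is registered for such a model,
# so ``pretrained`` must stay False.
#
#     >>> import numpy as np
#     >>> import mindspore as ms
#     >>> net = vit(image_size=224, input_channels=3, patch_size=16, embed_dim=384,
#     ...           num_layers=6, num_heads=6, num_classes=10, mlp_dim=1536)
#     >>> x = ms.Tensor(np.ones([1, 3, 224, 224]), ms.float32)
#     >>> print(net(x).shape)
#     (1, 10)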


def vit_b_16(num_classes: int = 1000,
             image_size: int = 224,
             has_logits: bool = False,
             pretrained: bool = False,
             drop_out: float = 0.0,
             attention_dropout: float = 0.0,
             drop_path_dropout: float = 0.0
             ) -> ViT:
    """
    Constructs a vit_b_16 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale
    <https://arxiv.org/abs/2010.11929>`_.

    Args:
        num_classes (int): The number of classes. Default: 1000.
        image_size (int): The input image size. Default: 224 for ImageNet.
        has_logits (bool): Whether to use the pre-logits representation layer in the head. Default: False.
        pretrained (bool): Whether to download and load the pre-trained model. Default: False.
        drop_out (float): The dropout rate. Default: 0.0.
        attention_dropout (float): The attention dropout rate. Default: 0.0.
        drop_path_dropout (float): The stochastic depth rate. Default: 0.0.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.

    Supported Platforms:
        ``GPU``

    Examples:
        >>> import numpy as np
        >>>
        >>> import mindspore as ms
        >>> from mindvision.classification.models import vit_b_16
        >>>
        >>> net = vit_b_16()
        >>> x = ms.Tensor(np.ones([1, 3, 224, 224]), ms.float32)
        >>> output = net(x)
        >>> print(output.shape)
        (1, 1000)

    About ViT:

        Vision Transformer (ViT) shows that a pure transformer applied directly to sequences of
        image patches can perform very well on image classification tasks. When pre-trained on
        large amounts of data and transferred to multiple mid-sized or small image recognition
        benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent
        results compared to state-of-the-art convolutional networks while requiring substantially
        fewer computational resources to train.

    Citation:

    .. code-block::

        @article{2020An,
            title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
            author={Dosovitskiy, A. and Beyer, L. and Kolesnikov, A. and Weissenborn, D. and Houlsby, N.},
            year={2020},
        }
    """
    config = collections.ConfigDict()
    config.arch = 'vit_b_16_' + str(image_size)
    config.image_size = image_size
    config.num_classes = num_classes
    config.patch_size = 16
    config.embed_dim = 768
    config.mlp_dim = 3072
    config.num_heads = 12
    config.num_layers = 12
    config.dropout = drop_out
    config.attention_dropout = attention_dropout
    config.drop_path_dropout = drop_path_dropout
    config.pretrained = pretrained
    config.input_channels = 3
    config.pool = 'cls'
    config.representation_size = 768 if has_logits else None

    return vit(**config)
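
# Illustrative note (not part of the module): ``has_logits=True`` only changes the classification
# head. ``vit`` then builds a ``MultilayerDenseHead`` with a tanh pre-logits layer of width 768
# instead of a single ``DenseHead``, so the output shape is unchanged. A quick sketch:
#
#     >>> import numpy as np
#     >>> import mindspore as ms
#     >>> net = vit_b_16(num_classes=10, has_logits=True)
#     >>> x = ms.Tensor(np.ones([1, 3, 224, 224]), ms.float32)
#     >>> print(net(x).shape)
#     (1, 10)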


def vit_l_16(num_classes: int = 1000,
             image_size: int = 224,
             has_logits: bool = False,
             pretrained: bool = False,
             drop_out: float = 0.0,
             attention_dropout: float = 0.0,
             drop_path_dropout: float = 0.0
             ) -> ViT:
    """
    Constructs a vit_l_16 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale
    <https://arxiv.org/abs/2010.11929>`_.

    Args:
        num_classes (int): The number of classes. Default: 1000.
        image_size (int): The input image size. Default: 224 for ImageNet.
        has_logits (bool): Whether to use the pre-logits representation layer in the head. Default: False.
        pretrained (bool): Whether to download and load the pre-trained model. Default: False.
        drop_out (float): The dropout rate. Default: 0.0.
        attention_dropout (float): The attention dropout rate. Default: 0.0.
        drop_path_dropout (float): The stochastic depth rate. Default: 0.0.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.

    Supported Platforms:
        ``GPU``

    Examples:
        >>> import numpy as np
        >>>
        >>> import mindspore as ms
        >>> from mindvision.classification.models import vit_l_16
        >>>
        >>> net = vit_l_16()
        >>> x = ms.Tensor(np.ones([1, 3, 224, 224]), ms.float32)
        >>> output = net(x)
        >>> print(output.shape)
        (1, 1000)

    About ViT:

        Vision Transformer (ViT) shows that a pure transformer applied directly to sequences of
        image patches can perform very well on image classification tasks. When pre-trained on
        large amounts of data and transferred to multiple mid-sized or small image recognition
        benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent
        results compared to state-of-the-art convolutional networks while requiring substantially
        fewer computational resources to train.

    Citation:

    .. code-block::

        @article{2020An,
            title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
            author={Dosovitskiy, A. and Beyer, L. and Kolesnikov, A. and Weissenborn, D. and Houlsby, N.},
            year={2020},
        }
    """
    config = collections.ConfigDict()
    config.arch = 'vit_l_16_' + str(image_size)
    config.image_size = image_size
    config.num_classes = num_classes
    config.patch_size = 16
    config.embed_dim = 1024
    config.mlp_dim = 4096
    config.num_heads = 16
    config.num_layers = 24
    config.dropout = drop_out
    config.attention_dropout = attention_dropout
    config.drop_path_dropout = drop_path_dropout
    config.input_channels = 3
    config.pool = 'cls'
    config.pretrained = pretrained
    config.representation_size = 1024 if has_logits else None

    return vit(**config)


def vit_b_32(num_classes: int = 1000,
             image_size: int = 224,
             has_logits: bool = False,
             pretrained: bool = False,
             drop_out: float = 0.0,
             attention_dropout: float = 0.0,
             drop_path_dropout: float = 0.0
             ) -> ViT:
    """
    Constructs a vit_b_32 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale
    <https://arxiv.org/abs/2010.11929>`_.

    Args:
        num_classes (int): The number of classes. Default: 1000.
        image_size (int): The input image size. Default: 224 for ImageNet.
        has_logits (bool): Whether to use the pre-logits representation layer in the head. Default: False.
        pretrained (bool): Whether to download and load the pre-trained model. Default: False.
        drop_out (float): The dropout rate. Default: 0.0.
        attention_dropout (float): The attention dropout rate. Default: 0.0.
        drop_path_dropout (float): The stochastic depth rate. Default: 0.0.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.

    Supported Platforms:
        ``GPU``

    Examples:
        >>> import numpy as np
        >>>
        >>> import mindspore as ms
        >>> from mindvision.classification.models import vit_b_32
        >>>
        >>> net = vit_b_32()
        >>> x = ms.Tensor(np.ones([1, 3, 224, 224]), ms.float32)
        >>> output = net(x)
        >>> print(output.shape)
        (1, 1000)

    About ViT:

        Vision Transformer (ViT) shows that a pure transformer applied directly to sequences of
        image patches can perform very well on image classification tasks. When pre-trained on
        large amounts of data and transferred to multiple mid-sized or small image recognition
        benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent
        results compared to state-of-the-art convolutional networks while requiring substantially
        fewer computational resources to train.

    Citation:

    .. code-block::

        @article{2020An,
            title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
            author={Dosovitskiy, A. and Beyer, L. and Kolesnikov, A. and Weissenborn, D. and Houlsby, N.},
            year={2020},
        }
    """
    config = collections.ConfigDict()
    config.arch = 'vit_b_32_' + str(image_size)
    config.image_size = image_size
    config.num_classes = num_classes
    config.patch_size = 32
    config.embed_dim = 768
    config.mlp_dim = 3072
    config.num_heads = 12
    config.num_layers = 12
    config.dropout = drop_out
    config.attention_dropout = attention_dropout
    config.drop_path_dropout = drop_path_dropout
    config.pretrained = pretrained
    config.input_channels = 3
    config.pool = 'cls'
    config.representation_size = 768 if has_logits else None

    return vit(**config)


def vit_l_32(num_classes: int = 1000,
             image_size: int = 224,
             has_logits: bool = False,
             pretrained: bool = False,
             drop_out: float = 0.0,
             attention_dropout: float = 0.0,
             drop_path_dropout: float = 0.0
             ) -> ViT:
    """
    Constructs a vit_l_32 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale
    <https://arxiv.org/abs/2010.11929>`_.

    Args:
        num_classes (int): The number of classes. Default: 1000.
        image_size (int): The input image size. Default: 224 for ImageNet.
        has_logits (bool): Whether to use the pre-logits representation layer in the head. Default: False.
        pretrained (bool): Whether to download and load the pre-trained model. Default: False.
        drop_out (float): The dropout rate. Default: 0.0.
        attention_dropout (float): The attention dropout rate. Default: 0.0.
        drop_path_dropout (float): The stochastic depth rate. Default: 0.0.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.

    Supported Platforms:
        ``GPU``

    Examples:
        >>> import numpy as np
        >>>
        >>> import mindspore as ms
        >>> from mindvision.classification.models import vit_l_32
        >>>
        >>> net = vit_l_32()
        >>> x = ms.Tensor(np.ones([1, 3, 224, 224]), ms.float32)
        >>> output = net(x)
        >>> print(output.shape)
        (1, 1000)

    About ViT:

        Vision Transformer (ViT) shows that a pure transformer applied directly to sequences of
        image patches can perform very well on image classification tasks. When pre-trained on
        large amounts of data and transferred to multiple mid-sized or small image recognition
        benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent
        results compared to state-of-the-art convolutional networks while requiring substantially
        fewer computational resources to train.

    Citation:

    .. code-block::

        @article{2020An,
            title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
            author={Dosovitskiy, A. and Beyer, L. and Kolesnikov, A. and Weissenborn, D. and Houlsby, N.},
            year={2020},
        }
    """
    config = collections.ConfigDict()
    config.arch = 'vit_l_32_' + str(image_size)
    config.image_size = image_size
    config.num_classes = num_classes
    config.patch_size = 32
    config.embed_dim = 1024
    config.mlp_dim = 4096
    config.num_heads = 16
    config.num_layers = 24
    config.dropout = drop_out
    config.attention_dropout = attention_dropout
    config.drop_path_dropout = drop_path_dropout
    config.pretrained = pretrained
    config.input_channels = 3
    config.pool = 'cls'
    config.representation_size = 1024 if has_logits else None

    return vit(**config)
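
# Illustrative comparison (not part of the module): the ``vit_b_*`` and ``vit_l_*`` constructors
# differ only in the transformer hyper-parameters set above (embed_dim, mlp_dim, num_heads,
# num_layers), while ``_16`` versus ``_32`` is the patch size. A quick way to confirm the size
# gap between the Base and Large variants:
#
#     >>> count = lambda net: sum(p.size for p in net.trainable_params())
#     >>> print(count(vit_b_32()) < count(vit_l_32()))
#     True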