Source code for mindarmour.attacks.carlini_wagner

# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Carlini-wagner Attack.
"""
import numpy as np

from mindspore import Tensor
from mindspore.nn import Cell

from mindarmour.attacks.attack import Attack
from mindarmour.utils.logger import LogUtil
from mindarmour.utils._check_param import check_numpy_param, check_model, \
    check_pair_numpy_param, check_int_positive, check_param_type, \
    check_param_multi_types, check_value_positive, check_equal_shape
from mindarmour.utils.util import GradWrap
from mindarmour.utils.util import jacobian_matrix

LOGGER = LogUtil.get_instance()
TAG = 'CW'


def _best_logits_of_other_class(logits, target_class, value=1):
    """
    Choose the index of the largest logits exclude target class.

    Args:
        logits (numpy.ndarray): Predict logits of samples.
        target_class (numpy.ndarray): Target labels.
        value (float): Maximum value of output logits. Default: 1.

    Returns:
        numpy.ndarray, the index of the largest logits exclude the target
        class.

    Examples:
        >>> other_class = _best_logits_of_other_class([[0.2, 0.3, 0.5],
        >>> [0.3, 0.4, 0.3]], [2, 1])
    """
    LOGGER.debug(TAG, "enter the func _best_logits_of_other_class.")
    logits, target_class = check_pair_numpy_param('logits', logits,
                                                  'target_class', target_class)
    res = np.zeros_like(logits)
    for i in range(logits.shape[0]):
        res[i][target_class[i]] = value
    return np.argmax(logits - res, axis=1)


[docs]class CarliniWagnerL2Attack(Attack):
    """
    The Carlini & Wagner attack using L2 norm.

    References: `Nicholas Carlini, David Wagner: "Towards Evaluating
    the Robustness of Neural Networks" <https://arxiv.org/abs/1608.04644>`_

    Args:
        network (Cell): Target model.
        num_classes (int): Number of labels of model output, which should be
            greater than zero.
        box_min (float): Lower bound of input of the target model. Default: 0.
        box_max (float): Upper bound of input of the target model. Default: 1.0.
        bin_search_steps (int): The number of steps for the binary search
            used to find the optimal trade-off constant between distance
            and confidence. Default: 5.
        max_iterations (int): The maximum number of iterations, which should be
            greater than zero. Default: 1000.
        confidence (float): Confidence of the output of adversarial examples.
            Default: 0.
        learning_rate (float): The learning rate for the attack algorithm.
            Default: 5e-3.
        initial_const (float): The initial trade-off constant to use to balance
            the relative importance of perturbation norm and confidence
            difference. Default: 1e-2.
        abort_early_check_ratio (float): Check loss progress every ratio of
            all iteration. Default: 5e-2.
        targeted (bool): If True, targeted attack. If False, untargeted attack.
            Default: False.
        fast (bool): If True, return the first found adversarial example.
            If False, return the adversarial samples with smaller
            perturbations. Default: True.
        abort_early (bool): If True, Adam will be aborted if the loss hasn't
            decreased for some time. If False, Adam will continue work until the
            max iterations is arrived. Default: True.
        sparse (bool): If True, input labels are sparse-coded. If False,
            input labels are onehot-coded. Default: True.

    Examples:
        >>> attack = CarliniWagnerL2Attack(network)
    """

    def __init__(self, network, num_classes, box_min=0.0, box_max=1.0,
                 bin_search_steps=5, max_iterations=1000, confidence=0,
                 learning_rate=5e-3, initial_const=1e-2,
                 abort_early_check_ratio=5e-2, targeted=False,
                 fast=True, abort_early=True, sparse=True):
        LOGGER.info(TAG, "init CW object.")
        super(CarliniWagnerL2Attack, self).__init__()
        self._network = check_model('network', network, Cell)
        self._network.set_grad(True)
        self._num_classes = check_int_positive('num_classes', num_classes)
        self._min = check_param_type('box_min', box_min, float)
        self._max = check_param_type('box_max', box_max, float)
        self._bin_search_steps = check_int_positive('search_steps',
                                                    bin_search_steps)
        self._max_iterations = check_int_positive('max_iterations',
                                                  max_iterations)
        self._confidence = check_param_multi_types('confidence', confidence,
                                                   [int, float])
        self._learning_rate = check_value_positive('learning_rate',
                                                   learning_rate)
        self._initial_const = check_value_positive('initial_const',
                                                   initial_const)
        self._abort_early = check_param_type('abort_early', abort_early, bool)
        self._fast = check_param_type('fast', fast, bool)
        self._abort_early_check_ratio = check_value_positive('abort_early_check_ratio',
                                                             abort_early_check_ratio)
        self._targeted = check_param_type('targeted', targeted, bool)
        self._net_grad = GradWrap(self._network)
        self._sparse = check_param_type('sparse', sparse, bool)
        self._dtype = None

    def _loss_function(self, logits, new_x, org_x, org_or_target_class,
                       constant, confidence):
        """
        Calculate the value of loss function and gradients of loss w.r.t inputs.

        Args:
            logits (numpy.ndarray): The output of network before softmax.
            new_x (numpy.ndarray): Adversarial examples.
            org_x (numpy.ndarray): Original benign input samples.
            org_or_target_class (numpy.ndarray): Original/target labels.
            constant (float): A trade-off constant to use to balance loss
                and perturbation norm.
            confidence (float): Confidence level of the output of adversarial
                examples.

        Returns:
            numpy.ndarray, norm of perturbation, sum of the loss and the
            norm, and gradients of the sum w.r.t inputs.

        Raises:
            ValueError: If loss is less than 0.

        Examples:
            >>> L2_loss, total_loss, dldx = self._loss_function([0.2 , 0.3,
            >>> 0.5], [0.1, 0.2, 0.2, 0.4], [0.12, 0.2, 0.25, 0.4], [1], 2, 0)
        """
        LOGGER.debug(TAG, "enter the func _loss_function.")

        logits = check_numpy_param('logits', logits)
        org_x = check_numpy_param('org_x', org_x)
        new_x, org_or_target_class = check_pair_numpy_param('new_x',
                                                            new_x,
                                                            'org_or_target_class',
                                                            org_or_target_class)

        new_x, org_x = check_equal_shape('new_x', new_x, 'org_x', org_x)

        other_class_index = _best_logits_of_other_class(
            logits, org_or_target_class, value=np.inf)
        loss1 = np.sum((new_x - org_x)**2,
                       axis=tuple(range(len(new_x.shape))[1:]))
        loss2 = np.zeros_like(loss1, dtype=self._dtype)
        loss2_grade = np.zeros_like(new_x, dtype=self._dtype)
        jaco_grad = jacobian_matrix(self._net_grad, new_x, self._num_classes)
        if self._targeted:
            for i in range(org_or_target_class.shape[0]):
                loss2[i] = max(0, logits[i][other_class_index[i]]
                               - logits[i][org_or_target_class[i]]
                               + confidence)
                loss2_grade[i] = constant[i]*(jaco_grad[other_class_index[
                    i]][i] - jaco_grad[org_or_target_class[i]][i])
        else:
            for i in range(org_or_target_class.shape[0]):
                loss2[i] = max(0, logits[i][org_or_target_class[i]]
                               - logits[i][other_class_index[i]] + confidence)
                loss2_grade[i] = constant[i]*(jaco_grad[org_or_target_class[
                    i]][i] - jaco_grad[other_class_index[i]][i])
        total_loss = loss1 + constant*loss2
        loss1_grade = 2*(new_x - org_x)
        for i in range(org_or_target_class.shape[0]):
            if loss2[i] < 0:
                msg = 'loss value should greater than or equal to 0, ' \
                      'but got loss2 {}'.format(loss2[i])
                LOGGER.error(TAG, msg)
                raise ValueError(msg)
            if loss2[i] == 0:
                loss2_grade[i, ...] = 0
        total_loss_grade = loss1_grade + loss2_grade
        return loss1, total_loss, total_loss_grade

    def _to_attack_space(self, inputs):
        """
        Transform input data into attack space.

        Args:
            inputs (numpy.ndarray): Input data.

        Returns:
            numpy.ndarray, transformed data which belongs to attack space.

        Examples:
            >>> x_att = self._to_attack_space([0.2, 0.3, 0.3])
        """
        LOGGER.debug(TAG, "enter the func _to_attack_space.")

        inputs = check_numpy_param('inputs', inputs)
        mean = (self._min + self._max) / 2
        diff = (self._max - self._min) / 2
        inputs = (inputs - mean) / diff
        inputs = inputs*0.999999
        return np.arctanh(inputs)

    def _to_model_space(self, inputs):
        """
        Transform input data into model space.

        Args:
            inputs (numpy.ndarray): Input data.

        Returns:
            numpy.ndarray, transformed data which belongs to model space
            and the gradient of x_model w.r.t. x_att.

        Examples:
            >>> x_att = self._to_model_space([10, 21, 9])
        """
        LOGGER.debug(TAG, "enter the func _to_model_space.")

        inputs = check_numpy_param('inputs', inputs)
        inputs = np.tanh(inputs)
        the_grad = 1 - np.square(inputs)
        mean = (self._min + self._max) / 2
        diff = (self._max - self._min) / 2
        inputs = inputs*diff + mean
        the_grad = the_grad*diff
        return inputs, the_grad

[docs]    def generate(self, inputs, labels):
        """
        Generate adversarial examples based on input data and targeted labels.

        Args:
            inputs (numpy.ndarray): Input samples.
            labels (numpy.ndarray): The ground truth label of input samples
                or target labels.

        Returns:
            numpy.ndarray, generated adversarial examples.

        Examples:
            >>> advs = attack.generate([[0.1, 0.2, 0.6], [0.3, 0, 0.4]], [1, 2]]
        """

        LOGGER.debug(TAG, "enter the func generate.")
        inputs, labels = check_pair_numpy_param('inputs', inputs,
                                                'labels', labels)
        if not self._sparse:
            labels = np.argmax(labels, axis=1)
        self._dtype = inputs.dtype
        att_original = self._to_attack_space(inputs)
        reconstructed_original, _ = self._to_model_space(att_original)

        # find an adversarial sample
        const = np.ones_like(labels, dtype=self._dtype)*self._initial_const
        lower_bound = np.zeros_like(labels, dtype=self._dtype)
        upper_bound = np.ones_like(labels, dtype=self._dtype)*np.inf
        adversarial_res = inputs.copy()
        adversarial_loss = np.ones_like(labels, dtype=self._dtype)*np.inf
        samples_num = labels.shape[0]
        adv_flag = np.zeros_like(labels)
        for binary_search_step in range(self._bin_search_steps):
            if (binary_search_step == self._bin_search_steps - 1) and \
                    (self._bin_search_steps >= 10):
                const = min(1e10, upper_bound)
            LOGGER.debug(TAG,
                         'starting optimization with const = %s',
                         str(const))

            att_perturbation = np.zeros_like(att_original, dtype=self._dtype)
            loss_at_previous_check = np.ones_like(labels, dtype=self._dtype)*np.inf

            # create a new optimizer to minimize the perturbation
            optimizer = _AdamOptimizer(att_perturbation.shape)

            for iteration in range(self._max_iterations):
                x_input, dxdp = self._to_model_space(
                    att_original + att_perturbation)
                logits = self._network(Tensor(x_input)).asnumpy()

                current_l2_loss, current_loss, dldx = self._loss_function(
                    logits, x_input, reconstructed_original,
                    labels, const, self._confidence)

                # check if attack success (include all examples)
                if self._targeted:
                    is_adv = (np.argmax(logits, axis=1) == labels)
                else:
                    is_adv = (np.argmax(logits, axis=1) != labels)

                for i in range(samples_num):
                    if is_adv[i]:
                        adv_flag[i] = True
                        if current_l2_loss[i] < adversarial_loss[i]:
                            adversarial_res[i] = x_input[i]
                            adversarial_loss[i] = current_l2_loss[i]

                if np.all(adv_flag):
                    if self._fast:
                        LOGGER.debug(TAG, "succeed find adversarial examples.")
                        msg = 'iteration: {}, logits_att: {}, ' \
                              'loss: {}, l2_dist: {}' \
                            .format(iteration,
                                    np.argmax(logits, axis=1),
                                    current_loss, current_l2_loss)
                        LOGGER.debug(TAG, msg)
                        return adversarial_res

                dldx, inputs = check_equal_shape('dldx', dldx, 'inputs', inputs)

                gradient = dldx*dxdp
                att_perturbation += \
                    optimizer(gradient, self._learning_rate)

                # check if should stop iteration early
                flag = True
                iter_check = iteration % (np.ceil(
                    self._max_iterations*self._abort_early_check_ratio))
                if self._abort_early and iter_check == 0:
                    # check progress
                    for i in range(inputs.shape[0]):
                        if current_loss[i] <= .9999*loss_at_previous_check[i]:
                            flag = False
                    # stop Adam if all samples has no progress
                    if flag:
                        LOGGER.debug(TAG,
                                     'step:%d, no progress yet, stop iteration',
                                     binary_search_step)
                        break
                    loss_at_previous_check = current_loss

            for i in range(samples_num):
                # update bound based on search result
                if adv_flag[i]:
                    LOGGER.debug(TAG,
                                 'example %d, found adversarial with const=%f',
                                 i, const[i])
                    upper_bound[i] = const[i]
                else:
                    LOGGER.debug(TAG,
                                 'example %d, failed to find adversarial'
                                 ' with const=%f',
                                 i, const[i])
                    lower_bound[i] = const[i]

                if upper_bound[i] == np.inf:
                    const[i] *= 10
                else:
                    const[i] = (lower_bound[i] + upper_bound[i]) / 2

        return adversarial_res


class _AdamOptimizer:
    """
    AdamOptimizer is used to calculate the optimum attack step.

    Args:
        shape (tuple): The shape of perturbations.

    Examples:
        >>> optimizer = _AdamOptimizer(att_perturbation.shape)
    """

    def __init__(self, shape):
        self._m = np.zeros(shape)
        self._v = np.zeros(shape)
        self._t = 0

    def __call__(self, gradient, learning_rate=0.001,
                 beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Calculate the optimum perturbation for each iteration.

        Args:
            gradient (numpy.ndarray): The gradient of the loss w.r.t. to the
                variable.
            learning_rate (float): The learning rate in the current iteration.
                Default: 0.001.
            beta1 (float): Decay rate for calculating the exponentially
                decaying average of past gradients. Default: 0.9.
            beta2 (float): Decay rate for calculating the exponentially
                decaying average of past squared gradients. Default: 0.999.
            epsilon (float): Small value to avoid division by zero.
                Default: 1e-8.

        Returns:
            numpy.ndarray, perturbations.

        Examples:
            >>> perturbs = optimizer([0.2, 0.1, 0.15], 0.005)
        """
        gradient = check_numpy_param('gradient', gradient)
        self._t += 1
        self._m = beta1*self._m + (1 - beta1)*gradient
        self._v = beta2*self._v + (1 - beta2)*gradient**2
        alpha = learning_rate*np.sqrt(1 - beta2**self._t) / (1 - beta1**self._t)
        pertur = -alpha*self._m / (np.sqrt(self._v) + epsilon)
        return pertur