# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""grad freeze"""
import numpy as np
from mindspore.nn.cell import Cell
from mindspore.nn.optim import Optimizer
from mindspore.common import Tensor
from mindspore.common import dtype as mstype
from mindspore.nn.optim import LARS
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.ops import functional as F
from .base import ParameterProcess
from .grad_accumulation import GradientAccumulation
# Names exported by this module.
__all__ = ['GradientFreeze', 'FreezeOpt', 'freeze_cell']
# Freezing strategy id: local continuous freezing, index sequence shaped like '00001234'.
CONTINUOUS_STRATEGY = 0
# Freezing strategy id: interval freezing, index sequence shaped like '01020304'.
INTERVAL_STRATEGY = 1
class FreezeOpt(Cell):
    """
    Optimizer that supports gradients freezing training.

    One optimizer is built per parameter group, reusing the class and the init
    arguments of the optimizer passed in. The results are kept in ``self.opts``.

    Args:
        opt (Cell): non-freezing optimizer instance, such as 'Momentum', 'SGD'.
        train_parameter_groups (Union[tuple, list]): Groups of parameters for gradients freezing training.
            When None, 10 groups are taken from ``opt.parameters``; the i-th group is the
            slice that starts at index ``6 * i``. Default: None.
        train_strategy (Union[tuple(int), list(int), Tensor]): Strategy for gradients freezing training.
            A tuple or list is converted to an int32 Tensor; a Tensor must be 1-dimensional
            with dtype int32. Default: None.

    Raises:
        TypeError: If `opt` is not an Optimizer, if `train_parameter_groups` (or one of its
            elements) is not a tuple or list, or if `train_strategy` is none of None, tuple,
            list or Tensor.
        ValueError: If `train_strategy` is given without `train_parameter_groups`, if an element
            of a tuple/list `train_strategy` is not an int, or if a Tensor `train_strategy` is
            not 1-dimensional int32.

    Supported Platforms:
        ``Ascend``
    """

    def __init__(self, opt, train_parameter_groups=None, train_strategy=None):
        super().__init__()
        if not isinstance(opt, Optimizer):
            raise TypeError(
                f"The first arg 'opt' must be an Optimizer instance, but got {type(opt)}")
        if train_parameter_groups is None and train_strategy is not None:
            raise ValueError("When the 'train_strategy' is specified, the value of 'train_parameter_groups' "
                             "must also be specified")

        # For LARS the wrapped optimizer supplies class, init args and parameters;
        # the LARS arguments are remembered separately so the wrapper can be rebuilt.
        self.is_lars = isinstance(opt, LARS)
        if self.is_lars:
            self.lars_init_args = opt.init_args
            inner_opt = opt.opt
        else:
            inner_opt = opt
        self.opt_class = type(inner_opt)
        self.opt_init_args = inner_opt.init_args
        self.parameters = inner_opt.parameters

        self.opts = []
        if train_parameter_groups is None:
            # Default grouping: 10 nested tail slices, each starting 6 parameters later.
            default_group_count = 10
            default_stride = 6
            all_params = opt.parameters
            self.opts = [self._generate_new_optimizer(all_params[(idx * default_stride):])
                         for idx in range(default_group_count)]
        else:
            if not isinstance(train_parameter_groups, (tuple, list)):
                raise TypeError(
                    "The specified 'train_parameter_groups' should be tuple or list")
            for group in train_parameter_groups:
                if not isinstance(group, (tuple, list)):
                    raise TypeError("The each element of 'train_parameter_groups' should be tuple or list "
                                    "to store the Parameter")
                # generate one-to-one opt corresponding to the parameter group
                self.opts.append(self._generate_new_optimizer(group))

        if train_strategy is None:
            self.train_strategy = None
        elif isinstance(train_strategy, (tuple, list)):
            if not all(isinstance(item, int) for item in train_strategy):
                raise ValueError(
                    "The element in train_strategy should be int number")
            self.train_strategy = Tensor(train_strategy, mstype.int32)
        elif isinstance(train_strategy, Tensor):
            if train_strategy.ndim != 1 or train_strategy.dtype != mstype.int32:
                raise ValueError("When train_strategy is a Tensor, the dimension should be 1 and "
                                 "the dtype should be int32")
            self.train_strategy = train_strategy
        else:
            raise TypeError(
                "The specified 'train_strategy' should be None, tuple, list or Tensor")

    def _generate_new_optimizer(self, params):
        """Build an optimizer for `params` from the stored class and init args, LARS-wrapped if needed."""
        inner_opt = self.opt_class(params=params, **self.opt_init_args)
        if self.is_lars:
            return LARS(inner_opt, **self.lars_init_args)
        return inner_opt
class _TrainFreezeCell(Cell):
    r"""
    Gradient freezing training network.

    Args:
        net (Cell): The training network.
        sens (numbers.Number): The scaling number to be filled as the input of backpropagation.
        grad (Callable): Gradient operation. It is invoked as
            ``grad(net, parameters)(*inputs, sens)``.
        grad_reducer (Callable): Applied to the gradients before they are consumed, e.g. a gradient
            reducer Cell which applies communication and average operations on single-process
            gradient values.
        use_grad_accumulation (bool): Whether use grad accumulation.
        optimizer (Union[Cell]): Optimizer for updating the weights. Its ``parameters`` are the
            ones gradients are computed for.
        max_accumulation_step (numbers.Number): Max grad accumulation steps. Default: 1.

    Supported Platforms:
        ``Ascend``
    """

    def __init__(self, net, sens, grad, grad_reducer, use_grad_accumulation, optimizer, max_accumulation_step=1):
        super(_TrainFreezeCell, self).__init__(auto_prefix=False)
        self.net = net
        self.grad = grad
        self.grad_reducer = grad_reducer
        self.opt = optimizer
        self.parameters = optimizer.parameters
        self.sens = sens
        self.use_grad_accumulation = use_grad_accumulation
        self.max_accumulation_step = max_accumulation_step
        if use_grad_accumulation:
            # The optimizer is stored as `self.opt`; this cell never sets an
            # `optimizer` attribute, so that name must not be read here.
            self.grad_accumulation = GradientAccumulation(
                self.max_accumulation_step, self.opt)

    def construct(self, *inputs):
        """Run the forward pass, compute and reduce gradients, then accumulate them or apply the optimizer.

        Returns the loss, made dependent on the update so the update is not dropped.
        """
        loss = self.net(*inputs)
        # Sensitivity tensor matching the loss in dtype and shape.
        sens = F.fill(loss.dtype, loss.shape, self.sens)
        grads = self.grad(self.net, self.parameters)(*inputs, sens)
        grads = self.grad_reducer(grads)
        if self.use_grad_accumulation:
            loss = self.grad_accumulation(loss, grads)
        else:
            loss = F.depend(loss, self.opt(grads))
        return loss
class GradientFreeze:
    r"""
    Freezing the gradients of some layers randomly. The number and
    probability of frozen layers can be configured by users.

    Args:
        param_groups (int): Number of parameter groups for gradients freezing training.
            It is used as a count (group splitting and index generation), not as a container.
        freeze_type (int): Strategy of gradients freezing training, select from [0, 1].
        freeze_p (float): probability of gradients freezing training.
        total_steps (numbers.Number): Steps of the whole training.

    Examples:
        >>> gradient_freeze_class = boost.GradientFreeze(10, 1, 0.5, 2000)
        >>> network, optimizer = gradient_freeze_class.freeze_generate(network, optimizer)
    """

    def __init__(self, param_groups, freeze_type, freeze_p, total_steps):
        self._param_groups = param_groups
        self._freeze_type = freeze_type
        self._freeze_p = freeze_p
        self._total_steps = total_steps
        self.grad_reducer = F.identity
        self._param_processer = ParameterProcess()

    def split_parameters_groups(self, net, freeze_para_groups_number):
        r"""
        Split parameter groups for gradients freezing training.

        Trainable parameters are first packed into consecutive buckets, then
        ``freeze_para_groups_number`` nested groups are produced: group ``i`` holds every
        parameter from bucket ``i * stride`` to the end, where
        ``stride = bucket_count // freeze_para_groups_number``. When there are fewer buckets
        than requested groups the stride is 0 and every group holds all parameters.

        Args:
            net (Cell): The training network.
            freeze_para_groups_number (int): The number of gradient freeze groups.

        Returns:
            list[list], one flat parameter list per freeze group.
        """
        buckets = []
        current = []
        for para in net.trainable_params():
            name = para.name
            # ensure 'bn' after 'conv' is not split: 'bn'/'bias' parameters always stay in
            # the current bucket; any other parameter opens a new bucket once the current
            # one already holds at least 3 parameters.
            if 'bn' not in name and 'bias' not in name and len(current) >= 3:
                buckets.append(current)
                current = [para]
            else:
                current.append(para)
        if current:
            buckets.append(current)

        stride = len(buckets) // freeze_para_groups_number
        # Flatten each tail of buckets in one pass instead of repeated list concatenation.
        return [[para for bucket in buckets[i * stride:] for para in bucket]
                for i in range(freeze_para_groups_number)]

    def generate_freeze_index_sequence(self, parameter_groups_number, freeze_strategy, freeze_p, total_steps):
        r"""
        Generate index sequence for gradient freezing training.

        Args:
            parameter_groups_number (int): The number of parameter groups.
            freeze_strategy (int): Gradient freeze grouping strategy, select from [0, 1].
            freeze_p (float): Gradient freezing probability.
            total_steps (int): Total training steps.

        Returns:
            list[int], group index per step. It covers at least ``int(total_steps * 1.01)``
            steps; the continuous strategy appends whole periods, so its result can be longer.

        Raises:
            ValueError: If `freeze_strategy` is not a supported strategy.
        """
        total_step = int(total_steps * 1.01)
        if parameter_groups_number <= 1:
            return [0 for _ in range(total_step)]

        # local continuous freezing training strategy, as '00001234'
        if freeze_strategy == CONTINUOUS_STRATEGY:
            zero_cnt = int(
                freeze_p * (parameter_groups_number - 1) / (1 - freeze_p) + 0.5)
            period = [0] * zero_cnt + list(range(1, parameter_groups_number))
            freeze_idxes = []
            while len(freeze_idxes) < total_step:
                freeze_idxes += period
            return freeze_idxes

        # interval freezing training strategy, as '01020304'
        if freeze_strategy == INTERVAL_STRATEGY:
            index_all = list(range(1, parameter_groups_number))
            index_sum = sum(index_all)
            prob = [x / index_sum for x in index_all]
            # Candidates are reversed while probabilities stay ascending, so the
            # smallest index gets the largest weight.
            candidates = index_all[::-1]
            freeze_idxes = [0]
            zero_cnt = 1
            freeze_cnt = 0
            while len(freeze_idxes) < total_step:
                freeze_p_cur = 1.0 * freeze_cnt / (zero_cnt + freeze_cnt)
                if freeze_p_cur < 1 - freeze_p:
                    freeze_idxes.append(int(np.random.choice(candidates, p=prob)))
                    freeze_cnt += 1
                else:
                    freeze_idxes.append(0)
                    zero_cnt += 1
            return freeze_idxes

        raise ValueError(
            f"Unsupported freezing training strategy '{freeze_strategy}'")

    def freeze_generate(self, network, optimizer):
        r"""
        Generate freeze network and optimizer.

        Args:
            network (Cell): The training network.
            optimizer (Cell): Optimizer for updating the weights.

        Returns:
            tuple, the unchanged `network` and a ``FreezeOpt`` wrapping `optimizer`.
        """
        raw_groups = self.split_parameters_groups(network, self._param_groups)
        origin_params = optimizer.init_params['params']
        train_para_groups = [self._param_processer.generate_group_params(group, origin_params)
                             for group in raw_groups]
        train_strategy = self.generate_freeze_index_sequence(
            self._param_groups, self._freeze_type, self._freeze_p, self._total_steps)
        optimizer = FreezeOpt(optimizer, train_para_groups, train_strategy)
        return network, optimizer
def freeze_cell(reducer_flag, network, optimizer, sens, grad, use_grad_accumulation, mean=None, degree=None,
                max_accumulation_step=1):
    r"""
    Generate freeze network and optimizer.

    One training cell is built for every optimizer found in ``optimizer.opts``.

    Args:
        reducer_flag (bool): Reducer flag. When True each cell gets its own
            ``DistributedGradReducer``; otherwise gradients pass through unchanged.
        network (Cell): The training network.
        optimizer (Cell): Optimizer for updating the weights. Its ``opts`` attribute is iterated
            (``FreezeOpt`` provides it).
        sens (numbers.Number): The scaling number.
        grad (Callable): Gradient operation, such as ``ops.GradOperation`` in the example below.
        use_grad_accumulation (bool): Use gradient accumulation flag.
        mean (bool): Gradients mean flag. default: None.
        degree (int): Device number. default: None.
        max_accumulation_step (int): Max accumulation steps. default: 1.

    Returns:
        tuple, one freeze training cell per entry of ``optimizer.opts``, in the same order.

    Examples:
        >>> import numpy as np
        >>> from mindspore import Tensor, Parameter, nn
        >>> import mindspore.ops as ops
        >>> from mindspore.boost.grad_freeze import freeze_cell
        >>>
        >>> class Net(nn.Cell):
        ...     def __init__(self, in_features, out_features):
        ...         super(Net, self).__init__()
        ...         self.weight = Parameter(Tensor(np.ones([in_features, out_features]).astype(np.float32)),
        ...                                 name='weight')
        ...         self.matmul = ops.MatMul()
        ...
        ...     def construct(self, x):
        ...         output = self.matmul(x, self.weight)
        ...         return output
        ...
        >>> in_features, out_features = 16, 10
        >>> network = Net(in_features, out_features)
        >>> optimizer = nn.Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
        >>> grad = ops.GradOperation(get_by_list=True, sens_param=True)
        >>> freeze_nets = freeze_cell(False, network, optimizer, 1.0, grad, False, None, None, 1)
    """
    # NOTE(review): the docstring example passes a bare nn.Momentum, yet this function
    # reads `optimizer.opts`; confirm whether the example should wrap it in FreezeOpt.
    param_processer = ParameterProcess() if reducer_flag else None

    def reducer_for(opt):
        """Return the gradient reducer to pair with `opt`."""
        if not reducer_flag:
            return F.identity
        return DistributedGradReducer(param_processer.assign_parameter_group(opt.parameters),
                                      mean, degree)

    freeze_nets = tuple(_TrainFreezeCell(network, sens, grad, reducer_for(opt),
                                         use_grad_accumulation, opt, max_accumulation_step)
                        for opt in optimizer.opts)
    return freeze_nets