# Copyright 2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""OptTFTWrapper"""
from __future__ import absolute_import
import os
from mindspore.common.tensor import Tensor
from mindspore.nn.optim.optimizer import Optimizer
from mindspore.ops.operations.manually_defined._inner import TensorReport
from mindspore import ops, context


class OptTFTWrapper(Optimizer):
    r"""
    Implements the TFT optimizer wrapper, which reports status to MindIO TFT before the
    optimizer updates the parameters.

    Note:
        This optimizer depends on the MindIO TFT feature. Currently, only Ascend graph mode
        is supported, and sink_size must be less than 1.

    Args:
        opt (Optimizer): Must be a subclass of Optimizer.

    Inputs:
        - **gradients** (tuple[Tensor]) - The gradients of `opt`'s `params`; the shape is the
          same as that of `opt`'s `params`.

    Outputs:
        Tensor, the result of executing the optimizer `opt`.

    Raises:
        TypeError: If the parameter `opt` is not a subclass of Optimizer.
        ValueError: If the platform is not Ascend in graph mode, or the TFT feature is not
            switched on.

    Supported Platforms:
        ``Ascend``

    Examples:
        >>> import mindspore as ms
        >>> from mindspore import nn
        >>>
        >>> # Define the network structure of LeNet5. Refer to
        >>> # https://gitee.com/mindspore/docs/blob/r2.4.0/docs/mindspore/code/lenet.py
        >>> net = LeNet5()
        >>> # 1) All parameters use the same learning rate and weight decay
        >>> optim = nn.SGD(params=net.trainable_params())
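        >>> # The wrapper assumes MindIO TFT has been switched on in the environment
        >>> # (e.g. MS_ENABLE_TFT='{TTP:1,UCE:1}') and that the context is Ascend
        >>> # graph mode; otherwise the checks in __init__ below raise ValueError.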
        >>> optim_wrapper = nn.OptTFTWrapper(optim)
        >>>
        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
        >>> model = ms.train.Model(net, loss_fn=loss, optimizer=optim_wrapper)
    """

    def __init__(self, opt, **kwargs):
        if not isinstance(opt, Optimizer):
            raise TypeError(f"For 'OptTFTWrapper', the argument 'opt' must be Optimizer type, "
                            f"but got {type(opt)}.")
        super(OptTFTWrapper, self).__init__(opt.learning_rate, opt._parameters)  # pylint: disable=W0212
        tft_env = os.getenv("MS_ENABLE_TFT", "")
        if ("TTP:1" not in tft_env) and ("UCE:1" not in tft_env):
            raise ValueError("MindIO TFT register needs the user to switch it on "
                             "[MS_ENABLE_TFT='{TTP:1,UCE:1}']!")
        mode = context.get_context("mode")
        device_target = context.get_context("device_target")
        if device_target != "Ascend" or mode != context.GRAPH_MODE:
            raise ValueError("The MindIO adapter is only supported on Ascend devices in GRAPH mode!")
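        # Report path: `g_one` is a small status tensor that is made data-dependent on the
        # incoming gradients (via Depend) and reported to MindIO TFT through TensorReport,
        # so the report happens after the gradients are ready and before `opt` applies them.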
        self.opt = opt
        self.report = TensorReport()
        self.depend = ops.Depend()
        self.g_one = Tensor([0.1])
        # The consistency check is enabled by default and disabled only when
        # `enable_consistent_check` is passed as False.
        self.use_allreduce = kwargs.get("enable_consistent_check", True)
        if self.use_allreduce:
            self.allreduce_sum = ops.AllReduce()
            self.allreduce_sum.add_prim_attr("tft_report_before", True)
        # Mirror the wrapped optimizer's configuration so this wrapper behaves identically.
        self.param_rank = opt.param_rank
        self.optim_filter = opt.optim_filter
        self.loss_scale = opt.loss_scale
        self.dynamic_weight_decay = opt.dynamic_weight_decay
        self.grad_centralization = opt.grad_centralization
        self.dynamic_lr = opt.dynamic_lr
        self.global_step = opt.global_step
        self.is_group = opt.is_group
        self.is_group_lr = opt.is_group_lr
        self.is_group_params_ordered = opt.is_group_params_ordered
        self.use_parallel = opt.use_parallel
        if self.is_group:
            self.group_params = opt.group_params
            self.group_lr = opt.group_lr
            self.group_weight_decay = opt.group_weight_decay
            self.group_grad_centralization = opt.group_grad_centralization
            self.grad_centralization_flags = opt.grad_centralization_flags
        self.skip_auto_parallel_compile = opt.skip_auto_parallel_compile
        self.learning_rate = opt.learning_rate
        self.parameters = opt.parameters
        self.decay_flags = opt.decay_flags
        self.dynamic_decay_flags = opt.dynamic_decay_flags
        self.weight_decay = opt.weight_decay
        self.exec_weight_decay = opt.exec_weight_decay
        self.ps_parameters = opt.ps_parameters
        self.cache_enable = opt.cache_enable
        self.reciprocal_scale = opt.reciprocal_scale
        self.need_scale = opt.need_scale
        self.global_step_increase_tensor = opt.global_step_increase_tensor
        self.param_length = opt.param_length
        self.enable_tuple_broaden = opt.enable_tuple_broaden
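
    # construct(): tie the report tensor to the incoming gradients, optionally all-reduce it
    # across ranks for the consistency check, report it to MindIO TFT, and only then run the
    # wrapped optimizer on the unchanged gradients.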
    def construct(self, gradients):
        g_one = self.depend(self.g_one, gradients)
        if self.use_allreduce:
            g_one_res = self.allreduce_sum(g_one)
        else:
            g_one_res = g_one
        self.report("tft_report", g_one_res)
        return self.opt(gradients)
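

# A minimal usage sketch (hypothetical names such as `net`, `loss` and `dataset` are
# placeholders, following the docstring example above): keep data sinking within the Note's
# sink_size constraint, for example by disabling dataset sink mode.
#
#     optim_wrapper = OptTFTWrapper(nn.SGD(net.trainable_params()))
#     model = ms.train.Model(net, loss_fn=loss, optimizer=optim_wrapper)
#     model.train(epoch=1, train_dataset=dataset, dataset_sink_mode=False)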