Source code for sponge.optimizer.steepest

"""
Optimizer used to find the minimum value of a given function.
"""

from typing import Union, List, Iterable
from mindspore import Parameter, Tensor
from mindspore.nn.optim.optimizer import Optimizer, opt_init_args_register
from mindspore.ops import functional as F
from mindspore.ops import composite as C
try:
    # MindSpore 2.X
    from mindspore import jit
except ImportError:
    # MindSpore 1.X
    from mindspore import ms_function as jit
from mindspore.nn.learning_rate_schedule import LearningRateSchedule
from mindspore import _checkparam as validator


_gd_opt = C.MultitypeFuncGraph("sd_opt")
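# `_gd_opt` dispatches to one of the overloads registered below according to the
# argument types: the 3-argument form applies the plain gradient descent update,
# while the 4-argument form additionally clips the per-step shift.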


@_gd_opt.register("Tensor", "Tensor", "Tensor")
def _gradient_descent(learning_rate, gradient, weight):
    r"""
    Apply the gradient descent update to the weight parameter using a Tensor learning rate.

    Args:
        learning_rate (Tensor): The learning rate value.
        gradient (Tensor): The gradient of the weight parameter.
        weight (Tensor): The weight parameter.

    Returns:
        bool, whether the operation is successful.
    """
    success = True
    success = F.depend(success, F.assign_add(weight, -gradient * learning_rate))
    return success
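
# Illustrative sketch (not part of the module, assumes MindSpore and NumPy are available):
# the overload above performs the plain steepest-descent update w <- w - lr * g.
# An equivalent eager-mode update would be:
#
#     import numpy as np
#     import mindspore as ms
#
#     w = ms.Parameter(ms.Tensor(np.ones(3), ms.float32), name="w")
#     g = ms.Tensor(np.full(3, 0.5), ms.float32)
#     lr = ms.Tensor(0.1, ms.float32)
#     ms.ops.assign_add(w, -g * lr)   # w becomes [0.95, 0.95, 0.95]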


@_gd_opt.register("Tensor", "Float32", "Tensor", "Tensor")
def _gradient_descent_with_shift(learning_rate, shift, gradient, weight):
    r"""
    Apply sgd optimizer to the weight parameter using Tensor.

    Args:
        learning_rate (Tensor): The learning rate value.
        shift (float): The shift value.
        gradient (Tensor): The gradient of the weight parameter.
        weight (Tensor): The weight parameter.

    Returns:
        bool, whether the operation is successful."""
    success = True
    origin_shift = -gradient * learning_rate
    success = F.depend(success, F.assign_add(weight, origin_shift.clip(-shift, shift)))
    return success
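
# Illustrative sketch (not part of the module, assumes MindSpore and NumPy are available):
# with a shift bound, the applied displacement is clip(-lr * g, -shift, shift), so no
# parameter element moves farther than `shift` in a single step:
#
#     import numpy as np
#     import mindspore as ms
#
#     w = ms.Parameter(ms.Tensor(np.zeros(3), ms.float32), name="w")
#     g = ms.Tensor(np.array([100.0, -100.0, 1.0]), ms.float32)
#     lr, shift = ms.Tensor(0.1, ms.float32), 0.05
#     ms.ops.assign_add(w, (-g * lr).clip(-shift, shift))   # w becomes [-0.05, 0.05, -0.05]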


class SteepestDescent(Optimizer):
    """
    Implements the steepest descent (gradient descent) algorithm.

    Note:
        If parameters are not grouped, the `weight_decay` in optimizer will be applied on the network
        parameters without 'beta' or 'gamma' in their names. Users can group parameters to change the
        strategy of decaying weight. When parameters are grouped, each group can set `weight_decay`.
        If not, the `weight_decay` in optimizer will be applied.

    Args:
        params (Union[list[`mindspore.Parameter`], list[dict]]): Must be list of `Parameter` or list of `dict`.
            When the `params` is a list of `dict`, the strings "params", "lr", "grad_centralization" and
            "order_params" are the keys that can be parsed.

            - params: Required. Parameters in current group. The value must be a list of `Parameter`.

            - lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used.
              If not, the `learning_rate` in optimizer will be used. Fixed and dynamic learning rate are supported.

            - weight_decay: Using different `weight_decay` by grouping parameters is currently not supported.

            - grad_centralization: Optional. Must be Boolean. If "grad_centralization" is in the keys, the set
              value will be used. If not, the `grad_centralization` is ``False`` by default.
              This configuration only works on the convolution layer.

            - order_params: Optional. When parameters are grouped, this is usually used to maintain the order of
              parameters that appeared in the network to improve performance. The value should be parameters
              whose order will be followed in optimizer. If `order_params` is in the keys, other keys will be
              ignored and the elements of 'order_params' must be in one group of `params`.

        learning_rate (Union[float, int, Tensor, Iterable, LearningRateSchedule], optional):

            - float: The fixed learning rate value. Must be equal to or greater than ``0``.

            - int: The fixed learning rate value. Must be equal to or greater than ``0``.
              It will be converted to float.

            - Tensor: Its value should be a scalar or a 1-D vector. For scalar, fixed learning rate will be
              applied. For vector, learning rate is dynamic, then the i-th step will take the i-th value as
              the learning rate.

            - Iterable: Learning rate is dynamic. The i-th step will take the i-th value as the learning rate.

            - `mindspore.nn.LearningRateSchedule`: Learning rate is dynamic. During training, the optimizer
              calls the instance of LearningRateSchedule with step as the input to get the learning rate of
              the current step.

        weight_decay (Union[float, int], optional): An int or a floating point value for the weight decay.
            It must be equal to or greater than ``0``. If the type of `weight_decay` input is int,
            it will be converted to float. Default: ``0.0``.
        loss_scale (float, optional): A floating point value for the loss scale. It must be greater than ``0``.
            If the type of `loss_scale` input is int, it will be converted to float. In general, use the
            default value. Only when `mindspore.amp.FixedLossScaleManager` is used for training and the
            `drop_overflow_update` in `mindspore.amp.FixedLossScaleManager` is set to ``False``, this value
            needs to be the same as the `loss_scale` in `mindspore.amp.FixedLossScaleManager`. Refer to class
            `mindspore.amp.FixedLossScaleManager` for more details. Default: ``1.0``.
        max_shift (float, optional): A floating point value for the max shift. It must be greater than ``0``.
            It bounds the shift applied to the parameters at each iteration of the optimizer. If `max_shift`
            is ``None``, the shift is left unbounded; if `max_shift` is a float, the shift is clipped to the
            range [-max_shift, max_shift]. Default: ``None``.

    Inputs:
        - **gradients** (Tensor) - The gradients of the parameters.

    Outputs:
        - **success** (bool) - Whether the operation is successful.

    Raises:
        TypeError: If `learning_rate` is not one of int, float, Tensor, Iterable, LearningRateSchedule.
        TypeError: If element of `parameters` is neither Parameter nor dict.
        TypeError: If `loss_scale` is not a float.
        TypeError: If `weight_decay` is neither float nor int.
        ValueError: If `loss_scale` is less than or equal to ``0``.
        ValueError: If `weight_decay` is less than ``0``.
        ValueError: If `learning_rate` is a Tensor, but the dimension of the tensor is greater than ``1``.

    Supported Platforms:
        ``Ascend`` ``GPU`` ``CPU``

    Examples:
        >>> from sponge import Sponge, Molecule, ForceField
        >>> from sponge.optimizer import SteepestDescent
        >>> system = Molecule(template='water.tip3p.yaml')
        >>> potential = ForceField(system, parameters='SPCE')
        >>> optim = SteepestDescent(params=system.trainable_params(), learning_rate=1e-7)
        >>> print(system.coordinate.value())
        >>> # [[[ 0.          0.          0.        ]
        >>> #   [ 0.07907964  0.06120793  0.        ]
        >>> #   [-0.07907964  0.06120793  0.        ]]]
        >>> md = Sponge(system, potential, optim)
        >>> md.run(1000)
        >>> # [MindSPONGE] Started simulation at 2024-04-29 01:00:42
        >>> # [MindSPONGE] Finished simulation at 2024-04-29 01:00:44
        >>> # [MindSPONGE] Simulation time: 2.02 seconds.
        >>> print(system.coordinate.value())
        >>> # [[[ 5.3361070e-12  2.3146218e-03  0.0000000e+00]
        >>> #   [ 8.1648827e-02  6.0050689e-02  0.0000000e+00]
        >>> #   [-8.1648827e-02  6.0050689e-02  0.0000000e+00]]]
    """

    @opt_init_args_register
    def __init__(self,
                 params: Union[List[Parameter], List[dict]],
                 learning_rate: Union[float, int, Tensor, Iterable, LearningRateSchedule] = 1e-03,
                 weight_decay: Union[float, int] = 0.0,
                 loss_scale: float = 1.0,
                 max_shift: float = None
                 ):
        super().__init__(
            parameters=params,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            loss_scale=loss_scale,
        )

        if max_shift is None:
            self.max_shift = None
        else:
            if isinstance(max_shift, int):
                max_shift = float(max_shift)
            validator.check_value_type("max_shift", max_shift, [float], self.cls_name)
            validator.check_positive_float(max_shift, "max_shift", self.cls_name)
            self.max_shift = max_shift

    @jit
    def construct(self, gradients):
        r"""
        Update the parameters by the gradients.

        Args:
            gradients (Tensor): The gradients of the parameters.

        Returns:
            bool, whether the operation is successful.
        """
        params = self._parameters
        gradients = self.flatten_gradients(gradients)
        gradients = self.gradients_centralization(gradients)
        gradients = self.scale_grad(gradients)
        lr = self.get_lr()

        if self.is_group_lr:
            if self.max_shift is not None:
                success = self.hyper_map_reverse(
                    F.partial(_gd_opt), lr, self.max_shift, gradients, params)
            else:
                success = self.hyper_map_reverse(F.partial(_gd_opt), lr, gradients, params)
        elif self.max_shift is not None:
            success = self.hyper_map_reverse(
                F.partial(_gd_opt, lr, self.max_shift), gradients, params)
        else:
            success = self.hyper_map_reverse(F.partial(_gd_opt, lr), gradients, params)

        return success
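
# Illustrative sketch (not part of the module): a minimal, hedged example of driving
# `SteepestDescent` directly with a plain MindSpore cell instead of a Sponge simulation.
# The quadratic cell, its parameter name, and the loop below are assumptions made for
# illustration only; they do not come from MindSPONGE itself.
#
#     import mindspore as ms
#     from mindspore import nn
#
#     class Quadratic(nn.Cell):
#         # f(x) = sum(x**2), minimized at x = 0.
#         def __init__(self):
#             super().__init__()
#             self.x = ms.Parameter(ms.Tensor([3.0], ms.float32), name="x")
#
#         def construct(self):
#             return (self.x * self.x).sum()
#
#     net = Quadratic()
#     opt = SteepestDescent(params=net.trainable_params(), learning_rate=0.1, max_shift=0.5)
#     grad_fn = ms.grad(net, grad_position=None, weights=net.trainable_params())
#     for _ in range(50):
#         opt(grad_fn())   # each step moves x by at most `max_shift` towards the minimum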