Source code for mindspore.train.callback._early_stop

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""ReduceLROnPlateau Callback class."""
from __future__ import absolute_import
from __future__ import division

import copy
import numpy as np

from mindspore import ops, nn
from mindspore.common.tensor import Tensor
from mindspore import _checkparam as Validator
from mindspore.train.serialization import load_param_into_net
from mindspore import log as logger
from mindspore.ops import ReduceOp
from mindspore.communication import get_group_size
from mindspore.context import ParallelMode
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.train.callback._callback import Callback, _handle_loss


_smaller_better_metrics = ['hausdorff_distance', 'mae', 'mse', 'loss', 'perplexity',
                           'mean_surface_distance', 'root_mean_square_distance', 'eval_loss']


[docs]class EarlyStopping(Callback): """ Stop training when a monitored metric has stopped improving. Assuming `monitor` is "accuracy", with this, `mode` would be "max" since goal of trianing is to maximize the accuracy, the `model.fit()` training loop will check at end of epoch whether the accuracy is no longer increasing, considering the `min_delta` and `patience` if applicable. Once it's found no longer increasing, `run_context.request_stop()` will be called and the training terminates. Args: monitor (str): quantity to be monitored. If evaluation is performed on the end of train epochs, the valid monitors can be "loss", "eval_loss" or metric names passed when instantiate the `Model`; otherwise the valid monitor is "loss". When monitor is "loss", if train network has multiple outputs, the first element will be returned as training loss. Default: ``'eval_loss'`` . patience (int): `monitor` value is better than history best value over `min_delta` is seen as improvement, `patience` is number of epochs with no improvement that would be waited. When the waiting counter `self.wait` is larger than or equal to `patience`, the training process will be stopped. Default: ``0`` . verbose (bool): If False: quiet, if True: print related information. Default: ``False`` . mode (str): one of `{'auto', 'min', 'max'}`. In "min" mode, the learning rate will be reduced when the quantity monitored has stopped decreasing; in "max" mode it will be reduced when the quantity monitored has stopped increasing; in "auto" mode, the direction is automatically inferred from the name of the monitored quantity. Default: ``'auto'`` . min_delta (float): threshold for measuring the new optimum, to only focus on significant changes. Default: ``0`` . baseline (float): Baseline value for the monitor. When the monitor value shows improvement over the history best value and the baseline, the internal wait counter will be set to zero. Default: ``None`` . restore_best_weights (bool): Whether to restore model weights from the epoch with the best value of the monitored quantity. If False, the model weights obtained at the last step of training are used. Default: ``False`` . Raises: ValueError: `mode` not in 'auto', 'min' or 'max'. ValueError: The monitor value is not a scalar. Examples: >>> from mindspore import nn >>> from mindspore.train import Model, EarlyStopping >>> # Define the network structure of LeNet5. Refer to >>> # https://gitee.com/mindspore/docs/blob/r2.3.0rc2/docs/mindspore/code/lenet.py >>> net = LeNet5() >>> loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') >>> optim = nn.Momentum(net.trainable_params(), 0.01, 0.9) >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics={"acc"}) >>> # Create the dataset taking MNIST as an example. Refer to >>> # https://gitee.com/mindspore/docs/blob/r2.3.0rc2/docs/mindspore/code/mnist.py >>> dataset = create_dataset() >>> cb = EarlyStopping(monitor="acc", patience=3, verbose=True) >>> model.fit(10, dataset, callbacks=cb) """ def __init__(self, monitor='eval_loss', min_delta=0, patience=0, verbose=False, mode='auto', baseline=None, restore_best_weights=False): super(EarlyStopping, self).__init__() self.monitor = Validator.check_value_type('monitor', monitor, str) min_delta = Validator.check_value_type("min_delta", min_delta, [float, int]) self.min_delta = abs(min_delta) self.patience = Validator.check_non_negative_int(patience) self.verbose = Validator.check_bool(verbose) self.mode = Validator.check_value_type('mode', mode, str) self.baseline = Validator.check_value_type("min_delta", min_delta, [float, int]) if baseline else None self.restore_best_weights = Validator.check_bool(restore_best_weights) self.wait = 0 self.stopped_epoch = 0 self.best_weights_param_dict = None self._reduce = ValueReduce() if self.mode not in ['auto', 'min', 'max']: raise ValueError("mode should be 'auto', 'min' or 'max', but got %s." % self.mode) if self.mode == 'min' or (self.mode == 'auto' and self.monitor in _smaller_better_metrics): self.is_improvement = lambda a, b: np.less(a, b-self.min_delta) self.best = np.Inf else: self.is_improvement = lambda a, b: np.greater(a, b+self.min_delta) self.best = -np.Inf
[docs] def on_train_begin(self, run_context): """ Initialize variables at the begin of training. Args: run_context (RunContext): Context information of the model. For more details, please refer to :class:`mindspore.train.RunContext`. """ self.wait = 0 self.stopped_epoch = 0 if self.mode == 'min' or (self.mode == 'auto' and self.monitor in _smaller_better_metrics): self.best = np.Inf else: self.best = -np.Inf self.best_weights_param_dict = None
[docs] def on_train_epoch_end(self, run_context): """ monitors the training process and if no improvement is seen for a 'patience' number of epochs, the training process will be stopped. Args: run_context (RunContext): Context information of the model. For more details, please refer to :class:`mindspore.train.RunContext`. """ cb_params = run_context.original_args() cur_epoch = cb_params.get("cur_epoch_num") current_value = self._get_monitor_value(cb_params) parallel_mode = auto_parallel_context().get_parallel_mode() rank_size = 1 if parallel_mode == ParallelMode.STAND_ALONE else get_group_size() if rank_size == 1: current = current_value else: current = self._reduce(Tensor(current_value.astype(np.float32))) / rank_size if current is None: return if self.restore_best_weights and self.best_weights_param_dict is None: self.best_weights_param_dict = copy.deepcopy(cb_params.train_network.parameters_dict()) self.wait += 1 if self.is_improvement(current, self.best): self.best = current if self.restore_best_weights: self.best_weights_param_dict = copy.deepcopy(cb_params.train_network.parameters_dict()) if self.baseline is None or self.is_improvement(current, self.baseline): self.wait = 0 if self.wait >= self.patience: self.stopped_epoch = cur_epoch run_context.request_stop() if self.restore_best_weights and self.best_weights_param_dict is not None: if self.verbose: print('Restoring model weights from the end of the best epoch.') load_param_into_net(cb_params.train_network, self.best_weights_param_dict)
[docs] def on_train_end(self, run_context): """ If verbose is True, print the stopped epoch. Args: run_context (RunContext): Context information of the model. For more details, please refer to :class:`mindspore.train.RunContext`. """ if self.stopped_epoch > 0 and self.verbose: print('Epoch %05d: early stopping' % (self.stopped_epoch))
def _get_monitor_value(self, cb_params): """ Get the monitor value at the end of epoch during training. If `mindspore.train.callback.ReduceLROnPlateau` used with `model.train`, no evaluation process during training, only monitor="loss" is valid; if it used with `model.fit`, evaluation process will be performed at the end of epoch, valid monitor is "loss", "eval_loss" and metrics passed to `Model`. Args: cb_params (dict): A dictionary stores context information of the model. For more details, please refer to :class:`mindspore.train.RunContext`. """ monitor_candidates = {} if self.monitor == "loss": loss = cb_params.get("net_outputs") monitor_value = _handle_loss(loss) if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)): logger.warning("Invalid %s.", self.monitor) else: monitor_candidates = cb_params.get("eval_results", {}) monitor_value = monitor_candidates.get(self.monitor) if monitor_value is None: support_keys = set(["loss"] + list(monitor_candidates.keys())) logger.warning('Early stopping is conditioned on %s, ' 'which is not available. Available choices are: %s', self.monitor, support_keys) if isinstance(monitor_value, np.ndarray) and monitor_value.shape != (): raise ValueError("EarlyStopping only supports scalar monitor now.") return np.array(monitor_value) if monitor_value else None
class ValueReduce(nn.Cell): """ Reduces the tensor data across all devices, all devices will get the same final result. For more details, please refer to :class:`mindspore.ops.AllReduce`. """ def __init__(self): super(ValueReduce, self).__init__() self.allreduce = ops.AllReduce(ReduceOp.SUM) def construct(self, x): return self.allreduce(x).asnumpy()