Source code for mindspore.train.callback._backup_and_restore

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""backup and restore related classes and functions."""
from __future__ import absolute_import

import os
import stat

from mindspore import log as logger
from mindspore.train.serialization import load_checkpoint, save_checkpoint
from mindspore.train.callback._callback import Callback
from mindspore.train._utils import _make_directory
from mindspore import _checkparam as Validator


[docs]class BackupAndRestore(Callback): """ Callback to back up and restore the parameters during training. Note: This function can only use in training. Args: backup_dir (str): Path to store and load the checkpoint file. save_freq(Union['epoch', int]): When set to 'epoch' the callback saves the checkpoint at the end of each epoch. When set to an integer, the callback saves the checkpoint every `save_freq` epoch. Default: ``"epoch"`` . delete_checkpoint(bool): If `delete_checkpoint=True`, the checkpoint will be deleted after training is finished. Default: ``True`` . Raises: ValueError: If backup_dir is not str. ValueError: If save_freq is not 'epoch' or int. ValueError: If delete_checkpoint is not bool. Examples: >>> from mindspore import nn >>> from mindspore.train import Model, BackupAndRestore, RunContext >>> >>> # Define the network structure of LeNet5. Refer to >>> # https://gitee.com/mindspore/docs/blob/r2.1/docs/mindspore/code/lenet.py >>> net = LeNet5() >>> loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') >>> optim = nn.Momentum(net.trainable_params(), 0.01, 0.9) >>> model = Model(net, loss_fn=loss, optimizer=optim) >>> # Create the dataset taking MNIST as an example. Refer to >>> # https://gitee.com/mindspore/docs/blob/r2.1/docs/mindspore/code/mnist.py >>> dataset = create_dataset() >>> backup_ckpt = BackupAndRestore("backup") >>> cb_params = {} >>> cb_params["cur_epoch_num"] = 4 >>> cb_params["epoch_num"] = 4 >>> cb_params["cur_step_num"] = 2 >>> cb_params["batch_num"] = 2 >>> cb_params["net_outputs"] = Tensor(2.0) >>> run_context = RunContext(cb_params) >>> backup_ckpt.on_train_begin(run_context) >>> backup_ckpt.on_train_epoch_end(run_context) >>> backup_ckpt.on_train_end(run_context) >>> model.train(10, dataset, callbacks=backup_ckpt) """ def __init__(self, backup_dir, save_freq="epoch", delete_checkpoint=True): super(BackupAndRestore, self).__init__() ckpt_dir = _make_directory(backup_dir) self.backup_file = os.path.join(ckpt_dir, 'backup.ckpt') if save_freq != "epoch": self.save_freq = Validator.check_positive_int(save_freq) else: self.save_freq = 1 self.delete_checkpoint = Validator.check_bool(delete_checkpoint)
[docs] def on_train_begin(self, run_context): """ Load the backup checkpoint file at the beginning of epoch. Args: run_context (RunContext): Context of the process running. For more details, please refer to :class:`mindspore.train.RunContext`. """ if os.path.exists(self.backup_file): cb_params = run_context.original_args() train_net = cb_params.train_network logger.info("Restore checkpoint file is {}, load checkpoint into train net.".format(self.backup_file)) load_checkpoint(self.backup_file, net=train_net)
[docs] def on_train_epoch_end(self, run_context): """ Backup checkpoint file at the end of train epoch. Args: run_context (RunContext): Context of the process running. For more details, please refer to :class:`mindspore.train.RunContext`. """ cb_params = run_context.original_args() cur_epoch_num = cb_params.cur_epoch_num if cur_epoch_num % self.save_freq == 0: train_net = cb_params.train_network logger.info("Train task end, backup checkpoint file: {}.".format(self.backup_file)) save_checkpoint(train_net, self.backup_file)
[docs] def on_train_end(self, run_context): """ Deleted checkpoint file at the end of train. Args: run_context (RunContext): Context of the process running. For more details, please refer to :class:`mindspore.train.RunContext`. """ run_context.original_args() cb_params = run_context.original_args() cur_epoch_num = cb_params.cur_epoch_num if self.delete_checkpoint: logger.info("Delete restore checkpoint file {} at {} epoch.".format(self.backup_file, cur_epoch_num)) os.chmod(self.backup_file, stat.S_IWRITE) os.remove(self.backup_file)