# Copyright 2021-2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Process data and Calc loss landscape."""
from __future__ import absolute_import
import os
import time
import json
import stat
import shutil
import numbers
from collections import defaultdict, namedtuple
from concurrent.futures import wait, ALL_COMPLETED, ProcessPoolExecutor
import numpy as np
from scipy import linalg, sparse
from mindspore import log as logger
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.summary_pb2 import LossLandscape
from mindspore.train.summary import SummaryRecord
from mindspore.train.summary.enums import PluginEnum
from mindspore.train.anf_ir_pb2 import DataType
from mindspore.train._utils import check_value_type, _make_directory
from mindspore.train.dataset_helper import DatasetHelper
from mindspore.train.metrics import get_metrics
from mindspore import context
# if there is no path, you need to set to empty list
Points = namedtuple("Points", ["x", "y", "z"])
def nptype_to_prototype(np_value):
"""
Transform the np type to proto type.
Args:
np_value (Type): Numpy data type.
Returns:
Type, proto data type.
"""
np2pt_tbl = {
np.bool_: 'DT_BOOL',
np.int8: 'DT_INT8',
np.int16: 'DT_INT16',
np.int32: 'DT_INT32',
np.int64: 'DT_INT64',
np.uint8: 'DT_UINT8',
np.uint16: 'DT_UINT16',
np.uint32: 'DT_UINT32',
np.uint64: 'DT_UINT64',
np.float16: 'DT_FLOAT16',
np.float: 'DT_FLOAT64',
np.float32: 'DT_FLOAT32',
np.float64: 'DT_FLOAT64',
None: 'DT_UNDEFINED'
}
if np_value is None:
return None
np_type = np_value.dtype.type
proto = np2pt_tbl.get(np_type, None)
if proto is None:
raise TypeError("No match for proto data type.")
return proto
def fill_array_to_tensor(np_value, summary_tensor):
"""
Package the tensor summary.
Args:
np_value (Type): Summary data type.
summary_tensor (Tensor): The tensor of summary.
Returns:
Summary, return tensor summary content.
"""
# get tensor dtype
tensor_dtype = nptype_to_prototype(np_value)
summary_tensor.data_type = DataType.Value(tensor_dtype)
# get the value list
tensor_value_list = np_value.reshape(-1).tolist()
summary_tensor.float_data.extend(tensor_value_list)
# get the tensor dim
for vector in np_value.shape:
summary_tensor.dims.append(vector)
return summary_tensor
def transfer_tensor_to_tuple(inputs):
"""
If the input is a tensor, convert it to a tuple. If not, the output is unchanged.
"""
if isinstance(inputs, Tensor):
return (inputs,)
return inputs
class Landscape:
"""Return loss landscape."""
def __init__(self,
intervals,
decomposition,
landscape_points: Points,
convergence_point=None,
path_points=None):
self.landscape_points = landscape_points
self.decomposition = decomposition
self.intervals = intervals
self.num_samples = 2048
self.convergence_point = convergence_point
self.path_points = path_points
self.unit = 'step'
self.step_per_epoch = 1
def set_convergence_point(self, convergence_point: Points):
"""Set the convergence point."""
self.convergence_point = convergence_point
def transform_to_loss_landscape_msg(self, landscape_data):
"""Transform to loss landscape_msg."""
landscape_msg = LossLandscape()
# only save one dim in x and y
fill_array_to_tensor(landscape_data.landscape_points.x[0], landscape_msg.landscape.x)
fill_array_to_tensor(landscape_data.landscape_points.y[:, 0], landscape_msg.landscape.y)
fill_array_to_tensor(landscape_data.landscape_points.z, landscape_msg.landscape.z)
if landscape_data.path_points:
landscape_msg.loss_path.intervals.extend(landscape_data.intervals)
fill_array_to_tensor(landscape_data.path_points.x, landscape_msg.loss_path.points.x)
fill_array_to_tensor(landscape_data.path_points.y, landscape_msg.loss_path.points.y)
fill_array_to_tensor(landscape_data.path_points.z, landscape_msg.loss_path.points.z)
if landscape_data.convergence_point:
fill_array_to_tensor(landscape_data.convergence_point.x, landscape_msg.convergence_point.x)
fill_array_to_tensor(landscape_data.convergence_point.y, landscape_msg.convergence_point.y)
fill_array_to_tensor(landscape_data.convergence_point.z, landscape_msg.convergence_point.z)
landscape_msg.metadata.decomposition = landscape_data.decomposition
landscape_msg.metadata.unit = self.unit
landscape_msg.metadata.step_per_epoch = self.step_per_epoch
return landscape_msg
[docs]class SummaryLandscape:
"""
SummaryLandscape can help you to collect loss landscape information.
It can create landscape in PCA direction or random direction by calculating loss.
Note:
SummaryLandscape only supports Linux systems.
Args:
summary_dir (str): The path of summary is used to save the model weight,
metadata and other data required to create landscape.
Examples:
>>> import mindspore as ms
>>> import mindspore.nn as nn
>>> from mindspore.train import Model, Accuracy, Loss
>>> from mindspore import SummaryCollector, SummaryLandscape
>>>
>>> if __name__ == '__main__':
... # If the device_target is Ascend, set the device_target to "Ascend"
... ms.set_context(mode=ms.GRAPH_MODE, device_target="GPU")
... # Create the dataset taking MNIST as an example. Refer to
... # https://gitee.com/mindspore/docs/blob/r2.3.0rc2/docs/mindspore/code/mnist.py
... ds_train = create_dataset()
... # Define the network structure of LeNet5. Refer to
... # https://gitee.com/mindspore/docs/blob/r2.3.0rc2/docs/mindspore/code/lenet.py
... network = LeNet5()
... net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
... net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
... model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
... # Simple usage for collect landscape information:
... interval_1 = [1, 2, 3, 4, 5]
... summary_collector = SummaryCollector(summary_dir='./summary/lenet_interval_1',
... collect_specified_data={'collect_landscape':{"landscape_size": 4,
... "unit": "step",
... "create_landscape":{"train":True,
... "result":False},
... "num_samples": 2048,
... "intervals": [interval_1]}
... })
... model.train(1, ds_train, callbacks=[summary_collector], dataset_sink_mode=False)
...
... # Simple usage for visualization landscape:
... def callback_fn():
... # Define the network structure of LeNet5. Refer to
... # https://gitee.com/mindspore/docs/blob/r2.3.0rc2/docs/mindspore/code/lenet.py
... network = LeNet5()
... net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
... metrics = {"Loss": Loss()}
... model = Model(network, net_loss, metrics=metrics)
... # Create the dataset taking MNIST as an example. Refer to
... # https://gitee.com/mindspore/docs/blob/r2.3.0rc2/docs/mindspore/code/mnist.py
... ds_eval = create_dataset()
... return model, network, ds_eval, metrics
...
... summary_landscape = SummaryLandscape('./summary/lenet_interval_1')
... # parameters of collect_landscape can be modified or unchanged
... summary_landscape.gen_landscapes_with_multi_process(callback_fn,
... collect_landscape={"landscape_size": 4,
... "create_landscape":{"train":False,
... "result":False},
... "num_samples": 2048,
... "intervals": [interval_1]},
... device_ids=[1])
"""
def __init__(self, summary_dir):
self._summary_dir = os.path.realpath(summary_dir)
self._ckpt_dir = os.path.join(self._summary_dir, 'ckpt_dir')
_make_directory(self._ckpt_dir)
# save the model params file, key is epoch, value is the ckpt file path
self._model_params_file_map = {}
self._epoch_group = defaultdict(list)
self._metric_fns = None
def _get_model_params(self, epochs):
"""Get the model params."""
parameters = []
for epoch in epochs:
file_path = self._model_params_file_map.get(str(epoch))
parameters.append(list(load_checkpoint(file_path).values()))
return parameters
def _create_epoch_group(self, intervals):
for i, interval in enumerate(intervals):
for j in interval:
self._epoch_group[i].append(j)
[docs] def clean_ckpt(self):
"""
Clean the checkpoint.
Tutorial Examples:
- `Training Optimization Process Visualization
<https://www.mindspore.cn/mindinsight/docs/en/master/landscape.html>`_
"""
shutil.rmtree(self._ckpt_dir, ignore_errors=True)
[docs] def gen_landscapes_with_multi_process(self, callback_fn, collect_landscape=None,
device_ids=None, output=None):
"""
Use the multi process to generate landscape.
Args:
callback_fn (python function): A python function object. User needs to write a function,
it has no input, and the return requirements are as follows.
- mindspore.train.Model: User's model object.
- mindspore.nn.Cell: User's network object.
- mindspore.dataset: User's dataset object for create loss landscape.
- mindspore.train.Metrics: User's metrics object.
collect_landscape (Union[dict, None]): The meaning of the parameters
when creating loss landscape is consistent with the fields
with the same name in SummaryCollector. The purpose of setting here
is to allow users to freely modify creating parameters. Default: ``None`` .
- landscape_size (int): Specify the image resolution of the generated loss landscape.
For example, if it is set to ``128`` , the resolution of the landscape is 128 * 128.
The calculation time increases with the increase of resolution.
Default: ``40`` . Optional values: between 3 and 256.
- create_landscape (dict): Select how to create loss landscape.
Training process loss landscape(train) and training result loss landscape(result).
Default: ``{"train": True, "result": True}``. Optional: ``True`` / ``False`` .
- num_samples (int): The size of the dataset used to create the loss landscape.
For example, in image dataset, You can set num_samples is 2048,
which means that 2048 images are used to create loss landscape.
Default: ``2048`` .
- intervals (List[List[int]]): Specifies the interval
in which the loss landscape. For example: If the user wants to
create loss landscape of two training processes, they are 1-5 epoch
and 6-10 epoch respectively. They can set [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]].
Note: Each interval have at least three epochs.
device_ids (List(int)): Specifies which devices are used to create loss landscape.
For example: [0, 1] refers to creating loss landscape with device 0 and device 1.
Default: ``None`` .
output (str): Specifies the path to save the loss landscape.
Default: ``None`` . The default save path is the same as the summary file.
"""
executor = None
if len(device_ids) > 1:
executor = ProcessPoolExecutor(len(device_ids))
futures = [executor.submit(self._set_context, i) for i in device_ids]
wait(futures, return_when=ALL_COMPLETED)
output_path = os.path.realpath(output) if output is not None else self._summary_dir
summary_record = SummaryRecord(output_path)
self._check_device_ids(device_ids)
if collect_landscape is not None:
try:
self._check_collect_landscape_data(collect_landscape)
except (ValueError, TypeError) as err:
summary_record.close()
raise err
json_path = os.path.join(self._ckpt_dir, 'train_metadata.json')
if not os.path.exists(json_path):
summary_record.close()
raise FileNotFoundError(f'For "{self.__class__.__name__}", '
f'train_metadata.json file path of {json_path} not exists.')
with open(json_path, 'r') as file:
data = json.load(file)
for key, value in collect_landscape.items():
if key in data.keys():
data[key] = value
if "intervals" in collect_landscape.keys():
self._create_epoch_group(collect_landscape.get("intervals"))
data["epoch_group"] = self._epoch_group
with open(json_path, 'w') as file:
json.dump(data, file)
os.chmod(json_path, stat.S_IRUSR)
for interval, landscape in self._list_landscapes(callback_fn=callback_fn, executor=executor,
device_ids=device_ids):
summary_record.add_value(PluginEnum.LANDSCAPE.value, f'landscape_{str(interval)}', landscape)
summary_record.record(0)
summary_record.flush()
summary_record.close()
def _list_landscapes(self, callback_fn, executor=None, device_ids=None):
"""Create landscape with single device and list all landscape."""
if not os.path.exists(os.path.join(self._ckpt_dir, 'train_metadata.json')):
raise FileNotFoundError(f'For "{self.__class__.__name__}", train_metadata.json file does not exist '
f'under the path, please use summary_collector to collect information to '
f'create the json file')
with open(os.path.join(self._ckpt_dir, 'train_metadata.json'), 'r') as file:
data = json.load(file)
self._check_json_file_data(data)
self._epoch_group = data['epoch_group']
self._model_params_file_map = data['model_params_file_map']
kwargs = dict(proz=0.2, landscape_size=data['landscape_size'], device_ids=device_ids, callback_fn=callback_fn)
start = time.time()
kwargs['executor'] = executor
if data['create_landscape']['train']:
for i, epochs in enumerate(self._epoch_group.values()):
self._log_message(data['create_landscape'], index=i, interval=epochs)
kwargs['epochs'] = epochs
mid_time = time.time()
landscape_data = self._create_landscape_by_pca(**kwargs)
logger.info("Create landscape end, use time: %s s." % (round(time.time() - mid_time, 6)))
landscape_data.unit = data['unit']
landscape_data.step_per_epoch = data['step_per_epoch']
landscape_data.num_samples = data['num_samples']
yield [epochs[0], epochs[-1]], landscape_data.transform_to_loss_landscape_msg(landscape_data)
if data['create_landscape']['result']:
final_epochs = [list(self._epoch_group.values())[-1][-1]]
self._log_message(data['create_landscape'], final_epochs=final_epochs)
kwargs['epochs'] = final_epochs
mid_time = time.time()
landscape_data = self._create_landscape_by_random(**kwargs)
logger.info("Create landscape end, use time: %s s." % (round(time.time() - mid_time, 6)))
landscape_data.unit = data['unit']
landscape_data.step_per_epoch = data['step_per_epoch']
landscape_data.num_samples = data['num_samples']
yield final_epochs, landscape_data.transform_to_loss_landscape_msg(landscape_data)
logger.info("Total use time: %s s." % (round(time.time() - start, 6)))
def _log_message(self, create_landscape, index=None, interval=None, final_epochs=None):
"""Generate drawing information using log."""
if final_epochs is None:
if create_landscape['result']:
msg = f"Start to create the {index + 1}/{len(self._epoch_group) + 1} landscapes, " \
f"checkpoint is {interval}, decomposition is PCA."
else:
msg = f"Start to create the {index + 1}/{len(self._epoch_group)} landscapes, " \
f"checkpoint is {interval}, decomposition is PCA."
else:
if create_landscape['train']:
msg = f"Start to create the {len(self._epoch_group) + 1}/{len(self._epoch_group) + 1} landscapes, " \
f"checkpoint is {final_epochs}, decomposition is Random. "
else:
msg = f"Start to create the {1}/{1} landscapes, " \
f"checkpoint is {final_epochs}, decomposition is Random."
logger.info(msg)
@staticmethod
def _set_context(device_id):
"""Set context."""
context.set_context(device_id=device_id)
context.set_context(mode=context.GRAPH_MODE)
def _create_landscape_by_pca(self, epochs, proz, landscape_size, device_ids=None, callback_fn=None, executor=None):
"""Create landscape by PCA."""
multi_parameters = self._get_model_params(epochs)
param_matrixs = []
for parameters in multi_parameters:
parlis = []
for param in parameters:
if ("weight" in param.name or "bias" in param.name) and ("moment" not in param.name):
data = param.data.asnumpy()
parlis = np.concatenate((parlis, data), axis=None)
else:
continue
param_matrixs.append(parlis)
param_matrixs = np.vstack(param_matrixs)
param_matrixs = param_matrixs[:-1] - param_matrixs[-1]
# Only 2 are needed, as we have to reduce high dimensions into 2D.And we reserve one for loss value.
pca = _PCA(n_comps=2)
principal_components = pca.compute(param_matrixs.T)
v_ori, w_ori = np.array(principal_components[:, 0]), np.array(principal_components[:, -1])
final_params = list(multi_parameters[-1])
# Reshape PCA directions(include dimensions of all parameters) into original shape of Model parameters
v_ndarray = self._reshape_vector(v_ori, final_params)
w_ndarray = self._reshape_vector(w_ori, final_params)
# Reshape PCA directions(include dimensions of only weights) into original shape of Model parameters
final_params_filtered = self._filter_weight_and_bias(final_params)
v_ndarray_filtered = self._reshape_vector(v_ori, final_params_filtered)
w_ndarray_filtered = self._reshape_vector(w_ori, final_params_filtered)
v_ndarray, w_ndarray = self._normalize_vector(final_params, v_ndarray, w_ndarray)
v_ndarray_filtered, w_ndarray_filtered = self._normalize_vector(final_params_filtered, v_ndarray_filtered,
w_ndarray_filtered)
# Flat to a single vector and calc alpha, beta
v_param = self._flat_ndarray(v_ndarray_filtered)
w_param = self._flat_ndarray(w_ndarray_filtered)
final_params_numpy = [param.data.asnumpy() for param in final_params]
final_params_filtered_numpy = [param.data.asnumpy() for param in final_params_filtered]
coefs = self._calc_coefs(multi_parameters, final_params_filtered_numpy, v_param, w_param)
# generate coordinates of loss landscape
coefs_x = coefs[:, 0][np.newaxis]
coefs_y = coefs[:, 1][np.newaxis]
x_axis = np.linspace(min(coefs_x[0]) - proz * (max(coefs_x[0]) - min(coefs_x[0])),
max(coefs_x[0]) + proz * (max(coefs_x[0]) - min(coefs_x[0])), landscape_size)
y_axis = np.linspace(min(coefs_y[0]) - proz * (max(coefs_y[0]) - min(coefs_y[0])),
max(coefs_y[0]) + proz * (max(coefs_y[0]) - min(coefs_y[0])), landscape_size)
x_points, y_points = np.meshgrid(x_axis, y_axis)
test_final_params = dict()
for param in final_params:
test_final_params[param.name] = param.data.asnumpy()
if executor is not None:
coefs_parts, y_points_parts = [], []
count_per_parts = len(coefs) // len(device_ids)
start = 0
for i in range(len(device_ids)):
if i != len(device_ids) - 1:
coefs_parts.append(coefs[start:start + count_per_parts])
start = start + count_per_parts
else:
coefs_parts.append(coefs[start:])
count_per_parts = len(y_points) // len(device_ids)
start = 0
logger.info("Use multi process, device_id: %s." % (device_ids))
for i in range(len(device_ids)):
if i != len(device_ids) - 1:
y_points_parts.append(y_points[start:start + count_per_parts])
start = start + count_per_parts
else:
y_points_parts.append(y_points[start:])
futures = []
for i, _ in enumerate(device_ids):
future = executor.submit(self._cont_loss_wrapper, callback_fn, test_final_params, final_params_numpy,
v_ndarray, w_ndarray, x_points, y_points_parts[i], coefs=coefs_parts[i])
futures.append(future)
wait(futures, return_when=ALL_COMPLETED)
z_points, paths = [], []
for future in futures:
paths += future.result()[0]
z_points += future.result()[1]
else:
paths, z_points = self._cont_loss_wrapper(callback_fn, test_final_params, final_params_numpy,
v_ndarray, w_ndarray, x_points, y_points, coefs=coefs)
paths = np.array(paths)
landscape_points = Points(x_points, y_points, np.vstack(z_points))
path_points = Points(coefs_x[0], coefs_y[0], paths.T[0])
zero_index = int(np.argwhere(path_points.x == 0))
convergence_point = Points(np.array([0]), np.array([0]), np.array([path_points.z[zero_index]]))
landscape = Landscape(intervals=epochs, decomposition='PCA', landscape_points=landscape_points,
path_points=path_points, convergence_point=convergence_point)
return landscape
def _cont_loss_wrapper(self, callback_fn, test_final_params, final_params_numpy,
v_ndarray, w_ndarray, x_points, y_points, coefs=None):
"""Compute loss wrapper."""
model, network, valid_dataset, metrics = callback_fn()
with open(os.path.join(self._ckpt_dir, 'train_metadata.json'), 'r') as file:
data = json.load(file)
self._check_json_file_data(data)
num_samples = data['num_samples']
batch_size = valid_dataset.get_batch_size()
num_batches = num_samples // batch_size
valid_dataset = valid_dataset.take(num_batches)
paths, final_params = [], []
for (key, value) in test_final_params.items():
parameter = Parameter(Tensor(value), name=key, requires_grad=True)
final_params.append(parameter)
if coefs is not None:
for i, coef in enumerate(coefs):
loss_data = self._cont_loss(valid_dataset, network, model, metrics, final_params,
final_params_numpy, [coef[0]], coef[1], v_ndarray, w_ndarray, path=True)
paths.append(loss_data)
print("Drawing landscape path total progress is %s/%s, landscape path loss is %s."
% (i+1, len(coefs), loss_data[0]))
# Start to calc loss landscape
z_points = list()
# Compute loss landscape
for i, _ in enumerate(y_points):
print("Drawing landscape total progress: %s/%s." % (i+1, len(y_points)))
vals = self._cont_loss(valid_dataset, network, model, metrics, final_params,
final_params_numpy, x_points[i], y_points[i][0],
v_ndarray, w_ndarray)
z_points.append(vals)
return paths, z_points
def _create_landscape_by_random(self, epochs, proz, landscape_size, device_ids=None,
callback_fn=None, executor=None):
"""Create landscape by Random."""
multi_parameters = self._get_model_params(epochs)
final_params = list(multi_parameters[-1])
final_params_numpy = [param.data.asnumpy() for param in final_params]
total_params = sum(np.size(p) for p in final_params_numpy)
v_rand = np.random.normal(size=total_params)
w_rand = np.random.normal(size=total_params)
# Reshape Random directions(include dimensions of all parameters) into original shape of Model parameters
v_ndarray = self._reshape_random_vector(v_rand, final_params_numpy)
w_ndarray = self._reshape_random_vector(w_rand, final_params_numpy)
v_ndarray, w_ndarray = self._normalize_vector(final_params, v_ndarray, w_ndarray)
boundaries_x, boundaries_y = 5, 5
x_axis = np.linspace(-proz * boundaries_x, proz * boundaries_x, landscape_size)
y_axis = np.linspace(-proz * boundaries_y, proz * boundaries_y, landscape_size)
x_points, y_points = np.meshgrid(x_axis, y_axis)
test_final_params = dict()
for param in final_params:
test_final_params[param.name] = param.data.asnumpy()
if executor is not None:
logger.info("Use multi process, device_id: %s." % (device_ids))
y_points_parts = []
count_per_parts = len(y_points) // len(device_ids)
start = 0
for i in range(len(device_ids)):
if i != len(device_ids) - 1:
y_points_parts.append(y_points[start:start + count_per_parts])
start = start + count_per_parts
else:
y_points_parts.append(y_points[start:])
futures = []
for i in range(len(device_ids)):
future = executor.submit(self._cont_loss_wrapper, callback_fn, test_final_params, final_params_numpy,
v_ndarray, w_ndarray, x_points, y_points_parts[i])
futures.append(future)
wait(futures, return_when=ALL_COMPLETED)
z_points = []
for future in futures:
z_points += future.result()[1]
else:
_, z_points = self._cont_loss_wrapper(callback_fn, test_final_params, final_params_numpy,
v_ndarray, w_ndarray, x_points, y_points)
landscape_points = Points(x_points, y_points, np.vstack(z_points))
convergence_point = Points(np.array([x_axis[len(x_axis)//2]]), np.array([y_axis[len(y_axis)//2]]),
np.array([z_points[len(x_axis)//2][len(y_axis)//2]]))
landscape = Landscape(intervals=epochs, decomposition='Random', landscape_points=landscape_points,
convergence_point=convergence_point)
return landscape
@staticmethod
def _filter_weight_and_bias(parameters):
"""Filter the weight and bias of parameters."""
filter_params = []
for param in parameters:
if ('weight' not in param.name and 'bias' not in param.name) or ('moment' in param.name):
continue
filter_params.append(param)
return filter_params
@staticmethod
def _reshape_vector(vector, parameters):
"""Reshape vector into model shape."""
ndarray = list()
index = 0
for param in parameters:
data = param.data.asnumpy()
if ("weight" not in param.name and "bias" not in param.name) or ("moment" in param.name):
ndarray.append(np.array(data, dtype=np.float32))
continue
vec_it = vector[index:(index + data.size)].reshape(data.shape)
ndarray.append(np.array(vec_it, dtype=np.float32))
index += data.size
return ndarray
@staticmethod
def _reshape_random_vector(vector, params_numpy):
""" Reshape random vector into model shape."""
ndarray = list()
index = 0
for param in params_numpy:
len_p = np.size(param)
p_size = np.shape(param)
vec_it = vector[index:(index + len_p)].reshape(p_size)
ndarray.append(np.array(vec_it, dtype=np.float32))
index += len_p
return ndarray
@staticmethod
def _normalize_vector(parameters, get_v, get_w):
"""
Normalizes the vectors spanning the 2D space, to make trajectories comparable between each other.
"""
for i, param in enumerate(parameters):
# Here as MindSpore ckpt has hyperparameters, we should skip them to make sure
# PCA calculation is correct.
data = param.data.asnumpy()
if ("weight" in param.name or "bias" in param.name) and ("moment" not in param.name):
factor_v = np.linalg.norm(data) / np.linalg.norm(get_v[i])
factor_w = np.linalg.norm(data) / np.linalg.norm(get_w[i])
get_v[i] = get_v[i] * factor_v
get_w[i] = get_w[i] * factor_w
else:
get_v[i] = get_v[i] * 0
get_w[i] = get_w[i] * 0
return get_v, get_w
@staticmethod
def _flat_ndarray(ndarray_vector):
"""Concatenates a python array of numpy arrays into a single, flat numpy array."""
return np.concatenate([item.flatten() for item in ndarray_vector], axis=None)
def _calc_coefs(self, parameter_group, final_param_ndarray, v_vector, w_vector):
"""
Calculates the scale factors for plotting points
in the 2D space spanned by the vectors v and w.
"""
matris = [v_vector, w_vector]
matris = np.vstack(matris)
matris = matris.T
pas = self._flat_ndarray(final_param_ndarray)
coefs = list()
for parameters in parameter_group:
testi = list()
for param in parameters:
# Here as MindSpore ckpt has hyperparameters,
# we should skip them to make sure PCA calculation is correct
if ('weight' not in param.name and 'bias' not in param.name) or ('moment' in param.name):
continue
testi.append(param.data.asnumpy())
st_vec = self._flat_ndarray(testi)
b_vec = st_vec - pas
# Here using least square method to get solutions of a equation system to generate alpha and beta.
coefs.append(np.hstack(np.linalg.lstsq(matris, b_vec, rcond=None)[0]))
return np.array(coefs)
def _cont_loss(self, ds_eval, network, model, metrics, parameters,
final_params_numpy, alph, beta, get_v, get_w, path=False):
"""
Calculates the loss landscape based on vectors v and w (which can be principal components).
Changes the internal state of model. Executes model.
"""
logger.info("start to cont loss")
vals = list()
al_item = 0
for i, _ in enumerate(alph):
# calculate new parameters for model
parameters_dict = dict()
for j, param in enumerate(parameters):
parameters_dict[param.name] = self._change_parameter(j, param, final_params_numpy,
alph[al_item], beta,
get_v, get_w)
al_item += 1
# load parameters into model and calculate loss
load_param_into_net(network, parameters_dict)
del parameters_dict
loss = self._loss_compute(model, ds_eval, metrics)
if path is False:
print("Current local landscape progress is %s/%s, landscape loss is %s."
% (i+1, len(alph), loss.get('Loss')))
vals = np.append(vals, loss.get('Loss'))
return vals
@staticmethod
def _change_parameter(index, parameter, final_params_numpy, alpha, beta, get_v, get_w):
"""Function for changing parameter value with map and lambda."""
data = final_params_numpy[index]
data_target = data + alpha * get_v[index] + beta * get_w[index]
data_target = Tensor(data_target.astype(np.float32))
parameter.set_data(Tensor(data_target))
return parameter
def _loss_compute(self, model, data, metrics):
"""Compute loss."""
dataset_sink_mode = False
self._metric_fns = get_metrics(metrics)
for metric in self._metric_fns.values():
metric.clear()
network = model.train_network
dataset_helper = DatasetHelper(data, dataset_sink_mode)
network.set_train(True)
network.phase = 'train'
for inputs in dataset_helper:
inputs = transfer_tensor_to_tuple(inputs)
outputs = network(*inputs)
self._update_metrics(outputs)
metrics = self._get_metrics()
return metrics
def _update_metrics(self, outputs):
"""Update metrics local values."""
if isinstance(outputs, Tensor):
outputs = (outputs,)
if not isinstance(outputs, tuple):
raise ValueError(f"The argument 'outputs' should be tuple, but got {type(outputs)}. "
f"Modify 'output' to Tensor or tuple. ")
for metric in self._metric_fns.values():
metric.update(outputs[0])
def _get_metrics(self):
"""Get metrics local values."""
metrics = dict()
for key, value in self._metric_fns.items():
metrics[key] = value.eval()
return metrics
def _check_unit(self, unit):
"""Check unit type and value."""
check_value_type('unit', unit, str)
if unit not in ["step", "epoch"]:
raise ValueError(f'For "{self.__class__.__name__}", the "unit" in train_metadata.json should be '
f'step or epoch, but got the: {unit}')
def _check_landscape_size(self, landscape_size):
"""Check landscape size type and value."""
check_value_type('landscape_size', landscape_size, int)
# landscape size should be between 3 and 256.
if landscape_size < 3 or landscape_size > 256:
raise ValueError(f'For "{self.__class__.__name__}", "landscape_size" in train_metadata.json should be '
f'between 3 and 256, but got the: {landscape_size}')
def _check_create_landscape(self, create_landscape):
"""Check create landscape type and value."""
check_value_type('create_landscape', create_landscape, dict)
for param, value in create_landscape.items():
if param not in ["train", "result"]:
raise ValueError(f'For "{self.__class__.__name__}", the key of "create_landscape" should be in '
f'["train", "result"], but got the: {param}.')
if len(create_landscape) < 2:
raise ValueError(f'For "{self.__class__.__name__}", the key of "create_landscape" should be train '
f'and result, but only got the: {param}')
check_value_type(param, value, bool)
def _check_intervals(self, intervals):
"""Check intervals type and value."""
check_value_type('intervals', intervals, list)
for _, interval in enumerate(intervals):
check_value_type('each interval in intervals', interval, list)
#Each interval have at least three epochs.
if len(interval) < 3:
raise ValueError(f'For "{self.__class__.__name__}", the length of each list in "intervals" '
f'should not be less than three, but got the: {interval}.')
for j in interval:
if not isinstance(j, int):
raise TypeError(f'For "{self.__class__.__name__}", the type of each value in "intervals" '
f'should be int, but got the: {type(j)}.')
def _check_device_ids(self, device_ids):
"""Check device_ids type and value."""
check_value_type('device_ids', device_ids, list)
for i in device_ids:
if not isinstance(i, int):
raise TypeError(f'For "{self.__class__.__name__}.gen_landscapes_with_multi_process", the parameter '
f'"device_ids" type should be int, but got the: {type(i)}.')
#device_id should be between 0 and 7.
if i < 0 or i > 7:
raise ValueError(f'For "{self.__class__.__name__}.gen_landscapes_with_multi_process", the parameter '
f'"device_ids" should be between 0 and 7, but got {i}.')
def _check_collect_landscape_data(self, collect_landscape):
"""Check collect landscape data type and value."""
for param in collect_landscape.keys():
if param not in ["landscape_size", "unit", "num_samples", "create_landscape", "intervals"]:
raise ValueError(f'For "{self.__class__.__name__}", the key of collect landscape should be '
f'landscape_size, unit, num_samples create_landscape or intervals, '
f'but got the: {param}. ')
if "landscape_size" in collect_landscape:
landscape_size = collect_landscape.get("landscape_size")
self._check_landscape_size(landscape_size)
if "unit" in collect_landscape:
unit = collect_landscape.get("unit")
self._check_unit(unit)
if "num_samples" in collect_landscape:
num_samples = collect_landscape.get("num_samples")
check_value_type("num_samples", num_samples, int)
if "create_landscape" in collect_landscape:
create_landscape = collect_landscape.get("create_landscape")
self._check_create_landscape(create_landscape)
if "intervals" in collect_landscape:
intervals = collect_landscape.get("intervals")
self._check_intervals(intervals)
def _check_json_file_data(self, json_file_data):
"""Check json file data."""
file_key = ["epoch_group", "model_params_file_map", "step_per_epoch", "unit",
"num_samples", "landscape_size", "create_landscape"]
for key in json_file_data.keys():
if key not in file_key:
raise ValueError(f'"train_metadata" json file should be {file_key}, but got the: {key}')
epoch_group = json_file_data["epoch_group"]
model_params_file_map = json_file_data["model_params_file_map"]
step_per_epoch = json_file_data["step_per_epoch"]
unit = json_file_data["unit"]
num_samples = json_file_data["num_samples"]
landscape_size = json_file_data["landscape_size"]
create_landscape = json_file_data["create_landscape"]
for _, epochs in enumerate(epoch_group.values()):
# Each epoch_group have at least three epochs.
if len(epochs) < 3:
raise ValueError(f'For "{self.__class__.__name__}", the "epoch_group" in train_metadata.json, '
f'length of each list in "epoch_group" should not be less than 3, '
f'but got: {len(epochs)}. ')
for epoch in epochs:
if str(epoch) not in model_params_file_map.keys():
raise ValueError(f'For "{self.__class__.__name__}", the "model_params_file_map" in '
f'train_metadata.json does not exist {epoch}th checkpoint in intervals.')
check_value_type('step_per_epoch', step_per_epoch, int)
self._check_landscape_size(landscape_size)
self._check_unit(unit)
check_value_type("num_samples", num_samples, int)
self._check_create_landscape(create_landscape)
class _PCA:
r"""
The internal class for computing PCA vectors.
.. math::
u, s, vt = svd(x - mean(x)),
u_i = u_i * s_i,
where :math:`mean` is the mean operator, :math:`svd` is the singular value decomposition operator.
:math:`u_i` is line :math:`i` of the :math:`u`, :math:`s_i` is column :math:`i` of the :math:`s`,
:math:`i` ranges from :math:`0` to :math:`n\_comps`.
Args:
n_comps (int): Number of principal components needed.
"""
def __init__(self, n_comps):
self._n_comps = n_comps
self._random_status = None
self._iterated_power = "auto"
self._n_oversamples = 10
@staticmethod
def _safe_dot(a, b):
"""Dot product that handle the matrix case correctly."""
if a.ndim > 2 or b.ndim > 2:
if sparse.issparse(b):
# Sparse is always 2 dimensional. Implies a is above 3 dimensional.
# [n, ..., o, p] @ [l, m] -> [n, ..., o, m]
a_2d = a.reshape(-1, a.shape[-1])
ret = a_2d @ b
ret = ret.reshape(*a.shape[:-1], b.shape[1])
elif sparse.issparse(a):
# Sparse is always 2 dimensional. Implies b is above 3 dimensional.
# [l, m] @ [n, ..., o, p, q] -> [l, n, ..., o, q]
b_ = np.rollaxis(b, -2)
b_2d = b_.reshape((b.shape[-2], -1))
ret = a @ b_2d
ret = ret.reshape(a.shape[0], *b_.shape[1:])
else:
ret = np.dot(a, b)
else:
ret = a @ b
return ret
@staticmethod
def _svd_turn(u, v, u_decision=True):
"""Confirm correction to ensure deterministic output from SVD."""
if u_decision:
# rows of v, columns of u
max_cols = np.argmax(np.abs(u), axis=0)
signs = np.sign(u[max_cols, list(range(u.shape[1]))])
v *= signs[:, np.newaxis]
u *= signs
else:
# rows of u, columns of v
max_rows = np.argmax(np.abs(v), axis=1)
signs = np.sign(v[list(range(v.shape[0])), max_rows])
v *= signs[:, np.newaxis]
u *= signs
return u, v
@staticmethod
def _check_random_status(seed):
"""Transform seed into a np.random.RandomState instance."""
if isinstance(seed, np.random.RandomState):
return seed
if seed is None or seed is np.random:
return np.random.RandomState()
if isinstance(seed, numbers.Integral):
return np.random.RandomState(seed)
raise ValueError(
"%r cannot be used to seed a numpy.random.RandomState instance" % seed
)
def compute(self, x):
"""Main method for computing principal components."""
n_components = self._n_comps
# small dimension (the shape is less than 500), and the full amount is calculated.
if max(x.shape) <= 500:
u, s, _ = self._fit_few(x)
# When dimension of x is much, truncated SVD is used for calculation.
elif 1 <= n_components < 0.8 * min(x.shape):
u, s, _ = self._fit_much(x, n_components)
# A case of n_components in (0, 1)
else:
u, s, _ = self._fit_few(x)
for i, _ in enumerate(s):
# To prevent s from being equal to 0, a small fixed noise is added.
# Adjust 1e-19 was found a good compromise for s.
if s[i] == 0:
s[i] = 1e-19
u = u[:, :self._n_comps]
u *= s[:self._n_comps]
return u
def _fit_few(self, x):
"""Compute principal components with full SVD on x, when dimension of x is few."""
mean_ = np.mean(x, axis=0)
x -= mean_
u, s, vt = linalg.svd(x, full_matrices=False)
u, vt = self._svd_turn(u, vt)
return u, s, vt
def _fit_much(self, x, n_components):
"""Compute principal components with truncated SVD on x, when dimension of x is much."""
random_state = self._check_random_status(self._random_status)
mean_ = np.mean(x, axis=0)
x -= mean_
u, s, vt = self._random_svd(x, n_components, n_oversamples=self._n_oversamples, random_state=random_state)
return u, s, vt
def _random_svd(self, m, n_components, n_oversamples=10, random_state="warn"):
"""Compute a truncated randomized SVD."""
n_random = n_components + n_oversamples
n_samples, n_features = m.shape
# Adjust 7 or 4 was found a good compromise for randomized SVD.
n_iter = 7 if n_components < 0.1 * min(m.shape) else 4
if n_samples < n_features:
m = m.T
q = self._random_range_finder(m, size=n_random, n_iter=n_iter, random_state=random_state)
# Project m to the low dimensional space using the basis vectors (q vector).
b = self._safe_dot(q.T, m)
# Compute the svd on this matrix (b matrix)
uhat, s, vt = linalg.svd(b, full_matrices=False)
del b
u = np.dot(q, uhat)
if n_samples < n_features:
u, vt = self._svd_turn(u, vt, u_decision=False)
else:
u, vt = self._svd_turn(u, vt)
if n_samples < n_features:
return vt[:n_components, :].T, s[:n_components], u[:, :n_components].T
return u[:, :n_components], s[:n_components], vt[:n_components, :]
def _random_range_finder(self, a, size, n_iter, random_state=None):
"""Computes an orthonormal matrix whose range approximates the range of A."""
random_state = self._check_random_status(random_state)
# Generate normal random vectors.
q = random_state.normal(size=(a.shape[1], size))
if a.dtype.kind == "f":
# Ensure f32 is retained as f32
q = q.astype(a.dtype, copy=False)
if n_iter <= 2:
power_iteration_normalizer = "none"
else:
power_iteration_normalizer = "LU"
# use power iterations with q to further compute the top singular vectors of a in q
for _ in range(n_iter):
if power_iteration_normalizer == "none":
q = self._safe_dot(a, q)
q = self._safe_dot(a.T, q)
elif power_iteration_normalizer == "LU":
q, _ = linalg.lu(self._safe_dot(a, q), permute_l=True)
q, _ = linalg.lu(self._safe_dot(a.T, q), permute_l=True)
# The orthogonal basis is extracted by the linear projection of Q, and the range of a is sampled.
q, _ = linalg.qr(self._safe_dot(a, q), mode="economic")
return q