# Copyright 2019-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Functions to support dataset serialize and deserialize.
"""
import json
import os
from mindspore import log as logger
from . import datasets as de
[docs]def serialize(dataset, json_filepath=""):
"""
Serialize dataset pipeline into a JSON file.
Note:
Currently some Python objects are not supported to be serialized.
Some examples of unsupported objects are callable user-defined Python functions (Python UDFs) and
GeneratorDataset.
For such unsupported objects, partially serialized JSON output is produced
for which later deserialization, pipeline execution of the deserialized JSON file
and/or re-serialization of the deserialized pipeline may result in an error.
For example, serialization of callable user-defined Python functions (Python UDFs) is not supported,
and a warning results on serialization. Any produced serialized JSON file output for this dataset pipeline
is not valid to be deserialized.
Args:
dataset (Dataset): The starting node.
json_filepath (str): The filepath where a serialized JSON file will be generated (default="").
Returns:
Dict, The dictionary contains the serialized dataset graph.
Raises:
OSError: Cannot open a file
Examples:
>>> dataset = ds.MnistDataset(mnist_dataset_dir, num_samples=100)
>>> one_hot_encode = transforms.OneHot(10) # num_classes is input argument
>>> dataset = dataset.map(operations=one_hot_encode, input_columns="label")
>>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
>>> # serialize it to JSON file
>>> serialized_data = ds.serialize(dataset, json_filepath="/path/to/mnist_dataset_pipeline.json")
"""
return dataset.to_json(json_filepath)
[docs]def deserialize(input_dict=None, json_filepath=None):
"""
Construct dataset pipeline from a JSON file produced by dataset serialize function.
Args:
input_dict (dict): A Python dictionary containing a serialized dataset graph (default=None).
json_filepath (str): A path to the JSON file (default=None).
Returns:
de.Dataset or None if error occurs.
Raises:
OSError: Can not open the JSON file.
Examples:
>>> dataset = ds.MnistDataset(mnist_dataset_dir, num_samples=100)
>>> one_hot_encode = transforms.OneHot(10) # num_classes is input argument
>>> dataset = dataset.map(operations=one_hot_encode, input_columns="label")
>>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
>>> # Case 1: to/from JSON file
>>> serialized_data = ds.serialize(dataset, json_filepath="/path/to/mnist_dataset_pipeline.json")
>>> deserialized_dataset = ds.deserialize(json_filepath="/path/to/mnist_dataset_pipeline.json")
>>> # Case 2: to/from Python dictionary
>>> serialized_data = ds.serialize(dataset)
>>> deserialized_dataset = ds.deserialize(input_dict=serialized_data)
"""
data = None
if input_dict:
data = de.DeserializedDataset(input_dict)
if json_filepath:
data = de.DeserializedDataset(json_filepath)
return data
def expand_path(node_repr, key, val):
"""Convert relative to absolute path."""
if isinstance(val, list):
node_repr[key] = [os.path.abspath(file) for file in val]
else:
node_repr[key] = os.path.abspath(val)
[docs]def show(dataset, indentation=2):
"""
Write the dataset pipeline graph to logger.info file.
Args:
dataset (Dataset): The starting node.
indentation (int, optional): The indentation used by the JSON print.
Do not indent if indentation is None (default=2).
Examples:
>>> dataset = ds.MnistDataset(mnist_dataset_dir, num_samples=100)
>>> one_hot_encode = transforms.OneHot(10)
>>> dataset = dataset.map(operations=one_hot_encode, input_columns="label")
>>> dataset = dataset.batch(batch_size=10, drop_remainder=True)
>>> ds.show(dataset)
"""
pipeline = dataset.to_json()
logger.info(json.dumps(pipeline, indent=indentation))
[docs]def compare(pipeline1, pipeline2):
"""
Compare if two dataset pipelines are the same.
Args:
pipeline1 (Dataset): a dataset pipeline.
pipeline2 (Dataset): a dataset pipeline.
Returns:
Whether pipeline1 is equal to pipeline2.
Examples:
>>> pipeline1 = ds.MnistDataset(mnist_dataset_dir, num_samples=100)
>>> pipeline2 = ds.Cifar10Dataset(cifar10_dataset_dir, num_samples=100)
>>> res = ds.compare(pipeline1, pipeline2)
"""
return pipeline1.to_json() == pipeline2.to_json()