Source code for mindspore.dataset.engine.obs.obs_mindrecord_dataset

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
The dataset module provides the internal Dataset API which loads MindRecord files from OBS.
"""


import math
from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing.managers import SyncManager
import os
import queue
import random
import sys
import time

from mindspore import log as logger
from ..datasets import Shuffle
from ...core.config import set_seed


class _Manager(SyncManager):
    pass


def _get_manager():
    """ PriorityQueue that cross threads."""

    _Manager.register("PriorityQueue", queue.PriorityQueue)
    m = _Manager()
    m.start()
    return m
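

# A minimal illustrative sketch (hypothetical helper, not used anywhere in this
# module) of how the manager-backed PriorityQueue is consumed below: entries are
# (index, dataset_file) tuples, so the file with the smallest index is always
# downloaded first. The file names are hypothetical.
def _sketch_priority_queue_ordering():
    manager = _get_manager()
    work_queue = manager.PriorityQueue()
    work_queue.put((2, "part_2.mindrecord"))
    work_queue.put((0, "part_0.mindrecord"))
    first = work_queue.get()  # (0, "part_0.mindrecord"): the smallest index wins
    manager.shutdown()
    return first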


def _init_cache_and_working_queue(cache, q, shard_files, local_path):
    """
    Initialize the download queue and the local cache that stores the status of each local dataset file.
    """

    from .util import init_cache_and_queue

    idx = 0
    for shard_file, _, _, is_full_dataset in shard_files:
        dataset_file = os.path.basename(shard_file)
        path = os.path.join(local_path, dataset_file)
        init_cache_and_queue(cache, q, path, shard_file,
                             idx, is_full_dataset, lock_file=dataset_file)
        idx += 1
    return cache, q
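

# A minimal illustrative sketch (hypothetical helper, not used anywhere in this
# module) of the cache that init_cache_and_queue (defined in .util, not shown on
# this page) fills: each key is a dataset file basename and each value is
# (idx, is_shared). Judging from the code below, idx >= 0 means the file is on
# local disk, idx == -1 means it still has to be downloaded, and shared files
# are never evicted because several shards read them. The entries are hypothetical.
def _sketch_cache_entry_layout():
    return {
        "part_0.mindrecord": (0, False),   # on local disk, first file of this epoch
        "part_1.mindrecord": (-1, False),  # not downloaded yet
        "part_2.mindrecord": (2, True),    # on local disk and shared with another shard
    }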


def _remove_unused_dataset(local_path, num_shards, shard_id, epoch_num):
    """ Rank(rank_id mod 8 equal to 0) remove all dataset files. """

    from .config_loader import config

    if not num_shards:
        return
    # If num_shards is less than or equal to 8, assume that there is only one node (server) and
    # the dataset does not need to be removed.
    if num_shards <= 8 or shard_id % 8 != 0:
        return

    sync_dir = '/cache/sync_data/' + str(epoch_num)
    while True:
        if os.path.exists(sync_dir) and len(os.listdir(sync_dir)) >= min(num_shards - 1, 7):
            break
        time.sleep(config.WARMINGUP_TIME)
        logger.info("[{} FUNCTION] Shard: {} wait for other rank ready in epoch: {}.".format(
            sys._getframe().f_code.co_name, shard_id, epoch_num))  # pylint: disable=W0212

    files = os.listdir(local_path)
    for dataset_file in files:
        if dataset_file.endswith('.db'):
            continue
        dataset_path = os.path.join(local_path, dataset_file)
        os.remove(dataset_path)

    for ready_file in os.listdir(sync_dir):
        os.remove(os.path.join(sync_dir, ready_file))


def _wait_remove_datset(num_shards, shard_id, epoch_num):
    """ Rank(rank_id mod 8 not equal to 0) wait for removing dataset files. """

    from .config_loader import config

    if not num_shards:
        return
    if num_shards <= 8 or shard_id % 8 == 0:
        return

    sync_dir = '/cache/sync_data/' + str(epoch_num)

    if not os.path.exists(sync_dir):
        try:
            os.makedirs(sync_dir)
        except FileExistsError:
            pass

    sync_file = os.path.join(sync_dir, 'ready_' + str(shard_id))
    with open(sync_file, 'w') as f:
        f.write('ok')

    while True:
        if os.path.exists(sync_dir) and not os.listdir(sync_dir):
            break
        time.sleep(config.WARMINGUP_TIME)
        logger.info("[{} FUNCTION] Shard: {} wait for removing dataset files in epoch: {}.".format(
            sys._getframe().f_code.co_name, shard_id, epoch_num))  # pylint: disable=W0212
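

# A minimal illustrative sketch (hypothetical helper, not used anywhere in this
# module) of the node-local handshake implemented by _remove_unused_dataset and
# _wait_remove_datset: every rank with shard_id % 8 != 0 drops a ready file and
# blocks until the directory is emptied, while the rank with shard_id % 8 == 0
# waits for min(num_shards - 1, 7) ready files, deletes the local dataset files
# and then clears the directory.
def _sketch_sync_dir_layout(epoch_num=3):
    sync_dir = '/cache/sync_data/' + str(epoch_num)
    # e.g. ['/cache/sync_data/3/ready_1', ..., '/cache/sync_data/3/ready_7']
    return [os.path.join(sync_dir, 'ready_' + str(shard_id)) for shard_id in range(1, 8)]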


def _init_shard_files(dataset_files, shuffle, seed, num_shards, shard_id, shard_equal_rows,
                      size_per_shard, local_path, current_epoch):
    """ Calculate the dataset files required by each sharding and the corresponding index. """

    from .config_loader import config
    from .util import detect_all_meta_files, fetch_meta_files, make_dataset_tuple, make_shard_files, make_shard_samples

    shard_files = None
    if shuffle is not False and shuffle != Shuffle.INFILE:
        set_seed(seed)
        random.shuffle(dataset_files)
    if num_shards:  # distributed training
        # As each shard must have the same number of samples, all meta files need to be fetched.
        if shard_equal_rows:
            if size_per_shard is None:
                if shard_id % 8 == 0:
                    fetch_meta_files(dataset_files, local_path)
                else:
                    while detect_all_meta_files(dataset_files, local_path) is False:
                        time.sleep(config.WAIT_META_TIME)
            full_dataset_size, dataset_file_size_list = make_dataset_tuple(
                dataset_files, local_path)
            size_per_shard = math.ceil(full_dataset_size / num_shards)
            shard_files = make_shard_samples(
                dataset_file_size_list, size_per_shard, shard_id)
        else:
            shard_files = make_shard_files(dataset_files, num_shards, shard_id)
    else:
        shard_files = [(dataset_file, -1, -1, True)
                       for dataset_file in dataset_files]
    logger.info("[{} FUNCTION] Shard: {} expect dataset: {} in epoch: {}.".format(
        sys._getframe().f_code.co_name, shard_id, shard_files, current_epoch))  # pylint: disable=W0212
    return shard_files, size_per_shard
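

# A minimal illustrative sketch (hypothetical numbers) of the size_per_shard
# arithmetic used above when shard_equal_rows is True: the total sample count is
# split with math.ceil, so every shard but the last reads the same number of
# samples and the last shard reads whatever remains.
def _sketch_size_per_shard(full_dataset_size=1001, num_shards=8):
    # math.ceil(1001 / 8) == 126; shards 0-6 read 126 samples each and the last
    # shard reads the remaining 119.
    return math.ceil(full_dataset_size / num_shards)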


def _download_work(shard_id, current_idx, local_path, cache, q):
    """ daemon process in backend. """
    from .config_loader import config
    from .util import try_load_from_obs, get_used_disk_per

    while True:
        idx, dataset_file = q.get()
        used_disk = get_used_disk_per()
        while used_disk > float(config.DISK_THRESHOLD):
            logger.info("[{} FUNCTION] Used disk space is {}%, and the disk threshold is {}%.".format(
                sys._getframe().f_code.co_name, used_disk*100,  # pylint: disable=W0212
                float(config.DISK_THRESHOLD)*100))
            retry_cnt = 0
            has_deleted = _delete_candidate_datasets(
                current_idx.value, idx, cache, q, local_path)
            while not has_deleted:
                if retry_cnt > config.MAX_RETRY:
                    logger.warning("Delete operation retries times {} has exceeded threshold {}, "
                                   "please clear enough disk space.".format(retry_cnt, config.MAX_RETRY))
                has_deleted = _delete_candidate_datasets(
                    current_idx.value, idx, cache, q, local_path)
                retry_cnt += 1
                time.sleep(config.RETRY_DELTA_TIME)
            used_disk = get_used_disk_per()

        logger.info("[{} FUNCTION] Shard: {} try to download: {}.".format(
            sys._getframe().f_code.co_name, shard_id, dataset_file))  # pylint: disable=W0212
        # update cache
        remote_path = os.path.dirname(dataset_file)
        dataset_file = os.path.basename(dataset_file)
        _, is_shared = cache[dataset_file]
        try_load_from_obs(remote_path, dataset_file, local_path)
        cache[dataset_file] = (idx, is_shared)
        logger.info("[{} FUNCTION] Shard: {} finish to download: {}.".format(
            sys._getframe().f_code.co_name, shard_id, dataset_file))  # pylint: disable=W0212
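

# get_used_disk_per is imported from .util and not shown on this page. A minimal
# sketch of an equivalent check, assuming it returns the used fraction of the
# disk that holds the local cache as a float in [0, 1]; the path is hypothetical.
def _sketch_used_disk_fraction(path='/cache'):
    import shutil
    usage = shutil.disk_usage(path)  # named tuple: (total, used, free) in bytes
    return usage.used / usage.total  # e.g. 0.87 means the disk is 87% full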


def _delete_candidate_datasets(current_idx, queue_top_idx, cache, q, local_path):
    """
    1. Try to delete all dataset files that have already been consumed in this epoch.
    2. Otherwise, try to delete the lowest-priority (highest-index) dataset file of this epoch.
    3. As soon as a low-priority dataset file is deleted, it is put back into the download queue.
    """

    used_datasets = []
    low_priority_dataset = ''
    max_idx = -1
    delete = False
    for k, v in cache.items():
        idx, is_shared = v
        if is_shared is False and idx >= 0:
            if idx > max_idx:
                max_idx = idx
                low_priority_dataset = k
            if idx < current_idx:
                used_datasets.append(k)
    for used_dataset in used_datasets:
        dataset_path = os.path.join(local_path, used_dataset)
        if not os.path.exists(dataset_path):
            continue
        # update cache
        idx, is_shared = cache[used_dataset]
        cache[used_dataset] = (-1, is_shared)
        os.remove(dataset_path)
        delete = True
        logger.info("[{} FUNCTION] Delete used dataset file: {} and update the cache.".format(
            sys._getframe().f_code.co_name, used_dataset))  # pylint: disable=W0212

    if delete:
        return True
    if max_idx <= current_idx or max_idx <= queue_top_idx:
        return False
    dataset_path = os.path.join(local_path, low_priority_dataset)
    if not os.path.exists(dataset_path):
        return False
    # update cache
    idx, is_shared = cache[low_priority_dataset]
    cache[low_priority_dataset] = (-1, is_shared)
    os.remove(dataset_path)
    q.put((idx, low_priority_dataset))
    logger.info("[{} FUNCTION] Delete low priority dataset file: {} and update the cache.".format(
        sys._getframe().f_code.co_name, low_priority_dataset))  # pylint: disable=W0212
    return True
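

# A minimal illustrative sketch (hypothetical helper and cache contents, not used
# anywhere in this module) of the eviction rules above, for a reader currently on
# file index 2 with index 3 at the head of the download queue.
def _sketch_eviction_decision(current_idx=2):
    cache = {
        "part_0.mindrecord": (0, False),  # idx < current_idx: already consumed, removed first
        "part_2.mindrecord": (2, True),   # shared with another shard: never a candidate
        "part_4.mindrecord": (4, False),  # highest idx: removed (and re-queued) only when no
                                          # consumed file is left and its idx exceeds both
                                          # current_idx and the queue head's idx
    }
    used = [name for name, (idx, shared) in cache.items()
            if not shared and 0 <= idx < current_idx]
    return used  # ['part_0.mindrecord']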


def _sync_up_for_obs_mindrecord_dataset(rank_id, current_epoch):
    """ Upload the synchronization file to OBS. """

    from .config_loader import config
    from .util import file_upload_to_obs

    sync_info = "download_dataset"
    job_id = os.environ.get('BATCH_JOB_ID', 'unknown')
    ready_file_name = sync_info + '_ready_' + str(rank_id) + '.txt'
    ready_dir = os.path.join(job_id, str(current_epoch) + "/")

    file_upload_to_obs(config.SYNC_OBS_PATH, ready_dir, ready_file_name)
    logger.info("[{} FUNCTION] Current rank:{}'s sync file:{} is ready for epoch:{}.".format(
        sys._getframe().f_code.co_name, rank_id, os.path.join(ready_dir, ready_file_name),  # pylint: disable=W0212
        current_epoch))
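

# A minimal illustrative sketch (hypothetical helper, job id and rank) of the
# object key that the upload above creates under config.SYNC_OBS_PATH.
def _sketch_ready_object_key(rank_id=5, current_epoch=2, job_id='job-123'):
    ready_dir = os.path.join(job_id, str(current_epoch) + "/")
    # -> 'job-123/2/download_dataset_ready_5.txt'
    return os.path.join(ready_dir, 'download_dataset_ready_' + str(rank_id) + '.txt')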


def sync_wait_for_dataset(rank_id, rank_size, current_epoch):
    """
    Wait until the dataset files required by all devices are downloaded.

    Note:
        It should be used together with `mindspore.dataset.OBSMindDataset` and
        be called before each epoch.

    Args:
        rank_id(int): Rank ID of the device.
        rank_size(int): Rank size.
        current_epoch(int): Number of current epochs.

    Examples:
        >>> # Create a synchronization callback
        >>> import mindspore as ms
        >>> from mindspore.dataset import sync_wait_for_dataset
        >>>
        >>> class SyncForDataset(ms.Callback):
        ...     def __init__(self):
        ...         super(SyncForDataset, self).__init__()
        ...     def epoch_begin(self, run_context):
        ...         cb_params = run_context.original_args()
        ...         epoch_num = cb_params.cur_epoch_num
        ...         sync_wait_for_dataset(rank_id, rank_size, epoch_num)
    """

    from .config_loader import config
    from .util import obsClient, get_bucket_and_key

    bucket_name, object_key = get_bucket_and_key(config.SYNC_OBS_PATH)
    job_id = os.environ.get('BATCH_JOB_ID', 'unknown')
    ready_dir = os.path.join(object_key, job_id, str(current_epoch) + "/")

    success = False
    ready_num = 0  # initialized here so the progress log below is always valid
    while True:
        if success:
            break
        try:
            # no guarantee that the dir is included.
            resp = obsClient.listObjects(bucket_name, prefix=ready_dir)
            if resp.status < 300:
                ready_num = 0
                for content in resp.body.contents:
                    if content.key.endswith(".txt"):
                        ready_num += 1
                if ready_num >= rank_size:
                    success = True
            else:
                logger.warning("[{} FUNCTION] OBS SDK errorCode:{}, errMsg: {}.".format(
                    sys._getframe(), resp.errorCode, resp.errorMessage))  # pylint: disable=W0212
        except Exception:  # pylint: disable=W0703
            import traceback
            logger.error(traceback.format_exc())
        time.sleep(config.RETRY_DELTA_TIME)
        logger.info("[{} FUNCTION] Waiting for sync dir:{} and current_rank:{}, total_rank:{}, "
                    "ready_rank:{} in epoch:{}.".format(sys._getframe().f_code.co_name,  # pylint: disable=W0212
                                                        ready_dir, rank_id, rank_size, ready_num,
                                                        current_epoch))
    logger.info("[{} FUNCTION] Succeed to sync dir:{} and begin epoch:{}.".format(
        sys._getframe().f_code.co_name, ready_dir, current_epoch))  # pylint: disable=W0212


def _sync_for_obs_mindrecord_dataset(worker, shard_files, cache, num_shards, shard_id, current_epoch):
    """ Synchronize all shards. """

    from .config_loader import config

    while True:
        if worker.ready():
            worker.get()
        dataset, _, _, _ = shard_files[-1]
        current_dataset = os.path.basename(dataset)
        hit_cache = cache[current_dataset][0]
        if hit_cache >= 0:  # hit cache
            logger.info("[{} FUNCTION] Current_rank:{} has downloaded:{} for epoch:{}.".format(
                sys._getframe().f_code.co_name, shard_id, dataset, current_epoch))  # pylint: disable=W0212
            _sync_up_for_obs_mindrecord_dataset(shard_id, current_epoch)
            break
        time.sleep(config.WARMINGUP_TIME)
        logger.info("[{} FUNCTION] Current_rank:{} wait for downloading:{} in epoch:{}.".format(
            sys._getframe().f_code.co_name, shard_id, dataset, current_epoch))  # pylint: disable=W0212
    sync_wait_for_dataset(shard_id, num_shards, current_epoch)


class MindRecordFromOBS:
    """ Internal class which loads remote dataset files from OBS. """

    def __init__(self, dataset_files, columns_list, shuffle, num_shards, shard_id, shard_equal_rows, local_path):
        self._dataset_files = dataset_files
        self._columns_list = columns_list
        self._num_shards = num_shards
        self._shard_id = shard_id
        self._shard_equal_rows = shard_equal_rows
        self._local_path = os.path.realpath(local_path)
        self._shuffle = Shuffle.GLOBAL if shuffle is True else shuffle

        from .config_loader import config
        self._epoch_seed = config.SEED
        self._file_seed = config.SEED
        self._size_per_shard = None

        self._curr_epoch = 1
        self._curr_step = 1

        self._shard_files, self._size_per_shard = _init_shard_files(self._dataset_files, self._shuffle,
                                                                    self._epoch_seed, self._num_shards,
                                                                    self._shard_id, self._shard_equal_rows,
                                                                    self._size_per_shard, self._local_path,
                                                                    self._curr_epoch)
        m = _get_manager()
        self._queue = m.PriorityQueue()
        self._cache = m.dict()
        self._index = 0
        self._current_idx = m.Value('i', self._index)
        self._cache, self._queue = _init_cache_and_working_queue(
            self._cache, self._queue, self._shard_files, self._local_path)

        self._index = 0
        self._first_epoch = True
        self._iteration = None
        self._cache_miss_times = 0

        self._pool = ThreadPool(processes=1)
        self._worker = self._pool.apply_async(
            _download_work, (self._shard_id, self._current_idx, self._local_path, self._cache, self._queue))
        _sync_for_obs_mindrecord_dataset(
            self._worker, self._shard_files, self._cache, self._num_shards, self._shard_id, self._curr_epoch)

    def __next__(self):
        from .config_loader import config
        from ..datasets_standard_format import MindDataset
        from .util import make_sampler

        if self._iteration:
            try:
                self._curr_step += 1
                return next(self._iteration)
            except StopIteration:
                self._index += 1
                self._current_idx.value = self._index
                self._iteration = None
                if self._index >= len(self._shard_files):
                    self._first_epoch = False
                    self._curr_epoch += 1
                    self._curr_step = 0
                    raise StopIteration
                return next(self)
        else:
            f, start, end, is_full_dataset = self._shard_files[self._index]
            current_dataset = os.path.basename(f)
            hit_cache = self._cache[current_dataset][0]
            if hit_cache >= 0:  # hit cache
                self._cache_miss_times = 0
                # launch pipeline
                set_seed(self._file_seed)
                sampler = make_sampler(
                    self._shuffle, is_full_dataset, start, end)
                self._file_seed += 1

                path = os.path.join(self._local_path, current_dataset)
                logger.info("[{} FUNCTION] Shard:{} start to load dataset:{} in epoch:{}.".format(
                    sys._getframe().f_code.co_name, self._shard_id, path, self._curr_epoch))  # pylint: disable=W0212
                self._iteration = MindDataset(dataset_files=[path],
                                              columns_list=self._columns_list,
                                              sampler=sampler,
                                              shuffle=None).create_tuple_iterator(num_epochs=1, output_numpy=True)
            else:  # cache miss
                self._cache_miss_times += 1
                logger.info("[{} FUNCTION] Cache miss in shard {} for times {}, expect dataset {}.".format(
                    sys._getframe().f_code.co_name, self._shard_id, self._cache_miss_times,  # pylint: disable=W0212
                    current_dataset))
                time.sleep(self._cache_miss_times * config.WAIT_STEP_TIME)
            return next(self)

    def __iter__(self):
        if self._first_epoch:
            self._index = 0
            self._current_idx.value = self._index
            self._iteration = None
            return self

        self._index = 0
        self._current_idx.value = self._index
        self._epoch_seed += 1
        self._iteration = None
        self._shard_files, self._size_per_shard = _init_shard_files(self._dataset_files, self._shuffle,
                                                                    self._epoch_seed, self._num_shards,
                                                                    self._shard_id, self._shard_equal_rows,
                                                                    self._size_per_shard, self._local_path,
                                                                    self._curr_epoch)
        self._cache.clear()

        # reset queue
        try:
            while True:
                self._queue.get_nowait()
        except queue.Empty:
            pass

        _remove_unused_dataset(
            self._local_path, self._num_shards, self._shard_id, self._curr_epoch)
        _wait_remove_datset(self._num_shards, self._shard_id, self._curr_epoch)

        self._cache, self._queue = _init_cache_and_working_queue(
            self._cache, self._queue, self._shard_files, self._local_path)
        _sync_for_obs_mindrecord_dataset(self._worker, self._shard_files, self._cache,
                                         self._num_shards, self._shard_id, self._curr_epoch)
        return self

    def __len__(self):
        from .util import fetch_meta_files, make_dataset_tuple
        if self._size_per_shard is not None:
            return self._size_per_shard
        dataset_files = []
        for dataset_file, _, _, _ in self._shard_files:
            dataset_files.append(dataset_file)
        fetch_meta_files(dataset_files, self._local_path)
        self._size_per_shard, _ = make_dataset_tuple(
            dataset_files, self._local_path)
        return len(self)

    def get_col_names(self):
        """ Get the column names of the MindRecord format dataset. """
        from ..datasets_standard_format import MindDataset

        target_dataset = None
        while target_dataset is None:
            for f, _, _, _ in self._shard_files:
                current_dataset = os.path.basename(f)
                if self._cache[current_dataset][0] >= 0:
                    target_dataset = current_dataset
        path = os.path.join(self._local_path, target_dataset)
        _iteration = MindDataset(dataset_files=[path], shuffle=False)
        return _iteration.get_col_names()
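

# A minimal usage sketch (hypothetical helper, OBS paths and arguments, not used
# anywhere in this module) of the internal class above; in practice it is driven
# through the public mindspore.dataset.OBSMindDataset API and assumes that OBS
# access is already configured.
def _sketch_mindrecord_from_obs_usage():
    dataset = MindRecordFromOBS(dataset_files=["obs://bucket/path/part_0.mindrecord",
                                               "obs://bucket/path/part_1.mindrecord"],
                                columns_list=["image", "label"],
                                shuffle=True,
                                num_shards=8,
                                shard_id=0,
                                shard_equal_rows=True,
                                local_path="/cache/dataset")
    print(dataset.get_col_names())
    for _ in dataset:  # one pass over this shard's samples
        pass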