Differences with torch.utils.data.DataLoader
torch.utils.data.DataLoader
class torch.utils.data.DataLoader(
dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None,
num_workers=0, collate_fn=None, pin_memory=False, drop_last=False,
timeout=0, worker_init_fn=None, multiprocessing_context=None, generator=None, *,
prefetch_factor=2, persistent_workers=False)
For more information, see torch.utils.data.DataLoader.
mindspore.dataset.GeneratorDataset
class mindspore.dataset.GeneratorDataset(
source, column_names=None, column_types=None, schema=None,
num_samples=None, num_parallel_workers=1, shuffle=None, sampler=None,
num_shards=None, shard_id=None, python_multiprocessing=True, max_rowsize=6)
For more information, see mindspore.dataset.GeneratorDataset.
Difference
PyTorch: DataLoader accepts a dataset loading class, a sampler, and options such as batching, shuffling, and multi-processing, and creates an iterator over the data. The dataset parameter can be a user-defined class that inherits from torch.utils.data.Dataset, or one of the predefined dataset loading classes in modules such as torchvision.datasets, torchtext.datasets, and torchaudio.datasets.
MindSpore: GeneratorDataset accepts a dataset loading class, a sampler, and shuffling, sharding, and multi-processing options, and creates an iterator over the data. This API serves the same purpose as PyTorch's DataLoader: both load user-defined datasets, but their parameter lists differ. The following code examples demonstrate how to implement the same functionality with either API.
| Categories | Subcategories | PyTorch | MindSpore | Difference |
|---|---|---|---|---|
| Parameter | Parameter 1 | dataset | source | Object that defines the dataset loading logic |
| | Parameter 2 | batch_size | - | Supported by the mindspore.dataset.Dataset.batch operation |
| | Parameter 3 | shuffle | shuffle | - |
| | Parameter 4 | sampler | sampler | - |
| | Parameter 5 | batch_sampler | - | Not supported by MindSpore |
| | Parameter 6 | num_workers | num_parallel_workers | - |
| | Parameter 7 | collate_fn | - | Supported by the per_batch_map argument of mindspore.dataset.Dataset.batch |
| | Parameter 8 | pin_memory | - | Not supported by MindSpore |
| | Parameter 9 | drop_last | - | Supported by the drop_remainder argument of mindspore.dataset.Dataset.batch |
| | Parameter 10 | timeout | - | Not supported by MindSpore |
| | Parameter 11 | worker_init_fn | - | Not supported by MindSpore |
| | Parameter 12 | multiprocessing_context | - | Specifies the multiprocessing context, not supported by MindSpore |
| | Parameter 13 | generator | - | Random index generator, not supported by MindSpore |
| | Parameter 14 | prefetch_factor | - | Supported by mindspore.dataset.config.set_prefetch_size |
| | Parameter 15 | persistent_workers | - | Whether to shut down the data loading workers after the dataset has been consumed once; supported by MindSpore |
| | Parameter 16 | - | column_names | Names of the columns generated by the dataset |
| | Parameter 17 | - | column_types | Specifies the data type of each data column in the generated dataset |
| | Parameter 18 | - | schema | Data format policy, which specifies the data type and shape of the data columns to be read |
| | Parameter 19 | - | num_samples | Specifies the number of samples to be read from the dataset |
| | Parameter 20 | - | num_shards | Specifies the number of shards the dataset is divided into for distributed training |
| | Parameter 21 | - | shard_id | Specifies the shard ID used during distributed training |
| | Parameter 22 | - | python_multiprocessing | Specifies whether to enable Python multi-processing to accelerate computation |
| | Parameter 23 | - | max_rowsize | Maximum shared-memory space allocated when data is copied between processes |
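The sharding parameters num_shards and shard_id (Parameters 20 and 21 above) have no direct counterpart in DataLoader itself; in PyTorch, sharding is commonly done with a sampler such as torch.utils.data.DistributedSampler. The following is a minimal sketch under that assumption, splitting 8 samples into 2 shards on a single machine; the list-based sources and the shard layout shown in the comments are illustrative only.

# Torch (sketch): shard the data with DistributedSampler
import torch
data = list(range(8))
sampler = torch.utils.data.DistributedSampler(data, num_replicas=2, rank=0, shuffle=False)
print(list(torch.utils.data.DataLoader(data, sampler=sampler)))
# Out: [tensor([0]), tensor([2]), tensor([4]), tensor([6])]

# MindSpore (sketch): shard the data with num_shards/shard_id
import mindspore as ms
import numpy as np
data = [np.array(i) for i in range(8)]
print(list(ms.dataset.GeneratorDataset(data, column_names=["data"], shuffle=False, num_shards=2, shard_id=0)))
# Shard 0 reads a disjoint half of the 8 samples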
Code Example 1
Define an iterable dataset class and a random-access dataset class, and load them through DataLoader/GeneratorDataset. Note that the default value of shuffle is False for DataLoader but True for GeneratorDataset.
# Torch
import torch

class MyIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, start, end):
        super().__init__()
        self.start = start
        self.end = end

    def __iter__(self):
        return iter(range(self.start, self.end))

ds = MyIterableDataset(start=3, end=7)
# Single-process loading
print(list(torch.utils.data.DataLoader(ds, num_workers=0, shuffle=False)))
# Out: [tensor([3]), tensor([4]), tensor([5]), tensor([6])]

class MyMapDataset(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = [1, 2, 3, 4]

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

ds = MyMapDataset()
# Single-process loading
print(list(torch.utils.data.DataLoader(ds)))
# Out: [tensor([1]), tensor([2]), tensor([3]), tensor([4])]
# MindSpore
import mindspore as ms

class MyIterableDataset():
    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __iter__(self):
        return iter(range(self.start, self.end))

ds = MyIterableDataset(start=3, end=7)
# Single-process loading
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], num_parallel_workers=1, shuffle=False)))
# Out: [[Tensor(shape=[], dtype=Int64, value= 3)], [Tensor(shape=[], dtype=Int64, value= 4)], [Tensor(shape=[], dtype=Int64, value= 5)], [Tensor(shape=[], dtype=Int64, value= 6)]]

class MyMapDataset():
    def __init__(self):
        super().__init__()
        self.data = [1, 2, 3, 4]

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

ds = MyMapDataset()
# Single-process loading
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)))
# Out: [[Tensor(shape=[], dtype=Int64, value= 1)], [Tensor(shape=[], dtype=Int64, value= 2)], [Tensor(shape=[], dtype=Int64, value= 3)], [Tensor(shape=[], dtype=Int64, value= 4)]]
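For multi-process loading, num_workers in DataLoader corresponds roughly to num_parallel_workers (together with python_multiprocessing) in GeneratorDataset. The following is a minimal sketch reusing the MyMapDataset classes defined above; on platforms that spawn worker processes, this code is assumed to run under an if __name__ == "__main__": guard.

# Torch (sketch): multi-process loading, reusing the torch MyMapDataset above
ds = MyMapDataset()  # the torch.utils.data.Dataset subclass from the Torch snippet
print(list(torch.utils.data.DataLoader(ds, num_workers=2)))
# With shuffle left at False, the values match the single-process run

# MindSpore (sketch): multi-process loading, reusing the MindSpore MyMapDataset above
ds = MyMapDataset()  # the plain class from the MindSpore snippet
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False,
                                       num_parallel_workers=2, python_multiprocessing=True)))
# With shuffle=False, the values match the single-process run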
Code Example 2
Define a data loader and apply a batch operation with a batch size of 2.
# Torch
import torch

class MyMapDataset(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = [1, 2, 3, 4, 5]

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

ds = MyMapDataset()
dataloader = torch.utils.data.DataLoader(ds, batch_size=2, drop_last=True)
print(list(dataloader))
# Out: [tensor([1, 2]), tensor([3, 4])]
# MindSpore
import mindspore as ms

class MyMapDataset():
    def __init__(self):
        super().__init__()
        self.data = [1, 2, 3, 4, 5]

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

ds = MyMapDataset()
dataloader = ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)
dataloader = dataloader.batch(2, drop_remainder=True)
print(list(dataloader))
# Out: [[Tensor(shape=[2], dtype=Int64, value= [1, 2])], [Tensor(shape=[2], dtype=Int64, value= [3, 4])]]
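If the trailing incomplete batch should be kept instead of dropped, drop_last and drop_remainder can be left at their default value False. A minimal sketch reusing the ds objects from Code Example 2:

# Torch (sketch): keep the trailing incomplete batch
print(list(torch.utils.data.DataLoader(ds, batch_size=2, drop_last=False)))
# Out: [tensor([1, 2]), tensor([3, 4]), tensor([5])]

# MindSpore (sketch): keep the trailing incomplete batch
dataloader = ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)
print(list(dataloader.batch(2, drop_remainder=False)))
# The final batch has shape [1] and holds the remaining sample 5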
Code Example 3
Define a data loader and apply a collate_fn during the batch operation.
# Torch
import torch

class MyMapDataset(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = torch.Tensor([1, 2, 3, 4, 5])

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

def my_collate_fn(batch):
    # Add 2 to every sample, then stack the samples into one batch tensor
    for i, _ in enumerate(batch):
        batch[i] = batch[i] + 2
    return torch.stack(batch)

ds = MyMapDataset()
dataloader = torch.utils.data.DataLoader(ds, batch_size=2, drop_last=True, collate_fn=my_collate_fn)
print(list(dataloader))
# Out: [tensor([3., 4.]), tensor([5., 6.])]
# MindSpore
import mindspore as ms
import numpy as np

class MyMapDataset():
    def __init__(self):
        super().__init__()
        self.data = [1, 2, 3, 4, 5]

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

def my_collate_fn(batch, batchinfo):
    # per_batch_map receives each column batch plus a BatchInfo object and must return a tuple of columns
    for i, _ in enumerate(batch):
        batch[i] = batch[i] + 2
    return np.stack(batch),

ds = MyMapDataset()
dataloader = ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)
dataloader = dataloader.batch(2, drop_remainder=True, per_batch_map=my_collate_fn)
print(list(dataloader))
# Out: [[Tensor(shape=[2], dtype=Int64, value= [3, 4])], [Tensor(shape=[2], dtype=Int64, value= [5, 6])]]
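Both APIs can also take an explicit sampler in place of the shuffle flag (the two options are mutually exclusive in both frameworks). The following is a minimal sketch using the built-in sequential samplers, reusing the MyMapDataset classes from Code Example 3.

# Torch (sketch): pass an explicit sampler instead of shuffle
ds = MyMapDataset()  # the torch.utils.data.Dataset subclass from the Torch snippet
sampler = torch.utils.data.SequentialSampler(ds)
print(list(torch.utils.data.DataLoader(ds, sampler=sampler)))
# Samples are read in their original order

# MindSpore (sketch): pass an explicit sampler instead of shuffle
ds = MyMapDataset()  # the plain class from the MindSpore snippet
sampler = ms.dataset.SequentialSampler()
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], sampler=sampler)))
# Samples are read in their original order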