Differences with torch.utils.data.DataLoader


torch.utils.data.DataLoader

class torch.utils.data.DataLoader(
    dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None,
    num_workers=0, collate_fn=None, pin_memory=False, drop_last=False,
    timeout=0, worker_init_fn=None, multiprocessing_context=None, generator=None, *,
    prefetch_factor=2, persistent_workers=False)

For more information, see torch.utils.data.DataLoader.

mindspore.dataset.GeneratorDataset

class mindspore.dataset.GeneratorDataset(
    source, column_names=None, column_types=None, schema=None,
    num_samples=None, num_parallel_workers=1, shuffle=None, sampler=None,
    num_shards=None, shard_id=None, python_multiprocessing=True, max_rowsize=None)

For more information, see mindspore.dataset.GeneratorDataset.

Difference

PyTorch: DataLoader accepts a dataset object, a sampler, and options such as batching, shuffling, and multi-processing, and creates an iterator over the data. The dataset parameter can be a user-defined class that inherits from torch.utils.data.Dataset, or one of the predefined dataset loading classes in modules such as torchvision.datasets, torchtext.datasets, and torchaudio.datasets.

MindSpore: GeneratorDataset accepts a data loading class together with sampler, shuffling, sharding, and multi-processing options, and creates an iterator over the data. This API serves the same purpose as PyTorch's DataLoader: both load customized datasets, but their parameter lists differ. The following code examples demonstrate how to implement the same functionality with the two APIs.

| Categories | Subcategories | PyTorch | MindSpore | Difference |
| --- | --- | --- | --- | --- |
| Parameter | Parameter 1 | dataset | source | Object that defines the dataset loading logic |
| | Parameter 2 | batch_size | - | Supported by the mindspore.dataset.batch operation |
| | Parameter 3 | shuffle | shuffle | - |
| | Parameter 4 | sampler | sampler | - |
| | Parameter 5 | batch_sampler | - | Not supported by MindSpore |
| | Parameter 6 | num_workers | num_parallel_workers | - |
| | Parameter 7 | collate_fn | - | Supported by the mindspore.dataset.batch operation |
| | Parameter 8 | pin_memory | - | Not supported by MindSpore |
| | Parameter 9 | drop_last | - | Supported by the mindspore.dataset.batch operation |
| | Parameter 10 | timeout | - | Not supported by MindSpore |
| | Parameter 11 | worker_init_fn | - | Not supported by MindSpore |
| | Parameter 12 | multiprocessing_context | - | Specifies the multiprocessing context; not supported by MindSpore |
| | Parameter 13 | generator | - | Random index generator; not supported by MindSpore |
| | Parameter 14 | prefetch_factor | - | Supported by mindspore.dataset.config.set_prefetch_size |
| | Parameter 15 | persistent_workers | - | Whether to keep the data loader alive after the dataset has been consumed once; covered in MindSpore by the num_epochs parameter of create_tuple_iterator, where num_epochs > 1 corresponds to persistent_workers=True |
| | Parameter 16 | - | column_names | Names of the columns generated by the dataset |
| | Parameter 17 | - | column_types | Specifies the data type of each data column in the generated dataset |
| | Parameter 18 | - | schema | Data format policy that specifies the data type and shape of the data columns to be read |
| | Parameter 19 | - | num_samples | Specifies the number of samples to be read from the dataset |
| | Parameter 20 | - | num_shards | Specifies the number of shards the dataset is divided into during distributed training |
| | Parameter 21 | - | shard_id | Specifies the shard ID used during distributed training |
| | Parameter 22 | - | python_multiprocessing | Specifies whether to enable Python multiprocessing to accelerate computing |
| | Parameter 23 | - | max_rowsize | Maximum space allocated in shared memory when data is copied between multiple processes |
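As a complement to the table, the following is a minimal sketch (not from the original documentation) of how the prefetch_factor and persistent_workers rows map to MindSpore. The MyMapDataset class is illustrative; the sketch uses mindspore.dataset.config.set_prefetch_size and the num_epochs parameter of create_tuple_iterator.

# MindSpore
import mindspore as ms

class MyMapDataset():
    def __init__(self):
        self.data = [1, 2, 3, 4]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)

# Rough counterpart of DataLoader(..., prefetch_factor=4): size of the pipeline prefetch buffer
ms.dataset.config.set_prefetch_size(4)

ds = MyMapDataset()
dataloader = ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)

# Rough counterpart of DataLoader(..., persistent_workers=True):
# with num_epochs > 1 the same iterator is reused across epochs instead of being rebuilt
iterator = dataloader.create_tuple_iterator(num_epochs=2)
for epoch in range(2):
    for data in iterator:
        print(epoch, data)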

Code Example 1

Define an iterable dataset class and a random-access dataset class, and load them through DataLoader/GeneratorDataset. Note that shuffle defaults to False for DataLoader, while GeneratorDataset shuffles by default.

# Torch
import torch

class MyIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, start, end):
        super().__init__()
        self.start = start
        self.end = end
    def __iter__(self):
        return iter(range(self.start, self.end))

ds = MyIterableDataset(start=3, end=7)
# Single-process loading
print(list(torch.utils.data.DataLoader(ds, num_workers=0, shuffle=False)))
# Out: [tensor([3]), tensor([4]), tensor([5]), tensor([6])]

class MyMapDataset(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = [1, 2, 3, 4]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)

ds = MyMapDataset()
# Single-process loading
print(list(torch.utils.data.DataLoader(ds)))
# Out: [tensor([1]), tensor([2]), tensor([3]), tensor([4])]

# MindSpore
import mindspore as ms

class MyIterableDataset():
    def __init__(self, start, end):
        self.start = start
        self.end = end
    def __iter__(self):
        return iter(range(self.start, self.end))

ds = MyIterableDataset(start=3, end=7)
# Single-process loading
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], num_parallel_workers=1, shuffle=False)))
# Out: [[Tensor(shape=[], dtype=Int64, value= 3)], [Tensor(shape=[], dtype=Int64, value= 4)], [Tensor(shape=[], dtype=Int64, value= 5)], [Tensor(shape=[], dtype=Int64, value= 6)]]

class MyMapDataset():
    def __init__(self):
        self.data = [1, 2, 3, 4]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)

ds = MyMapDataset()
# Single-process loading
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)))
# Out: [[Tensor(shape=[], dtype=Int64, value= 1)], [Tensor(shape=[], dtype=Int64, value= 2)], [Tensor(shape=[], dtype=Int64, value= 3)], [Tensor(shape=[], dtype=Int64, value= 4)]]

Code Example 2

Define a dataloader and apply a batch operation with batch size 2, dropping the incomplete last batch.

# Torch
import torch

class MyMapDataset(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = [1, 2, 3, 4, 5]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)

ds = MyMapDataset()
dataloader = torch.utils.data.DataLoader(ds, batch_size=2, drop_last=True)
print(list(dataloader))
# Out: [tensor([1, 2]), tensor([3, 4])]

# MindSpore
import mindspore as ms

class MyMapDataset():
    def __init__(self):
        self.data = [1, 2, 3, 4, 5]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)

ds = MyMapDataset()
dataloader = ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)
dataloader = dataloader.batch(2, drop_remainder=True)
print(list(dataloader))
# Out: [[Tensor(shape=[2], dtype=Int64, value= [1, 2])], [Tensor(shape=[2], dtype=Int64, value= [3, 4])]]

Code Example 3

Define a dataloader and apply a customized collate_fn during the batch operation.

# Torch
import torch

class MyMapDataset(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = torch.Tensor([1, 2, 3, 4, 5])
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)

def my_collate_fn(batch):
    for i, _ in enumerate(batch):
        batch[i] = batch[i] + 2
    return torch.stack(batch)

ds = MyMapDataset()
dataloader = torch.utils.data.DataLoader(ds, batch_size=2, drop_last=True, collate_fn=my_collate_fn)
print(list(dataloader))
# Out: [tensor([3., 4.]), tensor([5., 6.])]

# MindSpore
import mindspore as ms
import numpy as np

class MyMapDataset():
    def __init__(self):
        self.data = [1, 2, 3, 4, 5]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)

def my_collate_fn(batch, batchinfo):
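    # Unlike torch's collate_fn, per_batch_map receives each input column (batch) plus a
    # BatchInfo object, and must return a tuple of output columns (hence the trailing comma)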
    for i, _ in enumerate(batch):
        batch[i] = batch[i] + 2
    return np.stack(batch),

ds = MyMapDataset()
dataloader = ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)
dataloader = dataloader.batch(2, drop_remainder=True, per_batch_map=my_collate_fn)
print(list(dataloader))
# Out: [[Tensor(shape=[2], dtype=Int64, value= [3, 4])], [Tensor(shape=[2], dtype=Int64, value= [5, 6])]]
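
The num_shards and shard_id parameters listed in the table have no direct DataLoader counterpart; on the PyTorch side, sharding for distributed training is usually expressed with a sampler. The snippet below is a minimal sketch (not from the original documentation) of that mapping, assuming torch.utils.data.distributed.DistributedSampler with explicit num_replicas/rank; the order in which each shard's samples are drawn may differ between the two frameworks.

# Torch
import torch

class MyMapDataset(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = [1, 2, 3, 4, 5, 6]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)

ds = MyMapDataset()
# Shard the dataset into 2 shards and read shard 0
sampler = torch.utils.data.distributed.DistributedSampler(ds, num_replicas=2, rank=0, shuffle=False)
print(list(torch.utils.data.DataLoader(ds, sampler=sampler)))

# MindSpore
import mindspore as ms

class MyMapDataset():
    def __init__(self):
        self.data = [1, 2, 3, 4, 5, 6]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)

ds = MyMapDataset()
# The same sharding expressed with num_shards/shard_id
dataloader = ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False, num_shards=2, shard_id=0)
print(list(dataloader))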