Differences with torch.utils.data.DataLoader
torch.utils.data.DataLoader
class torch.utils.data.DataLoader(
dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None,
num_workers=0, collate_fn=None, pin_memory=False, drop_last=False,
timeout=0, worker_init_fn=None, multiprocessing_context=None, generator=None, *,
prefetch_factor=2, persistent_workers=False)
For more information, see torch.utils.data.DataLoader.
mindspore.dataset.GeneratorDataset
class mindspore.dataset.GeneratorDataset(
source, column_names=None, column_types=None, schema=None,
num_samples=None, num_parallel_workers=1, shuffle=None, sampler=None,
num_shards=None, shard_id=None, python_multiprocessing=True, max_rowsize=None)
For more information, see mindspore.dataset.GeneratorDataset.
Difference
PyTorch: DataLoader accepts a dataset object, a sampler, and options such as batch size, shuffling, and multi-process loading, and creates an iterator for data iteration. The dataset parameter can be a user-defined class that inherits from torch.utils.data.Dataset, or a predefined dataset loading class from modules such as torchvision.datasets, torchtext.datasets, and torchaudio.datasets.
MindSpore: GeneratorDataset accepts a data loading class together with sampler, shuffling, sharding, and multi-processing options, and creates an iterator for data iteration. This API serves the same purpose as PyTorch's DataLoader: both load customized datasets, but their parameter lists differ. The following code examples demonstrate how to implement the same functionality with the two APIs.
| Categories | Subcategories | PyTorch | MindSpore | Difference |
|---|---|---|---|---|
| Parameter | Parameter 1 | dataset | source | Object that defines the dataset loading logic |
| | Parameter 2 | batch_size | - | Batching is supported in MindSpore by the batch operation (see Code Example 2) |
| | Parameter 3 | shuffle | shuffle | - |
| | Parameter 4 | sampler | sampler | - |
| | Parameter 5 | batch_sampler | - | Not supported by MindSpore |
| | Parameter 6 | num_workers | num_parallel_workers | - |
| | Parameter 7 | collate_fn | - | Supported in MindSpore by the per_batch_map argument of the batch operation (see Code Example 3) |
| | Parameter 8 | pin_memory | - | Not supported by MindSpore |
| | Parameter 9 | drop_last | - | Supported in MindSpore by the drop_remainder argument of the batch operation (see Code Example 2) |
| | Parameter 10 | timeout | - | Not supported by MindSpore |
| | Parameter 11 | worker_init_fn | - | Not supported by MindSpore |
| | Parameter 12 | multiprocessing_context | - | Specifies the multiprocessing context; not supported by MindSpore |
| | Parameter 13 | generator | - | Random index generator; not supported by MindSpore |
| | Parameter 14 | prefetch_factor | - | Supported in MindSpore by mindspore.dataset.config.set_prefetch_size |
| | Parameter 15 | persistent_workers | - | Whether to shut down the data loading workers after the dataset has been consumed once |
| | Parameter 16 | - | column_names | Names of the columns generated by the dataset |
| | Parameter 17 | - | column_types | Specifies the data type of each data column in the generated dataset |
| | Parameter 18 | - | schema | Data format policy, which specifies the data type and shape of the data columns to be read |
| | Parameter 19 | - | num_samples | Specifies the number of samples to be read from the dataset |
| | Parameter 20 | - | num_shards | Specifies the number of shards the dataset is divided into for distributed training (see the sketch after this table) |
| | Parameter 21 | - | shard_id | Specifies the shard ID used during distributed training |
| | Parameter 22 | - | python_multiprocessing | Specifies whether to enable Python multi-process mode to accelerate computation |
| | Parameter 23 | - | max_rowsize | Maximum shared-memory size allocated when data is copied between worker processes |
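The sampler, num_shards, and shard_id rows above are not exercised by the code examples that follow. The sketch below is a minimal illustration under stated assumptions, not part of the original examples: SketchDataset is a hypothetical random-access dataset introduced only here, and the calls rely on torch.utils.data.SequentialSampler, mindspore.dataset.SequentialSampler, and the num_shards/shard_id arguments of GeneratorDataset.
# Torch (sketch)
import torch
class SketchDataset(torch.utils.data.Dataset):
    # Hypothetical 4-sample random-access dataset used only for this sketch.
    def __init__(self):
        self.data = [1, 2, 3, 4]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)
ds = SketchDataset()
sampler = torch.utils.data.SequentialSampler(ds)   # deterministic order
print(list(torch.utils.data.DataLoader(ds, sampler=sampler)))
# MindSpore (sketch)
import mindspore as ms
class SketchDataset():
    # Same hypothetical dataset; no base class is required in MindSpore.
    def __init__(self):
        self.data = [1, 2, 3, 4]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)
ds = SketchDataset()
sampler = ms.dataset.SequentialSampler()            # deterministic order
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], sampler=sampler)))
# Sharding for distributed training: shard 0 of 2 reads half of the samples.
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], num_shards=2, shard_id=0, shuffle=False)))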
Code Example 1
Define an iterable dataset class and a random-access dataset class, and load them through DataLoader/GeneratorDataset. Note that shuffle defaults to False for DataLoader and to True for GeneratorDataset.
# Torch
import torch
class MyIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, start, end):
        super().__init__()
        self.start = start
        self.end = end
    def __iter__(self):
        return iter(range(self.start, self.end))
ds = MyIterableDataset(start=3, end=7)
# Single-process loading
print(list(torch.utils.data.DataLoader(ds, num_workers=0, shuffle=False)))
# Out: [tensor([3]), tensor([4]), tensor([5]), tensor([6])]
class MyMapDataset(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = [1, 2, 3, 4]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)
ds = MyMapDataset()
# Single-process loading
print(list(torch.utils.data.DataLoader(ds)))
# Out: [tensor([1]), tensor([2]), tensor([3]), tensor([4])]
# MindSpore
import mindspore as ms
class MyIterableDataset():
    def __init__(self, start, end):
        self.start = start
        self.end = end
    def __iter__(self):
        return iter(range(self.start, self.end))
ds = MyIterableDataset(start=3, end=7)
# Single-process loading
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], num_parallel_workers=1, shuffle=False)))
# Out: [[Tensor(shape=[], dtype=Int64, value= 3)], [Tensor(shape=[], dtype=Int64, value= 4)], [Tensor(shape=[], dtype=Int64, value= 5)], [Tensor(shape=[], dtype=Int64, value= 6)]]
class MyMapDataset():
    def __init__(self):
        self.data = [1, 2, 3, 4]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)
ds = MyMapDataset()
# Single-process loading
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)))
# Out: [[Tensor(shape=[], dtype=Int64, value= 1)], [Tensor(shape=[], dtype=Int64, value= 2)], [Tensor(shape=[], dtype=Int64, value= 3)], [Tensor(shape=[], dtype=Int64, value= 4)]]
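The examples above load data in a single process. As a hedged sketch that is not part of the original examples, the same map-style datasets could be loaded with multiple workers as follows; num_workers in DataLoader corresponds to num_parallel_workers in GeneratorDataset, and python_multiprocessing selects processes rather than threads in MindSpore. Here ds refers to the MyMapDataset instance defined in the respective framework's snippet above.
# Torch: multi-process loading (sketch); on spawn-based platforms (Windows/macOS)
# this must run under an `if __name__ == "__main__":` guard.
print(list(torch.utils.data.DataLoader(ds, num_workers=2)))
# MindSpore: multi-process loading (sketch)
print(list(ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False,
                                       num_parallel_workers=2, python_multiprocessing=True)))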
Code Example 2
Define a dataloader and apply batch operation with batch size 2.
# Torch
import torch
class MyMapDataset(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = [1, 2, 3, 4, 5]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)
ds = MyMapDataset()
dataloader = torch.utils.data.DataLoader(ds, batch_size=2, drop_last=True)
print(list(dataloader))
# Out: [tensor([1, 2]), tensor([3, 4])]
# MindSpore
import mindspore as ms
class MyMapDataset():
    def __init__(self):
        self.data = [1, 2, 3, 4, 5]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)
ds = MyMapDataset()
dataloader = ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)
dataloader = dataloader.batch(2, drop_remainder=True)
print(list(dataloader))
# Out: [[Tensor(shape=[2], dtype=Int64, value= [1, 2])], [Tensor(shape=[2], dtype=Int64, value= [3, 4])]]
Code Example 3
Define a dataloader and apply collate_fn during batch operation.
# Torch
import torch
class MyMapDataset(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()
        self.data = torch.Tensor([1, 2, 3, 4, 5])
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)
def my_collate_fn(batch):
    # Add 2 to every sample, then stack the samples into one batch tensor.
    for i, _ in enumerate(batch):
        batch[i] = batch[i] + 2
    return torch.stack(batch)
ds = MyMapDataset()
dataloader = torch.utils.data.DataLoader(ds, batch_size=2, drop_last=True, collate_fn=my_collate_fn)
print(list(dataloader))
# Out: [tensor([3., 4.]), tensor([5., 6.])]
# MindSpore
import mindspore as ms
import numpy as np
class MyMapDataset():
    def __init__(self):
        self.data = [1, 2, 3, 4, 5]
    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)
def my_collate_fn(batch, batchinfo):
    # Add 2 to every sample, then stack; the trailing comma returns a one-column tuple.
    for i, _ in enumerate(batch):
        batch[i] = batch[i] + 2
    return np.stack(batch),
ds = MyMapDataset()
dataloader = ms.dataset.GeneratorDataset(ds, column_names=["data"], shuffle=False)
dataloader = dataloader.batch(2, drop_remainder=True, per_batch_map=my_collate_fn)
print(list(dataloader))
# Out: [[Tensor(shape=[2], dtype=Int64, value= [3, 4])], [Tensor(shape=[2], dtype=Int64, value= [5, 6])]]
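A final note on column_names: GeneratorDataset organizes the values returned by the dataset into named columns, while DataLoader simply collates whatever __getitem__ returns. The hedged sketch below is not part of the original examples; MyPairDataset is a hypothetical class introduced here that returns a (data, label) pair, so GeneratorDataset needs two column names.
# Torch (sketch): __getitem__ returns a (data, label) pair
import torch
class MyPairDataset(torch.utils.data.Dataset):
    def __init__(self):
        self.data = [1, 2, 3, 4]
        self.label = [0, 1, 0, 1]
    def __getitem__(self, index):
        return self.data[index], self.label[index]
    def __len__(self):
        return len(self.data)
print(list(torch.utils.data.DataLoader(MyPairDataset())))
# Each element is a [data, label] pair of tensors.
# MindSpore (sketch): each returned value maps to one named column
import mindspore as ms
class MyPairDataset():
    def __init__(self):
        self.data = [1, 2, 3, 4]
        self.label = [0, 1, 0, 1]
    def __getitem__(self, index):
        return self.data[index], self.label[index]
    def __len__(self):
        return len(self.data)
print(list(ms.dataset.GeneratorDataset(MyPairDataset(), column_names=["data", "label"], shuffle=False)))
# Each element contains two tensors, one per column.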