How do I split a custom dataset into training and test datasets?
import pandas as pd
import numpy as np
import cv2
from torch.utils.data.dataset import Dataset

class CustomDatasetFromCSV(Dataset):
    def __init__(self, csv_path, transform=None):
        self.data = pd.read_csv(csv_path)
        # as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
        self.labels = pd.get_dummies(self.data['emotion']).to_numpy()
        self.height = 48
        self.width = 48
        self.transform = transform

    def __getitem__(self, index):
        pixels = self.data['pixels'].tolist()
        faces = []
        for pixel_sequence in pixels:
            face = [int(pixel) for pixel in pixel_sequence.split(' ')]
            face = np.asarray(face).reshape(self.width, self.height)
            face = cv2.resize(face.astype('uint8'), (self.width, self.height))
            faces.append(face.astype('float32'))
        faces = np.asarray(faces)
        faces = np.expand_dims(faces, -1)
        return faces, self.labels

    def __len__(self):
        return len(self.data)
This is what I could manage to do by using references from other repositories.
However, I want to split this dataset into train and test.
How can I do that inside this class? Or do I need to make a separate class to do that?
Using PyTorch's SubsetRandomSampler:
import torch
import numpy as np
import pandas as pd
import cv2
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler

class CustomDatasetFromCSV(Dataset):
    def __init__(self, csv_path, transform=None):
        self.data = pd.read_csv(csv_path)
        # as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
        self.labels = pd.get_dummies(self.data['emotion']).to_numpy()
        self.height = 48
        self.width = 48
        self.transform = transform

    def __getitem__(self, index):
        # This method should return only 1 sample and label
        # (according to "index"), not the whole dataset
        # So probably something like this for you:
        pixel_sequence = self.data['pixels'][index]
        face = [int(pixel) for pixel in pixel_sequence.split(' ')]
        face = np.asarray(face).reshape(self.width, self.height)
        face = cv2.resize(face.astype('uint8'), (self.width, self.height))
        label = self.labels[index]
        return face, label

    def __len__(self):
        return len(self.labels)
dataset = CustomDatasetFromCSV(my_path)
batch_size = 16
validation_split = .2
shuffle_dataset = True
random_seed = 42

# Creating data indices for training and validation splits:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=valid_sampler)

# Usage Example:
num_epochs = 10
for epoch in range(num_epochs):
    # Train:
    for batch_index, (faces, labels) in enumerate(train_loader):
        # ... your training step goes here
        pass
As of PyTorch 0.4.1, you can use random_split:
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
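For instance (a minimal sketch of my own, not part of the original answer; full_dataset stands for any map-style Dataset), the resulting splits can be wrapped in DataLoaders, and in newer PyTorch releases a seeded generator keeps the split reproducible:

import torch
from torch.utils.data import DataLoader

# A sketch only: "full_dataset" is assumed to be any map-style Dataset.
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset, [train_size, test_size],
    generator=torch.Generator().manual_seed(42))  # seeded for reproducibility

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)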
The current answers do a random split, which has the disadvantage that the number of samples per class is not guaranteed to be balanced. This is especially problematic when you want a small number of samples per class. For example, MNIST has 60,000 examples, i.e. 6,000 per digit. Assume that you want only 30 examples per digit in your training set. In this case, a random split may produce an imbalance between classes (one digit ends up with more training data than the others). So you want to make sure each digit gets exactly 30 labels. This is called stratified sampling.
One way is to use the sampler interface in PyTorch, and sample code is here.
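For example (a minimal sketch of my own, not the code behind that link; it assumes scikit-learn is available and that dataset is any map-style Dataset), you can combine StratifiedShuffleSplit with SubsetRandomSampler:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

# Collect the labels once, then split the index space in a stratified way.
labels = np.array([label for _, label in dataset])
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(sss.split(np.zeros(len(labels)), labels))

train_loader = DataLoader(dataset, batch_size=64,
                          sampler=SubsetRandomSampler(train_idx))
val_loader = DataLoader(dataset, batch_size=64,
                        sampler=SubsetRandomSampler(val_idx))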
Another way is to just hack your way through :). For example, below is a simple implementation for MNIST, where ds is the MNIST dataset and k is the number of samples needed per class.
import torch
from torch.utils.data import TensorDataset

def sampleFromClass(ds, k):
    # Note: this assumes ds yields (tensor, tensor) pairs, so the labels
    # support .item() and torch.unsqueeze.
    class_counts = {}
    train_data = []
    train_label = []
    test_data = []
    test_label = []
    for data, label in ds:
        c = label.item()
        class_counts[c] = class_counts.get(c, 0) + 1
        if class_counts[c] <= k:
            train_data.append(data)
            train_label.append(torch.unsqueeze(label, 0))
        else:
            test_data.append(data)
            test_label.append(torch.unsqueeze(label, 0))
    train_data = torch.cat(train_data)
    train_label = torch.cat(train_label)
    test_data = torch.cat(test_data)
    test_label = torch.cat(test_label)
    return (TensorDataset(train_data, train_label),
            TensorDataset(test_data, test_label))
You can use this function like this:
def main():
    train_ds = datasets.MNIST('../data', train=True, download=True,
                              transform=transforms.Compose([
                                  transforms.ToTensor()
                              ]))
    train_ds, test_ds = sampleFromClass(train_ds, 3)
Bear in mind that most canonical examples are misleading. For instance, on this page you will find MNIST. A common belief is that it has 60,000 images. Bang! Wrong! It has 70,000 images: 60,000 training images and 10,000 validation (test) images.
So, for the canonical datasets, the PyTorch style is to provide you with datasets that are already split.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.optim import *
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import os
import numpy as np
import random

bs = 512

# Normalize expects sequences for mean and std, hence the 1-tuples
t = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0,), std=(1,))
])

dl_train = DataLoader(torchvision.datasets.MNIST('/data/mnist', download=True, train=True, transform=t),
                      batch_size=bs, drop_last=True, shuffle=True)
dl_valid = DataLoader(torchvision.datasets.MNIST('/data/mnist', download=True, train=False, transform=t),
                      batch_size=bs, drop_last=True, shuffle=True)
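As a quick sanity check (my addition, not part of the original answer), you can pull one batch from a loader:

# One batch from the training loader; shapes assume 28x28 MNIST images.
x, y = next(iter(dl_train))
print(x.shape, y.shape)  # torch.Size([512, 1, 28, 28]) torch.Size([512])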
Here is the random_split method that comes with the PyTorch Subset class. Note that this method is the basis for SubsetRandomSampler.
For MNIST, if we use random_split:
loader = DataLoader(
    torchvision.datasets.MNIST('/data/mnist', train=True, download=True,
                               transform=torchvision.transforms.Compose([
                                   torchvision.transforms.ToTensor(),
                                   torchvision.transforms.Normalize(
                                       (0.5,), (0.5,))
                               ])),
    batch_size=16, shuffle=False)
print(loader.dataset.data.shape)
test_ds, valid_ds = torch.utils.data.random_split(loader.dataset, (50000, 10000))
print(test_ds, valid_ds)
print(test_ds.indices, valid_ds.indices)
print(test_ds.indices.shape, valid_ds.indices.shape)
We get:
torch.Size([60000, 28, 28])
<torch.utils.data.dataset.Subset object at 0x0000020FD1880B00> <torch.utils.data.dataset.Subset object at 0x0000020FD1880C50>
tensor([ 1520, 4155, 45472, ..., 37969, 45782, 34080]) tensor([ 9133, 51600, 22067, ..., 3950, 37306, 31400])
torch.Size([50000]) torch.Size([10000])
Our test_ds.indices and valid_ds.indices are generated at random from the range (0, 60000). However, if I want the index sequences (0, 49999) and (50000, 59999), unfortunately I cannot do that directly at the moment.
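One possible workaround (a sketch of my own, not something this answer provides) is to build the contiguous splits by hand with Subset:

from torch.utils.data import Subset

# Contiguous (non-random) splits; loader.dataset is the 60,000-sample
# MNIST training set created above.
train_ds = Subset(loader.dataset, range(0, 50000))      # indices 0..49999
valid_ds = Subset(loader.dataset, range(50000, 60000))  # indices 50000..59999
print(len(train_ds), len(valid_ds))  # 50000 10000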
In case you run the MNIST benchmark, it is predefined what should be the test dataset and what should be the validation dataset.
If you'd like to make sure your splits have balanced classes, you can use train_test_split from sklearn.
Assuming you have wrapped your data in a custom Dataset object:
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.1
BATCH_SIZE = 64
SEED = 42

# generate indices: instead of the actual data we pass in integers
train_indices, test_indices, _, _ = train_test_split(
    range(len(data)),
    data.targets,
    stratify=data.targets,
    test_size=TEST_SIZE,
    random_state=SEED
)

# generate subsets based on indices
train_split = Subset(data, train_indices)
test_split = Subset(data, test_indices)

# create batches
train_batches = DataLoader(train_split, batch_size=BATCH_SIZE, shuffle=True)
test_batches = DataLoader(test_split, batch_size=BATCH_SIZE, shuffle=True)
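Note that the stratify=data.targets argument is what keeps the class proportions equal in both splits; this assumes your Dataset exposes a targets attribute, as the built-in torchvision datasets (e.g. MNIST, CIFAR10) do.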
If you want up to X samples per class in the train dataset, you can use this code:
import collections
from torch.utils.data import Dataset, Subset

def stratify_split(dataset: Dataset, train_samples_per_class: int):
    # Assumes each item of the dataset is a dict with a 'target' key.
    train_indices = []
    val_indices = []
    target_counter = collections.Counter()
    for idx, data in enumerate(dataset):
        target = data['target']
        target_counter[target] += 1
        if target_counter[target] <= train_samples_per_class:
            train_indices.append(idx)
        else:
            val_indices.append(idx)
    train_dataset = Subset(dataset, train_indices)
    val_dataset = Subset(dataset, val_indices)
    return train_dataset, val_dataset
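For example (a hypothetical usage sketch; my_dataset is a placeholder for any Dataset whose items are dicts with a 'target' key):

# Keep at most 30 samples per class for training; the rest go to validation.
train_ds, val_ds = stratify_split(my_dataset, train_samples_per_class=30)
print(len(train_ds), len(val_ds))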