How does one manually compute the error of the whole data set in pytorch?
I am trying to track and compute the error over the whole data set in pytorch. I wrote the following for CIFAR-10 with pytorch 0.3.1 (a reproducible and fully self-contained example):
import torch
from torch.autograd import Variable
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from math import inf
from pdb import set_trace as st

def error_criterion(outputs,labels):
    max_vals, max_indices = torch.max(outputs,1)
    train_error = (max_indices != labels).sum().data[0]/max_indices.size()[0]
    return train_error

def evalaute_mdl_data_set(loss,error,net,dataloader,enable_cuda,iterations=inf):
    '''
    Evaluate the error of the model under some loss and error with a specific data set.
    '''
    running_loss,running_error = 0,0
    for i,data in enumerate(dataloader):
        if i >= iterations:
            break
        inputs, labels = extract_data(enable_cuda,data,wrap_in_variable=True)
        outputs = net(inputs)
        running_loss += loss(outputs,labels).data[0]
        running_error += error(outputs,labels)
    return running_loss/(i+1),running_error/(i+1)

def extract_data(enable_cuda,data,wrap_in_variable=False):
    inputs, labels = data
    if enable_cuda:
        inputs, labels = inputs.cuda(), labels.cuda() #TODO potential speed up?
    if wrap_in_variable:
        inputs, labels = Variable(inputs), Variable(labels)
    return inputs, labels

def train_and_track_stats(enable_cuda, nb_epochs, trainloader,testloader, net,optimizer,criterion,error_criterion, iterations=inf):
    ''' Add stats before training '''
    train_loss_epoch, train_error_epoch = evalaute_mdl_data_set(criterion, error_criterion, net, trainloader, enable_cuda, iterations)
    test_loss_epoch, test_error_epoch = evalaute_mdl_data_set(criterion, error_criterion, net, testloader, enable_cuda, iterations)
    print(f'[-1, -1], (train_loss: {train_loss_epoch}, train error: {train_error_epoch}) , (test loss: {test_loss_epoch}, test error: {test_error_epoch})')
    ##
    ''' Start training '''
    print('about to start training')
    for epoch in range(nb_epochs):  # loop over the dataset multiple times
        running_train_loss,running_train_error = 0.0,0.0
        for i,data_train in enumerate(trainloader):
            ''' zero the parameter gradients '''
            optimizer.zero_grad()
            ''' train step = forward + backward + optimize '''
            inputs, labels = extract_data(enable_cuda,data_train,wrap_in_variable=True)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_train_loss += loss.data[0]
            running_train_error += error_criterion(outputs,labels)
        ''' End of Epoch: collect stats'''
        train_loss_epoch, train_error_epoch = running_train_loss/(i+1), running_train_error/(i+1)
        #train_loss_epoch, train_error_epoch = evalaute_mdl_data_set(criterion,error_criterion,net,trainloader,enable_cuda,iterations)
        test_loss_epoch, test_error_epoch = evalaute_mdl_data_set(criterion,error_criterion,net,testloader,enable_cuda,iterations)
        print(f'[{epoch}, {i+1}], (train_loss: {train_loss_epoch}, train error: {train_error_epoch}) , (test loss: {test_loss_epoch}, test error: {test_error_epoch})')
    return train_loss_epoch, train_error_epoch, test_loss_epoch, test_error_epoch

class Flatten(torch.nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)

def main():
    enable_cuda = True
    print('running main')
    num_workers = 0
    ''' Get Data set '''
    batch_size_test = 10000
    batch_size_train = 10000
    data_path = './data'
    transform = [transforms.ToTensor(),transforms.Normalize( (0.5, 0.5, 0.5), (0.5, 0.5, 0.5) )]
    transform = transforms.Compose(transform)
    trainset = torchvision.datasets.CIFAR10(root=data_path, train=True,download=False, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size_train,shuffle=True, num_workers=num_workers)
    testset = torchvision.datasets.CIFAR10(root=data_path, train=False,download=False, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size_test,shuffle=False, num_workers=num_workers)
    ''' Get model '''
    net = torch.nn.Sequential(
        torch.nn.Conv2d(3,13,5), #(in_channels, out_channels, kernel_size),
        Flatten(),
        torch.nn.Linear(28*28*13, 13),
        torch.nn.Linear(13, 10)
    )
    net.cuda()
    ''' Train '''
    nb_epochs = 10
    lr = 0.1
    err_criterion = error_criterion
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.0)
    train_and_track_stats(enable_cuda, nb_epochs, trainloader,testloader, net,optimizer,criterion,err_criterion, iterations=inf)
    ''' Done '''
    print('Done')

if __name__ == '__main__':
    main()
When I run it, I get the following output:
python my_cifar10.py
running main
[-1, -1], (train_loss: 2.3172860145568848, train error: 0.0054) , (test loss: 2.317185878753662, test error: 0.0038)
about to start training
[0, 5], (train_loss: 2.22599835395813, train error: 0.015160000000000002) , (test loss: 2.0623881816864014, test error: 0.0066)
[1, 5], (train_loss: 2.014406657218933, train error: 0.00896) , (test loss: 1.9619578123092651, test error: 0.0195)
[2, 5], (train_loss: 1.9428715705871582, train error: 0.01402) , (test loss: 1.918603539466858, test error: 0.0047)
[3, 5], (train_loss: 1.9434458494186402, train error: 0.01192) , (test loss: 1.9194672107696533, test error: 0.0125)
[4, 5], (train_loss: 1.8804980754852294, train error: 0.00794) , (test loss: 1.8549214601516724, test error: 0.004)
[5, 5], (train_loss: 1.8573726177215577, train error: 0.010159999999999999) , (test loss: 1.8625996112823486, test error: 0.0158)
[6, 5], (train_loss: 1.8454653739929199, train error: 0.01524) , (test loss: 1.8155865669250488, test error: 0.0122)
[7, 5], (train_loss: 1.8140610456466675, train error: 0.01066) , (test loss: 1.808283805847168, test error: 0.0101)
[8, 5], (train_loss: 1.8036894083023072, train error: 0.00832) , (test loss: 1.799634575843811, test error: 0.007)
[9, 5], (train_loss: 1.8023016452789307, train error: 0.0077399999999999995) , (test loss: 1.8030155897140503, test error: 0.0114)
Done
Clearly something must be wrong, because the test error is nearly zero for a ridiculously small and simple model (1 conv, 2 FCs).
The code seems too simple for me to see what is going wrong. I have been at it for a few days now, changing things around. Any new suggestions of things to try?
If your batch size is too large, then with your code the values of
(max_indices == labels).sum()
and
(max_indices != labels).sum()
do not add up to the batch size. This is because you are summing a torch.ByteTensor, which overflows for values > 255.
Using
(max_indices != labels).int().sum()
resolves the issue by casting the tensor to int before summing.
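Here is a minimal sketch (not from the original answer) that shows the wraparound in isolation; it assumes a current PyTorch API where the uint8 accumulation has to be forced explicitly, whereas 0.3.x summed in uint8 by default:

import torch

# 10000 "wrong" predictions stored as a uint8 (byte) mask.
wrong = torch.ones(10000, dtype=torch.uint8)

# Keeping the sum in uint8 wraps modulo 256: 10000 % 256 == 16.
print(wrong.sum(dtype=torch.uint8).item())   # typically prints 16, not 10000

# Casting to a wider type before summing gives the true count.
print(wrong.int().sum().item())              # 10000
print(wrong.float().sum().item())            # 10000.0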
As a solution, the pytorch forum suggested recasting things:
def error_criterion(outputs,labels):
    max_vals, max_indices = torch.max(outputs,1)
    train_error = (max_indices != labels).float().sum()/max_indices.size()[0]
    return train_error
The comparison (max_indices != labels) returns a torch.ByteTensor, which can overflow with a batch size of 10000: the byte sum wraps modulo 256, so, say, 9,400 misclassified examples out of 10,000 get reported as 9400 mod 256 = 184, i.e. an apparent error of about 0.018. Adding a .float() to that line, i.e. (max_indices != labels).float().sum(), seems to make the problem go away.
Now everything looks fine:
$ python my_cifar10.py
running main
[-1, -1], (train_loss: 2.3301061153411866, train error: 0.9383399844169616) , (test loss: 2.3303537368774414, test error: 0.9396999478340149)
about to start training
[0, 5], (train_loss: 2.1861009120941164, train error: 0.8279399871826172) , (test loss: 2.044313907623291, test error: 0.7494999766349792)
[1, 5], (train_loss: 2.009986090660095, train error: 0.7244199872016907) , (test loss: 1.9966429471969604, test error: 0.7224999666213989)
[2, 5], (train_loss: 2.0178127527236938, train error: 0.712559974193573) , (test loss: 1.9238039255142212, test error: 0.6651999950408936)
[3, 5], (train_loss: 1.9113547801971436, train error: 0.6625399827957154) , (test loss: 1.8861572742462158, test error: 0.6486999988555908)
[4, 5], (train_loss: 1.8807836771011353, train error: 0.6485799789428711) , (test loss: 1.8632378578186035, test error: 0.6452999711036682)
[5, 5], (train_loss: 1.8648049116134644, train error: 0.6440199613571167) , (test loss: 1.875121831893921, test error: 0.652999997138977)
[6, 5], (train_loss: 1.8700860738754272, train error: 0.6511399745941162) , (test loss: 1.830731987953186, test error: 0.633899986743927)
[7, 5], (train_loss: 1.8432915449142455, train error: 0.6376399874687195) , (test loss: 1.8518757820129395, test error: 0.6491999626159668)
[8, 5], (train_loss: 1.830060887336731, train error: 0.634719979763031) , (test loss: 1.7997492551803589, test error: 0.6247000098228455)
[9, 5], (train_loss: 1.8012208938598633, train error: 0.6230199933052063) , (test loss: 1.8045140504837036, test error: 0.628600001335144)
Done
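For reference, here is a minimal sketch of the same whole-data-set error computation written against the current PyTorch API (torch.no_grad, .item() and .to(device) are not in the 0.3.1 code above); accumulating plain Python ints sidesteps the dtype overflow entirely:

import torch

def dataset_error(net, dataloader, device='cuda'):
    """Fraction of misclassified examples over an entire dataloader."""
    net.eval()
    wrong, total = 0, 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            preds = net(inputs).argmax(dim=1)
            wrong += (preds != labels).long().sum().item()  # Python int, never overflows
            total += labels.size(0)
    return wrong / total

With the equal batch sizes used here (10000), dataset_error(net, testloader) gives the same number as averaging the per-batch errors in evalaute_mdl_data_set.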