如何在pytorch中手动计算整个数据集的误差?

How does one manually compute the error of the whole data set in pytorch?

我试图跟踪整个数据集的错误,并在 pytorch 中计算整个数据集的错误。我在 cifar10 pytorch 0.3.1 中写了以下内容(可重现的示例并完全包含):

import torch
from torch.autograd import Variable
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

from math import inf

from pdb import set_trace as st

def error_criterion(outputs,labels):
    max_vals, max_indices = torch.max(outputs,1)
    train_error = (max_indices != labels).sum().data[0]/max_indices.size()[0]
    return train_error

def evalaute_mdl_data_set(loss,error,net,dataloader,enable_cuda,iterations=inf):
    '''
    Evaluate the error of the model under some loss and error with a specific data set.
    '''
    running_loss,running_error = 0,0
    for i,data in enumerate(dataloader):
        if i >= iterations:
            break
        inputs, labels = extract_data(enable_cuda,data,wrap_in_variable=True)
        outputs = net(inputs)
        running_loss += loss(outputs,labels).data[0]
        running_error += error(outputs,labels)
    return running_loss/(i+1),running_error/(i+1)

def extract_data(enable_cuda,data,wrap_in_variable=False):
    inputs, labels = data
    if enable_cuda:
        inputs, labels = inputs.cuda(), labels.cuda() #TODO potential speed up?
    if wrap_in_variable:
        inputs, labels = Variable(inputs), Variable(labels)
    return inputs, labels

def train_and_track_stats(enable_cuda, nb_epochs, trainloader,testloader, net,optimizer,criterion,error_criterion, iterations=inf):
    ''' Add stats before training '''
    train_loss_epoch, train_error_epoch = evalaute_mdl_data_set(criterion, error_criterion, net, trainloader, enable_cuda, iterations)
    test_loss_epoch, test_error_epoch = evalaute_mdl_data_set(criterion, error_criterion, net, testloader, enable_cuda, iterations)
    print(f'[-1, -1], (train_loss: {train_loss_epoch}, train error: {train_error_epoch}) , (test loss: {test_loss_epoch}, test error: {test_error_epoch})')
    ##
    ''' Start training '''
    print('about to start training')
    for epoch in range(nb_epochs):  # loop over the dataset multiple times
        running_train_loss,running_train_error = 0.0,0.0
        for i,data_train in enumerate(trainloader):
            ''' zero the parameter gradients '''
            optimizer.zero_grad()
            ''' train step = forward + backward + optimize '''
            inputs, labels = extract_data(enable_cuda,data_train,wrap_in_variable=True)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_train_loss += loss.data[0]
            running_train_error += error_criterion(outputs,labels)
        ''' End of Epoch: collect stats'''
        train_loss_epoch, train_error_epoch = running_train_loss/(i+1), running_train_error/(i+1)
        #train_loss_epoch, train_error_epoch = evalaute_mdl_data_set(criterion,error_criterion,net,trainloader,enable_cuda,iterations)
        test_loss_epoch, test_error_epoch = evalaute_mdl_data_set(criterion,error_criterion,net,testloader,enable_cuda,iterations)
        print(f'[{epoch}, {i+1}], (train_loss: {train_loss_epoch}, train error: {train_error_epoch}) , (test loss: {test_loss_epoch}, test error: {test_error_epoch})')
    return train_loss_epoch, train_error_epoch, test_loss_epoch, test_error_epoch

class Flatten(torch.nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)

def main():
    enable_cuda = True
    print('running main')
    num_workers = 0
    ''' Get Data set '''
    batch_size_test = 10000
    batch_size_train = 10000
    data_path = './data'
    transform = [transforms.ToTensor(),transforms.Normalize( (0.5, 0.5, 0.5), (0.5, 0.5, 0.5) )]
    transform = transforms.Compose(transform)
    trainset = torchvision.datasets.CIFAR10(root=data_path, train=True,download=False, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size_train,shuffle=True, num_workers=num_workers)
    testset = torchvision.datasets.CIFAR10(root=data_path, train=False,download=False, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size_test,shuffle=False, num_workers=num_workers)
    ''' Get model '''
    net = torch.nn.Sequential(
        torch.nn.Conv2d(3,13,5), #(in_channels, out_channels, kernel_size),
        Flatten(),
        torch.nn.Linear(28*28*13, 13),
        torch.nn.Linear(13, 10)
    )
    net.cuda()
    ''' Train '''
    nb_epochs = 10
    lr = 0.1
    err_criterion = error_criterion
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.0)
    train_and_track_stats(enable_cuda, nb_epochs, trainloader,testloader, net,optimizer,criterion,err_criterion, iterations=inf)
    ''' Done '''
    print('Done')

if __name__ == '__main__':
    main()

当我 运行 它时,我得到以下错误:

python my_cifar10.py
running main
[-1, -1], (train_loss: 2.3172860145568848, train error: 0.0054) , (test loss: 2.317185878753662, test error: 0.0038)
about to start training
[0, 5], (train_loss: 2.22599835395813, train error: 0.015160000000000002) , (test loss: 2.0623881816864014, test error: 0.0066)
[1, 5], (train_loss: 2.014406657218933, train error: 0.00896) , (test loss: 1.9619578123092651, test error: 0.0195)
[2, 5], (train_loss: 1.9428715705871582, train error: 0.01402) , (test loss: 1.918603539466858, test error: 0.0047)
[3, 5], (train_loss: 1.9434458494186402, train error: 0.01192) , (test loss: 1.9194672107696533, test error: 0.0125)
[4, 5], (train_loss: 1.8804980754852294, train error: 0.00794) , (test loss: 1.8549214601516724, test error: 0.004)
[5, 5], (train_loss: 1.8573726177215577, train error: 0.010159999999999999) , (test loss: 1.8625996112823486, test error: 0.0158)
[6, 5], (train_loss: 1.8454653739929199, train error: 0.01524) , (test loss: 1.8155865669250488, test error: 0.0122)
[7, 5], (train_loss: 1.8140610456466675, train error: 0.01066) , (test loss: 1.808283805847168, test error: 0.0101)
[8, 5], (train_loss: 1.8036894083023072, train error: 0.00832) , (test loss: 1.799634575843811, test error: 0.007)
[9, 5], (train_loss: 1.8023016452789307, train error: 0.0077399999999999995) , (test loss: 1.8030155897140503, test error: 0.0114)
Done

显然它一定是错的,因为对于一个小得离谱且简单的模型 (1 conv 2 fcs),测试误差几乎为零。

代码看起来太简单了,我不知道哪里出了问题。这几天我一直在做一些事情,改变一些事情。有什么新建议可以尝试吗?

如果您的批量大小太大,请使用您的代码

(max_indices == labels).sum()
(max_indices != labels).sum()

不加起来就是批量大小。这是因为您使用的 torch.ByteTensor 会在求和时溢出值 > 255。

使用

(max_indices != labels).int().sum()

将通过在求和之前将张量转换为 int 来解决问题。

作为 pytorch forum suggested 的一个解决方案,一个好主意是重铸事物:

def error_criterion(outputs,labels):
    max_vals, max_indices = torch.max(outputs,1)
    train_error = (max_indices != labels).float().sum()/max_indices.size()[0]
    return train_error

比较 (max_indices != labels) returns 和 torch.ByteTensor,使用您的 10000 批量大小可能会溢出。 添加一个 .float 到这一行 (max_indices != labels).float().sum() 似乎让事情消失了。

现在看起来一切正常:

$ python my_cifar10.py
running main
[-1, -1], (train_loss: 2.3301061153411866, train error: 0.9383399844169616) , (test loss: 2.3303537368774414, test error: 0.9396999478340149)
about to start training
[0, 5], (train_loss: 2.1861009120941164, train error: 0.8279399871826172) , (test loss: 2.044313907623291, test error: 0.7494999766349792)
[1, 5], (train_loss: 2.009986090660095, train error: 0.7244199872016907) , (test loss: 1.9966429471969604, test error: 0.7224999666213989)
[2, 5], (train_loss: 2.0178127527236938, train error: 0.712559974193573) , (test loss: 1.9238039255142212, test error: 0.6651999950408936)
[3, 5], (train_loss: 1.9113547801971436, train error: 0.6625399827957154) , (test loss: 1.8861572742462158, test error: 0.6486999988555908)
[4, 5], (train_loss: 1.8807836771011353, train error: 0.6485799789428711) , (test loss: 1.8632378578186035, test error: 0.6452999711036682)
[5, 5], (train_loss: 1.8648049116134644, train error: 0.6440199613571167) , (test loss: 1.875121831893921, test error: 0.652999997138977)
[6, 5], (train_loss: 1.8700860738754272, train error: 0.6511399745941162) , (test loss: 1.830731987953186, test error: 0.633899986743927)
[7, 5], (train_loss: 1.8432915449142455, train error: 0.6376399874687195) , (test loss: 1.8518757820129395, test error: 0.6491999626159668)
[8, 5], (train_loss: 1.830060887336731, train error: 0.634719979763031) , (test loss: 1.7997492551803589, test error: 0.6247000098228455)
[9, 5], (train_loss: 1.8012208938598633, train error: 0.6230199933052063) , (test loss: 1.8045140504837036, test error: 0.628600001335144)
Done