PyTorch loss value not changing
I wrote a module based on this article: http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
The idea is to pass the input through several parallel streams, then concatenate their outputs and feed them into a fully connected layer. I split the source code into three custom modules: TextClassifyCnnNet >> FlatCnnLayer >> FilterLayer.
FilterLayer:
# imports used throughout the snippets below
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


class FilterLayer(nn.Module):
    def __init__(self, filter_size, embedding_size, sequence_length, out_channels=128):
        super(FilterLayer, self).__init__()

        self.model = nn.Sequential(
            nn.Conv2d(1, out_channels, (filter_size, embedding_size)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d((sequence_length - filter_size + 1, 1), stride=1)
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))

    def forward(self, x):
        return self.model(x)
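Each convolution spans the full embedding dimension and the max-pool then collapses the remaining time axis, so every stream ends up as a single value per output channel. A quick shape check I did (sequence_length=56 and embedding_size=128 are just placeholder values):

layer = FilterLayer(filter_size=3, embedding_size=128, sequence_length=56)
dummy = Variable(torch.randn(4, 1, 56, 128))   # (batch, 1, sequence_length, embedding_size)
print(layer(dummy).size())                     # expected: (4, 128, 1, 1)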
FlatCnnLayer:
class FlatCnnLayer(nn.Module):
    def __init__(self, embedding_size, sequence_length, filter_sizes=[3, 4, 5], out_channels=128):
        super(FlatCnnLayer, self).__init__()

        self.filter_layers = nn.ModuleList(
            [FilterLayer(filter_size, embedding_size, sequence_length, out_channels=out_channels) for
             filter_size in filter_sizes])

    def forward(self, x):
        pools = []
        for filter_layer in self.filter_layers:
            out_filter = filter_layer(x)
            # reshape from (batch_size, out_channels, 1, 1) to (batch_size, 1, 1, out_channels)
            pools.append(out_filter.view(out_filter.size()[0], 1, 1, -1))
        x = torch.cat(pools, dim=3)

        x = x.view(x.size()[0], -1)
        x = F.dropout(x, p=dropout_prob, training=True)

        return x
TextClassifyCnnNet (main module):
class TextClassifyCnnNet(nn.Module):
    def __init__(self, embedding_size, sequence_length, num_classes, filter_sizes=[3, 4, 5], out_channels=128):
        super(TextClassifyCnnNet, self).__init__()

        self.flat_layer = FlatCnnLayer(embedding_size, sequence_length, filter_sizes=filter_sizes,
                                       out_channels=out_channels)

        self.model = nn.Sequential(
            self.flat_layer,
            nn.Linear(out_channels * len(filter_sizes), num_classes)
        )

    def forward(self, x):
        x = self.model(x)

        return x
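Putting the three modules together, this is the kind of quick smoke test I ran (all sizes and num_classes below are placeholders; dropout_prob is a module-level constant my script defines elsewhere):

dropout_prob = 0.5  # FlatCnnLayer.forward reads this module-level value

net = TextClassifyCnnNet(embedding_size=128, sequence_length=56, num_classes=5)
dummy = Variable(torch.randn(4, 1, 56, 128))   # (batch, 1, sequence_length, embedding_size)
logits = net(dummy)
print(logits.size())  # expected: (4, 5); raw scores, since F.cross_entropy applies log_softmax itself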
def fit(net, data, save_path):
    if torch.cuda.is_available():
        net = net.cuda()

    for param in list(net.parameters()):
        print(type(param.data), param.size())

    optimizer = optim.Adam(net.parameters(), lr=0.01, weight_decay=0.1)

    X_train, X_test = data['X_train'], data['X_test']
    Y_train, Y_test = data['Y_train'], data['Y_test']
    X_valid, Y_valid = data['X_valid'], data['Y_valid']

    n_batch = len(X_train) // batch_size

    for epoch in range(1, n_epochs + 1):  # loop over the dataset multiple times
        net.train()
        start = 0
        end = batch_size

        for batch_idx in range(1, n_batch + 1):
            # get the inputs
            x, y = X_train[start:end], Y_train[start:end]
            start = end
            end = start + batch_size

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            predicts = _get_predict(net, x)
            loss = _get_loss(predicts, y)
            loss.backward()
            optimizer.step()

            if batch_idx % display_step == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(x), len(X_train), 100. * batch_idx / (n_batch + 1), loss.data[0]))

        # print statistics
        if epoch % display_step == 0 or epoch == 1:
            net.eval()
            valid_predicts = _get_predict(net, X_valid)
            valid_loss = _get_loss(valid_predicts, Y_valid)
            valid_accuracy = _get_accuracy(valid_predicts, Y_valid)
            print('\r[%d] loss: %.3f - accuracy: %.2f' % (epoch, valid_loss.data[0], valid_accuracy * 100))

    print('\rFinished Training\n')

    net.eval()

    test_predicts = _get_predict(net, X_test)
    test_loss = _get_loss(test_predicts, Y_test).data[0]
    test_accuracy = _get_accuracy(test_predicts, Y_test)
    print('Test loss: %.3f - Test accuracy: %.2f' % (test_loss, test_accuracy * 100))

    torch.save(net.flat_layer.state_dict(), save_path)
def _get_accuracy(predicts, labels):
    predicts = torch.max(predicts, 1)[1].data[0]
    return np.mean(predicts == labels)
def _get_predict(net, x):
    # wrap them in Variable
    inputs = torch.from_numpy(x).float()
    # convert to cuda tensors if cuda flag is true
    if torch.cuda.is_available():
        inputs = inputs.cuda()
    inputs = Variable(inputs)
    return net(inputs)
def _get_loss(predicts, labels):
    labels = torch.from_numpy(labels).long()
    # convert to cuda tensors if cuda flag is true
    if torch.cuda.is_available():
        labels = labels.cuda()
    labels = Variable(labels)
    return F.cross_entropy(predicts, labels)
It seems the parameters are only updated very slightly each epoch, and the accuracy stays the same for the whole run, while the same implementation with the same hyperparameters in TensorFlow trains correctly.
I'm new to PyTorch, so there is probably something wrong in my code; please take a look. Thanks!
P.s.: I tried using F.nll_loss + F.log_softmax instead of F.cross_entropy. In theory they should return the same value, but in practice it prints a different result (and the loss value is still wrong).
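For reference, this is the equivalence I was expecting (a minimal check with random placeholder values):

logits = Variable(torch.randn(4, 5))
target = Variable(torch.LongTensor([0, 2, 1, 4]))

loss_a = F.cross_entropy(logits, target)
loss_b = F.nll_loss(F.log_softmax(logits, dim=1), target)
print(loss_a.data[0], loss_b.data[0])  # should match up to floating point noise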
I realized that it is the L2 regularization (weight_decay) in the Adam optimizer that keeps the loss value from changing (I haven't tried this with other optimizers yet). It works when I remove the L2 term:
# optimizer = optim.Adam(net.parameters(), lr=0.01, weight_decay=0.1)
optimizer = optim.Adam(model.parameters(), lr=0.001)
=== UPDATE (see the answer below for details!) ===
self.features = nn.Sequential(self.flat_layer)
self.classifier = nn.Linear(out_channels * len(filter_sizes), num_classes)

...

optimizer = optim.Adam([
    {'params': model.features.parameters()},
    {'params': model.classifier.parameters(), 'weight_decay': 0.1}
], lr=0.001)
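The idea here is that weight decay is applied only to the final linear classifier, while the convolutional feature extractor gets no L2 penalty.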
I see that in your original code the weight_decay term is set to 0.1. weight_decay is used to regularize the network's parameters, and a value this large is probably so strong that it over-regularizes the model. Try decreasing the value of weight_decay.
For convolutional neural networks in computer vision tasks, weight_decay is usually set to 5e-4 or 5e-5. I am not familiar with text classification; these values may work out of the box, or you may have to tune them a bit by trial and error.
Let me know if it works for you.
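For example, something along these lines (the exact value still needs tuning for your task):

# same as your original optimizer, only with a much weaker L2 penalty; try 5e-4 or 5e-5 instead of 0.1
optimizer = optim.Adam(net.parameters(), lr=0.01, weight_decay=5e-4)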
In my case I ran into the same problem. On my laptop without a GPU, training was fine; when I tried it on a GPU, the model did not change its accuracy or loss after the first epoch. I was using nn.CrossEntropyLoss() with Adam.
Switching from Adam to SGD worked for me.
I'm sharing this in case anyone else runs into the same problem.
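In case it helps, the change was just swapping the optimizer (the learning rate and momentum below are only what I happened to use, not a recommendation):

# optimizer = optim.Adam(net.parameters(), lr=0.001)
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)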