为什么这个神经网络在 MNIST 上表现不佳?
Why is this neural network performing poorly on MNIST?
你好,我正在用 PyTorch 构建一个神经网络来对 MNIST 进行分类,但我怎么也找不出为什么这个网络的准确率无法超过 7%。如能给予任何指导,不胜感激。
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from keras.datasets import mnist
from keras.utils.np_utils import to_categorical
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
# Fetch the MNIST train/test arrays through the Keras dataset helper.
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Cast to float32 and divide by 255, then merge axes 1 and 2 of every sample
# into a single feature axis so that each sample becomes one flat row.
X_train = (X_train.astype("float32") / 255).reshape(len(X_train), -1)
X_test = (X_test.astype("float32") / 255).reshape(len(X_test), -1)
class Net(torch.nn.Module):
    """Fully connected classifier: 784 -> 128 -> 64 -> 10, ReLU between layers.

    ``forward`` returns raw, unnormalised class scores (logits). No softmax is
    applied here on purpose: ``torch.nn.CrossEntropyLoss`` already applies
    log-softmax internally, and a softmax over ``dim=0`` would normalise
    across the samples of a batch instead of across the classes. Taking the
    arg-max of the logits yields the predicted class.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.lin_1 = nn.Linear(784, 128)
        self.lin_2 = nn.Linear(128, 64)
        self.lin_3 = nn.Linear(64, 10)

    def forward(self, x):
        """Map inputs whose last dimension is 784 to 10 class logits.

        Args:
            x: float tensor of shape ``(784,)`` or ``(batch, 784)``.

        Returns:
            Tensor of shape ``(10,)`` or ``(batch, 10)`` holding the logits.
        """
        x = torch.relu(self.lin_1(x))
        x = torch.relu(self.lin_2(x))
        return self.lin_3(x)
net = Net()
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Tensor.to() returns a new tensor instead of moving the tensor in place, so
# the result has to be assigned; a bare `X_train.to(device)` has no effect.
X_train = torch.from_numpy(X_train).to(device)
X_test = torch.from_numpy(X_test).to(device)
# The labels are cast to long, the dtype used for class-index targets.
y_train = torch.from_numpy(Y_train).type(torch.long).to(device)
y_test = torch.from_numpy(Y_test).type(torch.long).to(device)

# Module.to() moves the parameters in place, so the optimizer created above
# keeps referring to the right tensors.
net.to(device)
loss.to(device)

net.train()
for epoch in range(10):
    # The whole training set is fed as a single batch, so every epoch performs
    # exactly one parameter update (10 updates in total).
    pred = net(X_train)
    train_loss = loss(pred, y_train)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

net.eval()
# No gradients are needed for evaluation; skip building the autograd graph.
with torch.no_grad():
    pred = torch.max(net(X_test), 1)[1]
print('The accuracy for pytorch is ', accuracy_score(y_test.cpu().numpy(), pred.cpu().numpy()))
我觉得必须以某种方式对数据进行转换,所以我将训练和测试数据除以 255;另外,网络期望输入为 float 类型、输出(标签)为 long 类型。
这是我在没有pytorch的情况下制作的numpy版本
from keras.datasets import mnist
from keras.utils.np_utils import to_categorical
import numpy as np

# Fetch the MNIST train/test arrays through the Keras dataset helper.
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Cast to float32 and divide by 255, then merge axes 1 and 2 of every sample
# into a single feature axis so that each sample becomes one flat row.
X_train = (X_train.astype("float32") / 255).reshape(len(X_train), -1)
X_test = (X_test.astype("float32") / 255).reshape(len(X_test), -1)

# Encode the label arrays with Keras' to_categorical (one row per sample).
Y_train, Y_test = to_categorical(Y_train), to_categorical(Y_test)
print(Y_test.shape)
class DNN():
    """Bias-free feed-forward network (ReLU, ReLU, softmax) written in NumPy.

    The network is trained one sample at a time with plain gradient descent.
    ``self.params`` stores the three weight matrices ("W1", "W2", "W3") and,
    after every call to ``forward``, the cached activations ("X0".."X3") and
    pre-activations ("Z1".."Z3") that ``backpropagation`` reads.
    """

    def __init__(self, sizes, epochs=10, lr = 0.01):
        """Store the hyper-parameters and draw the initial weights.

        Args:
            sizes: four layer widths ``[input, hidden_1, hidden_2, output]``.
            epochs: number of passes over the training data in ``train``.
            lr: step size used by ``updateParams``.
        """
        self.sizes = sizes
        self.epochs = epochs
        self.lr = lr
        self.params = self.initialization()

    def ReLu(self, x, derivative=False):
        """Return ReLU(x), or its derivative (1 where x > 0, else 0)."""
        if derivative:
            return 1. * (x > 0)
        return x * (x > 0)

    def softmax(self, x, derivative=False):
        """Return softmax(x), or the element-wise term ``s * (1 - s)``.

        With ``derivative=True`` only the diagonal of the softmax Jacobian is
        returned; the off-diagonal terms are not computed.
        """
        # Subtracting the maximum keeps np.exp from overflowing and does not
        # change the normalised result.
        exps = np.exp(x - x.max())
        probs = exps / np.sum(exps, axis=0)
        if derivative:
            return probs * (1 - probs)
        return probs

    def initialization(self):
        """Draw random normal weights, scaled by sqrt(1 / rows of the matrix)."""
        # number of nodes in each layer
        input_layer, hidden_1, hidden_2, output_layer = (
            self.sizes[0], self.sizes[1], self.sizes[2], self.sizes[3]
        )

        # Keep the W1, W2, W3 draw order so a seeded run gives the same weights.
        params = {
            "W1": np.random.randn(hidden_1, input_layer) * np.sqrt(1. / hidden_1),
            "W2": np.random.randn(hidden_2, hidden_1) * np.sqrt(1. / hidden_2),
            "W3": np.random.randn(output_layer, hidden_2) * np.sqrt(1. / output_layer),
        }
        return params

    def forward(self, X_train):
        """Run one sample (1-D vector) through the network.

        Every intermediate result is cached in ``self.params`` for the
        backward pass. Returns the softmax output vector.
        """
        cache = self.params
        cache["X0"] = X_train

        cache["Z1"] = np.dot(cache["W1"], cache["X0"])
        cache["X1"] = self.ReLu(cache["Z1"])

        cache["Z2"] = np.dot(cache["W2"], cache["X1"])
        cache["X2"] = self.ReLu(cache["Z2"])

        cache["Z3"] = np.dot(cache["W3"], cache["X2"])
        cache["X3"] = self.softmax(cache["Z3"])
        return cache["X3"]

    def backpropagation(self, Y_train, output):
        """Compute the weight gradients for the sample last given to ``forward``.

        Args:
            Y_train: target vector for that sample (same length as ``output``).
            output: the vector returned by ``forward``.

        Returns:
            dict mapping "W1", "W2", "W3" to arrays shaped like the weights.
        """
        cache = self.params
        update = {}

        # Output layer. NOTE(review): 2 * (output - target) / n looks like the
        # gradient of a mean-squared-error loss - confirm that is intended.
        error = (
            2 * (output - Y_train) / output.shape[0]
            * self.softmax(cache["Z3"], derivative=True)
        )
        update["W3"] = np.outer(error, cache["X2"])

        # Second hidden layer.
        error = np.dot(cache["W3"].T, error) * self.ReLu(cache["Z2"], derivative=True)
        update["W2"] = np.outer(error, cache["X1"])

        # First hidden layer.
        error = np.dot(cache["W2"].T, error) * self.ReLu(cache["Z1"], derivative=True)
        update["W1"] = np.outer(error, cache["X0"])

        return update

    def updateParams(self, update):
        """Apply one gradient-descent step: ``param -= lr * gradient``."""
        for key, value in update.items():
            self.params[key] -= self.lr * value

    def test_accuracy(self, X_test, Y_train):
        """Return the fraction of samples whose predicted class is correct.

        Args:
            X_test: sequence of input vectors.
            Y_train: label rows for ``X_test`` (despite the parameter name);
                the arg-max of each row is taken as the true class.
        """
        predictions = []
        for i, sample in enumerate(X_test):
            pred = np.argmax(self.forward(sample))
            predictions.append(pred == np.argmax(Y_train[i]))
        return np.mean(predictions)

    def train(self, X_train, Y_train):
        """Train for ``self.epochs`` passes, updating after every sample."""
        for epoch in range(self.epochs):
            print("epoch ", epoch)
            for i, sample in enumerate(X_train):
                output = self.forward(sample)
                update = self.backpropagation(Y_train[i], output)
                self.updateParams(update)
# Build the NumPy network (layer widths 784, 200, 50, 10), train it for ten
# epochs and report the accuracy measured on the test split.
dnn = DNN(
    sizes=[784, 200, 50, 10],
    epochs=10,
)
dnn.train(X_train, Y_train)
print(
    "The accuracy of the numpy network on the test dataset is ",
    dnn.test_accuracy(X_test, Y_test),
)
好吧,我可以立即看出您提供的代码存在几个问题:
请查看 PyTorch 交叉熵损失函数的文档(documentation for PyTorch's cross entropy loss function)。阅读后您会注意到,torch.nn.CrossEntropyLoss 在内部已经执行了 softmax 函数。这意味着如果您使用 nn.CrossEntropyLoss,就不应该再使用 torch.softmax 作为输出层的激活函数。如果出于某种原因您想在输出层使用 softmax,则应考虑改用 nn.NLLLoss(注意 nn.NLLLoss 期望的输入是对数概率,即 log_softmax 的输出)。从我在下面发布的图片可以看到,只需删除 x = torch.softmax(x, dim=0),损失就会下降;而保留它则会使损失几乎保持不变(因此效果很差)。
您训练的轮数太少了。我尝试将您的代码运行 3,000 个 epoch 而不是 10 个,最终准确率为 0.9028,而原来是 0.1038。您还可以看到,与原始实现(第二张图)相比,损失值下降了很多。
编辑
查看您的 NumPy 代码后,问题变得更加清晰。我的第二点本质上仍然成立:您的模型训练得还不够。我在上面使用“epoch(轮)”这个术语并不准确,我真正想说的是“step(参数更新步数)”。
如果您查看 NumPy 代码,会发现有两个 for 循环:外层循环遍历 epoch,内层循环遍历训练数据。您显然是在以批大小为 1(每次只用一个样本)的方式训练 10 个 epoch。这意味着在整个过程中,模型参数总共更新 600,000 次(60,000 个训练样本 × 10 个 epoch)。而在您的 PyTorch 代码中,您把整个训练集作为一个批次输入,并训练 10 个 epoch,这意味着参数只更新了 10 次。
如果您将 PyTorch 代码修改为:
for epoch in range(10):
    net.train()
    # One optimizer step per training sample, mirroring the NumPy version's
    # inner loop over the training data.
    for sample, target in zip(X_train, y_train):
        prediction = net(sample.to(device))
        # unsqueeze(0) adds a leading batch axis of size 1 to both the scores
        # and the target before they are handed to the loss.
        train_loss = loss(prediction.unsqueeze(0), target.unsqueeze(0).to(device))
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    # Evaluate on the test split after every epoch; no gradients are needed.
    net.eval()
    with torch.no_grad():
        prediction = torch.max(net(X_test.to(device)), 1)[1]
    accuracy = accuracy_score(y_test.cpu().numpy(), prediction.cpu().numpy())
    print(f"Epoch {epoch + 1} test accuracy is {accuracy}.")
然后你会注意到模型只需要两个 epoch 就可以达到 96% 的准确率。
你好,我正在用 PyTorch 构建一个神经网络来对 MNIST 进行分类,但我怎么也找不出为什么这个网络的准确率无法超过 7%。如能给予任何指导,不胜感激。
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from keras.datasets import mnist
from keras.utils.np_utils import to_categorical
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
# Fetch the MNIST train/test arrays through the Keras dataset helper.
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Cast to float32 and divide by 255, then merge axes 1 and 2 of every sample
# into a single feature axis so that each sample becomes one flat row.
X_train = (X_train.astype("float32") / 255).reshape(len(X_train), -1)
X_test = (X_test.astype("float32") / 255).reshape(len(X_test), -1)
class Net(torch.nn.Module):
    """Fully connected classifier: 784 -> 128 -> 64 -> 10, ReLU between layers.

    ``forward`` returns raw, unnormalised class scores (logits). No softmax is
    applied here on purpose: ``torch.nn.CrossEntropyLoss`` already applies
    log-softmax internally, and a softmax over ``dim=0`` would normalise
    across the samples of a batch instead of across the classes. Taking the
    arg-max of the logits yields the predicted class.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.lin_1 = nn.Linear(784, 128)
        self.lin_2 = nn.Linear(128, 64)
        self.lin_3 = nn.Linear(64, 10)

    def forward(self, x):
        """Map inputs whose last dimension is 784 to 10 class logits.

        Args:
            x: float tensor of shape ``(784,)`` or ``(batch, 784)``.

        Returns:
            Tensor of shape ``(10,)`` or ``(batch, 10)`` holding the logits.
        """
        x = torch.relu(self.lin_1(x))
        x = torch.relu(self.lin_2(x))
        return self.lin_3(x)
net = Net()
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Tensor.to() returns a new tensor instead of moving the tensor in place, so
# the result has to be assigned; a bare `X_train.to(device)` has no effect.
X_train = torch.from_numpy(X_train).to(device)
X_test = torch.from_numpy(X_test).to(device)
# The labels are cast to long, the dtype used for class-index targets.
y_train = torch.from_numpy(Y_train).type(torch.long).to(device)
y_test = torch.from_numpy(Y_test).type(torch.long).to(device)

# Module.to() moves the parameters in place, so the optimizer created above
# keeps referring to the right tensors.
net.to(device)
loss.to(device)

net.train()
for epoch in range(10):
    # The whole training set is fed as a single batch, so every epoch performs
    # exactly one parameter update (10 updates in total).
    pred = net(X_train)
    train_loss = loss(pred, y_train)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

net.eval()
# No gradients are needed for evaluation; skip building the autograd graph.
with torch.no_grad():
    pred = torch.max(net(X_test), 1)[1]
print('The accuracy for pytorch is ', accuracy_score(y_test.cpu().numpy(), pred.cpu().numpy()))
我觉得必须以某种方式对数据进行转换,所以我将训练和测试数据除以 255;另外,网络期望输入为 float 类型、输出(标签)为 long 类型。
这是我在没有pytorch的情况下制作的numpy版本
from keras.datasets import mnist
from keras.utils.np_utils import to_categorical
import numpy as np

# Fetch the MNIST train/test arrays through the Keras dataset helper.
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Cast to float32 and divide by 255, then merge axes 1 and 2 of every sample
# into a single feature axis so that each sample becomes one flat row.
X_train = (X_train.astype("float32") / 255).reshape(len(X_train), -1)
X_test = (X_test.astype("float32") / 255).reshape(len(X_test), -1)

# Encode the label arrays with Keras' to_categorical (one row per sample).
Y_train, Y_test = to_categorical(Y_train), to_categorical(Y_test)
print(Y_test.shape)
class DNN():
    """Bias-free feed-forward network (ReLU, ReLU, softmax) written in NumPy.

    The network is trained one sample at a time with plain gradient descent.
    ``self.params`` stores the three weight matrices ("W1", "W2", "W3") and,
    after every call to ``forward``, the cached activations ("X0".."X3") and
    pre-activations ("Z1".."Z3") that ``backpropagation`` reads.
    """

    def __init__(self, sizes, epochs=10, lr = 0.01):
        """Store the hyper-parameters and draw the initial weights.

        Args:
            sizes: four layer widths ``[input, hidden_1, hidden_2, output]``.
            epochs: number of passes over the training data in ``train``.
            lr: step size used by ``updateParams``.
        """
        self.sizes = sizes
        self.epochs = epochs
        self.lr = lr
        self.params = self.initialization()

    def ReLu(self, x, derivative=False):
        """Return ReLU(x), or its derivative (1 where x > 0, else 0)."""
        if derivative:
            return 1. * (x > 0)
        return x * (x > 0)

    def softmax(self, x, derivative=False):
        """Return softmax(x), or the element-wise term ``s * (1 - s)``.

        With ``derivative=True`` only the diagonal of the softmax Jacobian is
        returned; the off-diagonal terms are not computed.
        """
        # Subtracting the maximum keeps np.exp from overflowing and does not
        # change the normalised result.
        exps = np.exp(x - x.max())
        probs = exps / np.sum(exps, axis=0)
        if derivative:
            return probs * (1 - probs)
        return probs

    def initialization(self):
        """Draw random normal weights, scaled by sqrt(1 / rows of the matrix)."""
        # number of nodes in each layer
        input_layer, hidden_1, hidden_2, output_layer = (
            self.sizes[0], self.sizes[1], self.sizes[2], self.sizes[3]
        )

        # Keep the W1, W2, W3 draw order so a seeded run gives the same weights.
        params = {
            "W1": np.random.randn(hidden_1, input_layer) * np.sqrt(1. / hidden_1),
            "W2": np.random.randn(hidden_2, hidden_1) * np.sqrt(1. / hidden_2),
            "W3": np.random.randn(output_layer, hidden_2) * np.sqrt(1. / output_layer),
        }
        return params

    def forward(self, X_train):
        """Run one sample (1-D vector) through the network.

        Every intermediate result is cached in ``self.params`` for the
        backward pass. Returns the softmax output vector.
        """
        cache = self.params
        cache["X0"] = X_train

        cache["Z1"] = np.dot(cache["W1"], cache["X0"])
        cache["X1"] = self.ReLu(cache["Z1"])

        cache["Z2"] = np.dot(cache["W2"], cache["X1"])
        cache["X2"] = self.ReLu(cache["Z2"])

        cache["Z3"] = np.dot(cache["W3"], cache["X2"])
        cache["X3"] = self.softmax(cache["Z3"])
        return cache["X3"]

    def backpropagation(self, Y_train, output):
        """Compute the weight gradients for the sample last given to ``forward``.

        Args:
            Y_train: target vector for that sample (same length as ``output``).
            output: the vector returned by ``forward``.

        Returns:
            dict mapping "W1", "W2", "W3" to arrays shaped like the weights.
        """
        cache = self.params
        update = {}

        # Output layer. NOTE(review): 2 * (output - target) / n looks like the
        # gradient of a mean-squared-error loss - confirm that is intended.
        error = (
            2 * (output - Y_train) / output.shape[0]
            * self.softmax(cache["Z3"], derivative=True)
        )
        update["W3"] = np.outer(error, cache["X2"])

        # Second hidden layer.
        error = np.dot(cache["W3"].T, error) * self.ReLu(cache["Z2"], derivative=True)
        update["W2"] = np.outer(error, cache["X1"])

        # First hidden layer.
        error = np.dot(cache["W2"].T, error) * self.ReLu(cache["Z1"], derivative=True)
        update["W1"] = np.outer(error, cache["X0"])

        return update

    def updateParams(self, update):
        """Apply one gradient-descent step: ``param -= lr * gradient``."""
        for key, value in update.items():
            self.params[key] -= self.lr * value

    def test_accuracy(self, X_test, Y_train):
        """Return the fraction of samples whose predicted class is correct.

        Args:
            X_test: sequence of input vectors.
            Y_train: label rows for ``X_test`` (despite the parameter name);
                the arg-max of each row is taken as the true class.
        """
        predictions = []
        for i, sample in enumerate(X_test):
            pred = np.argmax(self.forward(sample))
            predictions.append(pred == np.argmax(Y_train[i]))
        return np.mean(predictions)

    def train(self, X_train, Y_train):
        """Train for ``self.epochs`` passes, updating after every sample."""
        for epoch in range(self.epochs):
            print("epoch ", epoch)
            for i, sample in enumerate(X_train):
                output = self.forward(sample)
                update = self.backpropagation(Y_train[i], output)
                self.updateParams(update)
# Build the NumPy network (layer widths 784, 200, 50, 10), train it for ten
# epochs and report the accuracy measured on the test split.
dnn = DNN(
    sizes=[784, 200, 50, 10],
    epochs=10,
)
dnn.train(X_train, Y_train)
print(
    "The accuracy of the numpy network on the test dataset is ",
    dnn.test_accuracy(X_test, Y_test),
)
好吧,我可以立即看出您提供的代码存在几个问题:
请查看 PyTorch 交叉熵损失函数的文档(documentation for PyTorch's cross entropy loss function)。阅读后您会注意到,torch.nn.CrossEntropyLoss 在内部已经执行了 softmax 函数。这意味着如果您使用 nn.CrossEntropyLoss,就不应该再使用 torch.softmax 作为输出层的激活函数。如果出于某种原因您想在输出层使用 softmax,则应考虑改用 nn.NLLLoss(注意 nn.NLLLoss 期望的输入是对数概率,即 log_softmax 的输出)。从我在下面发布的图片可以看到,只需删除 x = torch.softmax(x, dim=0),损失就会下降;而保留它则会使损失几乎保持不变(因此效果很差)。
您训练的轮数太少了。我尝试将您的代码运行 3,000 个 epoch 而不是 10 个,最终准确率为 0.9028,而原来是 0.1038。您还可以看到,与原始实现(第二张图)相比,损失值下降了很多。
编辑
查看您的 NumPy 代码后,问题变得更加清晰。我的第二点本质上仍然成立:您的模型训练得还不够。我在上面使用“epoch(轮)”这个术语并不准确,我真正想说的是“step(参数更新步数)”。
如果您查看 NumPy 代码,会发现有两个 for 循环:外层循环遍历 epoch,内层循环遍历训练数据。您显然是在以批大小为 1(每次只用一个样本)的方式训练 10 个 epoch。这意味着在整个过程中,模型参数总共更新 600,000 次(60,000 个训练样本 × 10 个 epoch)。而在您的 PyTorch 代码中,您把整个训练集作为一个批次输入,并训练 10 个 epoch,这意味着参数只更新了 10 次。
如果您将 PyTorch 代码修改为:
for epoch in range(10):
    net.train()
    # One optimizer step per training sample, mirroring the NumPy version's
    # inner loop over the training data.
    for sample, target in zip(X_train, y_train):
        prediction = net(sample.to(device))
        # unsqueeze(0) adds a leading batch axis of size 1 to both the scores
        # and the target before they are handed to the loss.
        train_loss = loss(prediction.unsqueeze(0), target.unsqueeze(0).to(device))
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    # Evaluate on the test split after every epoch; no gradients are needed.
    net.eval()
    with torch.no_grad():
        prediction = torch.max(net(X_test.to(device)), 1)[1]
    accuracy = accuracy_score(y_test.cpu().numpy(), prediction.cpu().numpy())
    print(f"Epoch {epoch + 1} test accuracy is {accuracy}.")
然后你会注意到模型只需要两个 epoch 就可以达到 96% 的准确率。