MLP classifier in Theano settles at a local minimum
I have written an MLP classifier in Theano. The training function, which uses the backpropagation algorithm, is as follows:
self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
self.layers=network.layers
self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
prediction=T.dmatrix()
output=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(prediction,output).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)
I tried to train the classifier on the XOR problem. The setup is:
network=FeedForwardNetwork([2,2,2])
network.initialize()
network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),alpha=0.01,epochs=1000000000000000,momentum=0.9)
print network.predict(numpy.array([[1.,0.]]))
print network.predict(numpy.array([[0.,0.]]))
The initialize() method just compiles all the functions in the backend: the backpropagation function, the forward-pass function that computes predictions, and a few other Theano functions. Now, when I run this code, the training settles at a local minimum.
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
Training starts with a cost of about 0.92. It steadily decreases to the value above and then stays there. I have tried varying the values of alpha and momentum. What am I doing wrong?
P.S. The complete code is here:
networks.py
import theano
import theano.tensor as T
import numpy
from layers import *
from backend import NetworkBackend
class Network:
    def __init__(self,architecture):
        self.architecture=architecture
        self.layers=[]
        self.weights=[]
        self.bias=[]

    def __str__(self):
        banner=''
        for i in range(len(self.weights)):
            banner+=str(self.weights[i])+'\n'
            banner+=str(self.bias[i])+'\n'
        return banner

class FeedForwardNetwork(Network):
    def initialize(self):
        self.layers.append(InputLayer(units=self.architecture[0]))
        for i in range(1,len(self.architecture[:-1])):
            self.layers.append(SigmoidLayer(units=self.architecture[i]))
        self.layers.append(SoftmaxLayer(units=self.architecture[-1]))
        self.backend=NetworkBackend(self)

    def predict(self,inputs):
        return self.backend.activate(inputs)

    def train(self,X,y,alpha=100,reg_lambda=0.0001,epochs=10000,momentum=0.9):
        cost=1
        while cost>0.01 and epochs:
            prediction=self.predict(X)
            cost=self.backend.backprop(prediction,y,reg_lambda,alpha,momentum)
            print cost
            epochs-=1

if __name__=='__main__':
    network=FeedForwardNetwork([2,2,2])
    network.initialize()
    network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),alpha=0.01,epochs=1000000000000000,momentum=0.9)
    print network.predict(numpy.array([[1.,0.]]))
    print network.predict(numpy.array([[0.,0.]]))
layers.py
import theano
import theano.tensor as T
import scipy
from backend import ComputationBackend
class Layer:
    def __init__(self,units):
        self.units=units
        self.backend=ComputationBackend()

    def __str__(self):
        banner=self.__class__.__name__
        banner+=" Units:%d"%self.units
        return banner

class SigmoidLayer(Layer):
    def forwardPass(self,inputs):
        return self.backend.sigmoid(inputs)

class InputLayer(Layer):
    def forwardPass(self,inputs):
        return inputs

class SoftmaxLayer(Layer):
    def forwardPass(self,inputs):
        return self.backend.softmax(inputs)
backend.py
import theano
import theano.tensor as T
import numpy
class NetworkBackend:
    def __init__(self,network):
        # initialize shared variables
        self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
        self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
        self.layers=network.layers
        self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]

        # activation for network layers
        inputs=T.dmatrix()
        temp=self.layers[0].forwardPass(inputs)
        for i in range(1,len(self.layers[:-1])):
            temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
        output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
        self.activate=theano.function([inputs],output)

        prediction=T.dmatrix()
        output=T.dmatrix()
        reg_lambda=T.dscalar()
        alpha=T.dscalar()
        momentum=T.dscalar()
        cost=T.nnet.categorical_crossentropy(prediction,output).mean()
        for i,j in zip(self.weights,self.bias):
            cost+=T.sum(i**2)*reg_lambda
            cost+=T.sum(j**2)*reg_lambda
        parameters=self.weights+self.bias
        rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
        updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
        self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)

class ComputationBackend:
    def __init__(self):
        # sigmoid activation
        self.sigmoid=T.nnet.sigmoid
        # softmax activation
        self.softmax=T.nnet.softmax
This is probably caused by the parameter initialization. The following code sample implements a basic XOR learner using a neural network with a single hidden layer.
import numpy
import theano
import theano.tensor as tt
def compile(input_size, hidden_size):
    w_h = theano.shared(numpy.random.standard_normal(size=(input_size, hidden_size)).astype(theano.config.floatX))
    b_h = theano.shared(numpy.zeros((hidden_size,), dtype=theano.config.floatX))
    w_y = theano.shared(numpy.zeros((hidden_size,), dtype=theano.config.floatX))
    b_y = theano.shared(numpy.zeros(1, dtype=theano.config.floatX), broadcastable=(True,))
    x = tt.matrix()
    z = tt.ivector()
    learning_rate = tt.scalar()
    h = tt.tanh(tt.dot(x, w_h) + b_h)
    y = tt.nnet.sigmoid(tt.dot(h, w_y) + b_y)
    cost = tt.nnet.binary_crossentropy(y, z).mean()
    updates = [(p, p - learning_rate * tt.grad(cost, p)) for p in [w_h, b_h, w_y, b_y]]
    return theano.function([x, z, learning_rate], outputs=cost, updates=updates), theano.function([x], outputs=y)

def main():
    numpy.random.seed(5)
    train, test = compile(2, 2)
    for _ in xrange(100000):
        print train([[1, 1], [1, 0], [0, 1], [0, 0]], [0, 1, 1, 0], 0.1)
    print test([[1, 1], [1, 0], [0, 1], [0, 0]])

main()
Note the seed value for the random number generator. With a seed of 5, the learner converges to a good solution and appears to be heading toward a perfect one given enough time. However, with a seed of 1, the network gets stuck in a local optimum: it can discriminate in the second dimension but not in the first.
Different random initialization methods might yield better results, i.e. be less sensitive to the seed of the RNG.
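As one concrete possibility (my own illustration, not something prescribed in the answer above; the helper name glorot_uniform is made up), a Glorot/Xavier-style scaled uniform initialization could replace the standard_normal draw for w_h and tends to be less sensitive to the seed:
import numpy
import theano

def glorot_uniform(fan_in, fan_out):
    # Hypothetical helper: Glorot/Xavier-style uniform initialization.
    # Weights are drawn from U(-limit, limit) with limit = sqrt(6 / (fan_in + fan_out)),
    # which keeps activation variance roughly constant across layers.
    limit = numpy.sqrt(6.0 / (fan_in + fan_out))
    values = numpy.random.uniform(-limit, limit, size=(fan_in, fan_out))
    return theano.shared(values.astype(theano.config.floatX))

# e.g. inside compile(), instead of the standard_normal initialization:
# w_h = glorot_uniform(input_size, hidden_size)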
Finally figured it out! In the NetworkBackend, while computing the cost I was taking the cross-entropy between the expected output and a prediction passed in as an argument to the Theano function, instead of the prediction computed by the activate function. The Theano graph therefore did not contain the forward pass, so theano.tensor.grad only found the gradient of the regularization term, not of the actual cost function! The correct implementation is:
inputs=T.dmatrix()
temp=self.layers[0].forwardPass(inputs)
for i in range(1,len(self.layers[:-1])):
    temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
self.activate=theano.function([inputs],output)

label=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(output,label).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([inputs,label,reg_lambda,alpha,momentum],cost,updates=updates)
So instead of declaring a new matrix for the predictions, I take the inputs and compute the predictions inside the training function, using the same equations as in the activation function. This completes the Theano graph, and theano.tensor.grad() now computes the gradient of the cost function along with the regularization.
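To make the difference concrete, here is a minimal, self-contained sketch (the toy variable names and shapes are mine, not taken from the code above) comparing a cost built on a free prediction input with one built on an output computed inside the graph:
import numpy
import theano
import theano.tensor as T

w = theano.shared(numpy.random.random((2, 2)))   # toy weight matrix
label = T.dmatrix()                              # one-hot targets
reg_lambda = T.dscalar()

# Broken wiring: the prediction is a free input variable, so the only path
# from the cost back to w is through the regularization term.
prediction = T.dmatrix()
broken_cost = T.nnet.categorical_crossentropy(prediction, label).mean() + reg_lambda * T.sum(w ** 2)
broken_grad = T.grad(broken_cost, w)             # reduces to 2 * reg_lambda * w

# Correct wiring: the prediction is computed from the inputs and w inside the
# graph, so the cross-entropy term contributes to the gradient as well.
x = T.dmatrix()
output = T.nnet.softmax(T.dot(x, w.transpose()))
fixed_cost = T.nnet.categorical_crossentropy(output, label).mean() + reg_lambda * T.sum(w ** 2)
fixed_grad = T.grad(fixed_cost, w)

f_broken = theano.function([prediction, label, reg_lambda], [broken_cost, broken_grad])
f_fixed = theano.function([x, label, reg_lambda], [fixed_cost, fixed_grad])

X = numpy.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
Y = numpy.array([[0., 1.], [1., 0.], [1., 0.], [0., 1.]])
P = numpy.full((4, 2), 0.5)                      # a uniform "prediction"

print f_broken(P, Y, 0.0001)   # cost is about ln(2); gradient is just 2*reg_lambda*w
print f_fixed(X, Y, 0.0001)    # gradient now depends on the training data
This also explains the plateau in the output above: with the cross-entropy disconnected from the parameters, only the L2 penalty drives the updates, the weights shrink toward zero, the softmax then outputs roughly 0.5 for each of the two classes, and the mean cross-entropy sits at -ln(0.5) ≈ 0.6931, which is exactly the value the training got stuck at.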