MLP classifier in Theano settles at local minima

I have written an MLP classifier in Theano. The training function, which uses the backpropagation algorithm, is as follows:

self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
self.layers=network.layers
self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
prediction=T.dmatrix()
output=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(prediction,output).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)

I tried training the classifier on the XOR problem. The implementation is:

network=FeedForwardNetwork([2,2,2])
network.initialize()
network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),alpha=0.01,epochs=1000000000000000,momentum=0.9)
print network.predict(numpy.array([[1.,0.]]))
print network.predict(numpy.array([[0.,0.]]))

The initialize() method just compiles all of the functions in the backend, i.e. the backpropagation function, a forward-pass function for computing predictions, and a few other Theano functions. Now, when I run this code, the training settles at a local minimum.

0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056

When training starts, the loss is about 0.92. It steadily decreases to the value above and stops there. I have tried varying the values of alpha and momentum. What am I doing wrong?
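(As a side note, the value it settles at looks like exactly ln 2, i.e. the mean categorical cross-entropy of a network that always predicts a uniform [0.5, 0.5] over the two classes; a quick check with numpy:)

import numpy
print(numpy.log(2))       # 0.69314718056...
print(-numpy.log(0.5))    # same value: cross-entropy of a uniform two-class prediction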

P.S. The whole code is here:

networks.py

import theano
import theano.tensor as T
import numpy
from layers import *
from backend import NetworkBackend

class Network:

    def __init__(self,architecture):
        self.architecture=architecture
        self.layers=[]
        self.weights=[]
        self.bias=[]

    def __str__(self):
        banner=''
        for i in range(len(self.weights)):
            banner+=str(self.weights[i])+'\n'
            banner+=str(self.bias[i])+'\n'
        return banner

class FeedForwardNetwork(Network):

    def initialize(self):
        self.layers.append(InputLayer(units=self.architecture[0]))
        for i in range(1,len(self.architecture[:-1])):
            self.layers.append(SigmoidLayer(units=self.architecture[i]))
        self.layers.append(SoftmaxLayer(units=self.architecture[-1]))
        self.backend=NetworkBackend(self)

    def predict(self,inputs):
        return self.backend.activate(inputs)

    def train(self,X,y,alpha=100,reg_lambda=0.0001,epochs=10000,momentum=0.9):
        cost=1
        while cost>0.01 and epochs:
            prediction=self.predict(X)
            cost=self.backend.backprop(prediction,y,reg_lambda,alpha,momentum)
            print cost
            epochs-=1


if __name__=='__main__':
    network=FeedForwardNetwork([2,2,2])
    network.initialize()
    network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),alpha=0.01,epochs=1000000000000000,momentum=0.9)
    print network.predict(numpy.array([[1.,0.]]))
    print network.predict(numpy.array([[0.,0.]]))

layers.py

import theano
import theano.tensor as T
import scipy
from backend import ComputationBackend

class Layer:

    def __init__(self,units):
        self.units=units
        self.backend=ComputationBackend()

    def __str__(self):
        banner=self.__class__.__name__
        banner+=" Units:%d"%self.units
        return banner

class SigmoidLayer(Layer):

    def forwardPass(self,inputs):
        return self.backend.sigmoid(inputs)


class InputLayer(Layer):

    def forwardPass(self,inputs):
        return inputs

class SoftmaxLayer(Layer):

    def forwardPass(self,inputs):
        return self.backend.softmax(inputs)

backend.py

import theano
import theano.tensor as T
import numpy

class NetworkBackend:

    def __init__(self,network):

        # initialize shared variables
        self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
        self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
        self.layers=network.layers
        self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]

        # activation for network layers
        inputs=T.dmatrix()
        temp=self.layers[0].forwardPass(inputs)
        for i in range(1,len(self.layers[:-1])):
            temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
        output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
        self.activate=theano.function([inputs],output)

        prediction=T.dmatrix()
        output=T.dmatrix()
        reg_lambda=T.dscalar()
        alpha=T.dscalar()
        momentum=T.dscalar()
        cost=T.nnet.categorical_crossentropy(prediction,output).mean()
        for i,j in zip(self.weights,self.bias):
            cost+=T.sum(i**2)*reg_lambda
            cost+=T.sum(j**2)*reg_lambda
        parameters=self.weights+self.bias
        rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
        updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
        self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)


class ComputationBackend:

    def __init__(self):

        # sigmoid activation
        self.sigmoid=T.nnet.sigmoid

        # softmax activation
        self.softmax=T.nnet.softmax

This is probably caused by the parameter initialization. The following code sample implements a basic XOR learner using a neural network with a single hidden layer.

import numpy
import theano
import theano.tensor as tt


def compile(input_size, hidden_size):
    w_h = theano.shared(numpy.random.standard_normal(size=(input_size, hidden_size)).astype(theano.config.floatX))
    b_h = theano.shared(numpy.zeros((hidden_size,), dtype=theano.config.floatX))
    w_y = theano.shared(numpy.zeros((hidden_size,), dtype=theano.config.floatX))
    b_y = theano.shared(numpy.zeros(1, dtype=theano.config.floatX), broadcastable=(True,))
    x = tt.matrix()
    z = tt.ivector()
    learning_rate = tt.scalar()
    h = tt.tanh(tt.dot(x, w_h) + b_h)
    y = tt.nnet.sigmoid(tt.dot(h, w_y) + b_y)
    cost = tt.nnet.binary_crossentropy(y, z).mean()
    updates = [(p, p - learning_rate * tt.grad(cost, p)) for p in [w_h, b_h, w_y, b_y]]
    return theano.function([x, z, learning_rate], outputs=cost, updates=updates), theano.function([x], outputs=y)


def main():
    numpy.random.seed(5)
    train, test = compile(2, 2)
    for _ in xrange(100000):
        print train([[1, 1], [1, 0], [0, 1], [0, 0]], [0, 1, 1, 0], 0.1)
    print test([[1, 1], [1, 0], [0, 1], [0, 0]])


main()

Note the seed value of the random number generator. With a seed of 5, the learner converges on a good solution and looks like it is heading towards a perfect solution given more time. However, if the seed is changed to 1, the network gets stuck in a local optimum: it becomes able to discriminate in the second dimension but not the first.

A different random initialization scheme might give better results, i.e. one that is less sensitive to the RNG seed.
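For example, a Glorot-style (fan-in/fan-out scaled) uniform initialization is one common choice. A minimal sketch of what that could look like for a shared weight matrix (the glorot_init helper below is hypothetical, not part of the code above):

import numpy
import theano

def glorot_init(n_in, n_out, rng=numpy.random):
    # hypothetical helper: sample uniformly in [-limit, limit] with
    # limit = sqrt(6 / (fan_in + fan_out)), following Glorot & Bengio (2010)
    limit = numpy.sqrt(6.0 / (n_in + n_out))
    values = rng.uniform(low=-limit, high=limit, size=(n_in, n_out))
    return theano.shared(values.astype(theano.config.floatX))

# e.g. in place of the standard-normal initialization of w_h above:
# w_h = glorot_init(input_size, hidden_size)

Scaling the initial weights by the layer's fan-in and fan-out keeps the early activations out of the saturated regions of tanh/sigmoid, which tends to make convergence less dependent on the particular seed.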

Finally figured it out! In the NetworkBackend, when computing the cost I was computing the cross-entropy between the expected output and a prediction matrix passed as an argument to the Theano function, rather than the prediction computed by the activate function. The Theano graph therefore does not contain the forward pass, so theano.tensor.grad only finds the gradient of the regularization term and not of the actual cost function! The correct implementation is:

inputs=T.dmatrix()
temp=self.layers[0].forwardPass(inputs)
for i in range(1,len(self.layers[:-1])):
    temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
self.activate=theano.function([inputs],output)

label=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(output,label).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([inputs,label,reg_lambda,alpha,momentum],cost,updates=updates)

So instead of declaring a new prediction matrix, I now take the inputs and compute the prediction inside the training function, using the same equations as in the activation function. This completes the Theano graph, and theano.tensor.grad() now computes the gradient of the cost function along with the regularization.
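To illustrate the difference in isolation, here is a minimal standalone sketch (hypothetical names, not the code above): when the prediction is fed in as plain data, the cross-entropy term has no symbolic dependence on the shared weights, so T.grad only sees the regularization; when the prediction is built from the weights, the full gradient is available.

import numpy
import theano
import theano.tensor as T

w = theano.shared(numpy.random.random((2, 2)))
x = T.dmatrix()
y = T.dmatrix()

# wrong: prediction passed in as data, so the data term does not depend on w
pred_in = T.dmatrix()
cost_wrong = T.nnet.categorical_crossentropy(pred_in, y).mean() + 0.01 * T.sum(w ** 2)
g_wrong = T.grad(cost_wrong, w)    # gradient of the regularization term only

# right: prediction built symbolically from w, so the graph reaches the weights
pred = T.nnet.softmax(T.dot(x, w.transpose()))
cost_right = T.nnet.categorical_crossentropy(pred, y).mean() + 0.01 * T.sum(w ** 2)
g_right = T.grad(cost_right, w)    # includes the cross-entropy term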