For deep learning, with ReLU activation the output becomes NaN during training, while it is normal with tanh
The neural network I am training is the critic network of a deep reinforcement learning setup. The problem is that when the activation of one of the layers is set to relu or elu, the output becomes nan after some training steps, whereas with tanh the output stays normal. The code is as follows (based on TensorFlow):
with tf.variable_scope('critic'):
    self.batch_size = tf.shape(self.tfs)[0]
    l_out_x = denseWN(x=self.tfs, name='l3', num_units=self.cell_size, nonlinearity=tf.nn.tanh, trainable=True, shape=[det*step*2, self.cell_size])
    l_out_x1 = denseWN(x=l_out_x, name='l3_1', num_units=32, trainable=True, nonlinearity=tf.nn.tanh, shape=[self.cell_size, 32])
    l_out_x2 = denseWN(x=l_out_x1, name='l3_2', num_units=32, trainable=True, nonlinearity=tf.nn.tanh, shape=[32, 32])
    l_out_x3 = denseWN(x=l_out_x2, name='l3_3', num_units=32, trainable=True, shape=[32, 32])
    self.v = denseWN(x=l_out_x3, name='l4', num_units=1, trainable=True, shape=[32, 1])
The base layer is built as follows:
def get_var_maybe_avg(var_name, ema, trainable, shape):
    # Create one of the weight-normalization variables: V (direction),
    # g (scale), b (bias); optionally return its exponential moving average.
    if var_name == 'V':
        initializer = tf.contrib.layers.xavier_initializer()
        v = tf.get_variable(name=var_name, initializer=initializer, trainable=trainable, shape=shape)
    if var_name == 'g':
        initializer = tf.constant_initializer(1.0)
        v = tf.get_variable(name=var_name, initializer=initializer, trainable=trainable, shape=[shape[-1]])
    if var_name == 'b':
        initializer = tf.constant_initializer(0.1)
        v = tf.get_variable(name=var_name, initializer=initializer, trainable=trainable, shape=[shape[-1]])
    if ema is not None:
        v = ema.average(v)
    return v
def get_vars_maybe_avg(var_names, ema, trainable, shape):
    vars = []
    for vn in var_names:
        vars.append(get_var_maybe_avg(vn, ema, trainable=trainable, shape=shape))
    return vars
def denseWN(x, name, num_units, trainable, shape, nonlinearity=None, ema=None, **kwargs):
    # Weight-normalized dense layer: y = g * (x @ V) / ||V|| + b
    with tf.variable_scope(name):
        V, g, b = get_vars_maybe_avg(['V', 'g', 'b'], ema, trainable=trainable, shape=shape)
        x = tf.matmul(x, V)
        scaler = g / tf.sqrt(tf.reduce_sum(tf.square(V), [0]))
        x = tf.reshape(scaler, [1, num_units]) * x + tf.reshape(b, [1, num_units])
        if nonlinearity is not None:
            x = nonlinearity(x)
        return x
Here is the code that trains the network:
self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
self.advantage = self.tfdc_r - self.v
l1_regularizer = tf.contrib.layers.l1_regularizer(scale=0.005, scope=None)
self.weights = tf.trainable_variables()
regularization_penalty_critic = tf.contrib.layers.apply_regularization(l1_regularizer, self.weights)
self.closs = tf.reduce_mean(tf.square(self.advantage))
self.optimizer = tf.train.RMSPropOptimizer(0.0001, 0.99, 0.0, 1e-6)
self.grads_and_vars = self.optimizer.compute_gradients(self.closs)
self.grads_and_vars = [[tf.clip_by_norm(grad, 5), var] for grad, var in self.grads_and_vars if grad is not None]
self.ctrain_op = self.optimizer.apply_gradients(self.grads_and_vars, global_step=tf.contrib.framework.get_global_step())
It looks like you are facing the problem of exploding gradients with the ReLU activation function (NaN means very large activations). There are several techniques to deal with this, e.g. batch normalization (which changes the network architecture) or careful variable initialization (that's what I would try first).
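If you first want to confirm where the NaNs appear, TF 1.x offers tf.check_numerics and tf.add_check_numerics_ops; a minimal sketch (the tensor names reuse those from your question):

# Fail fast with an InvalidArgumentError as soon as a NaN/Inf shows up,
# instead of letting it propagate silently through the graph:
l_out_x1 = tf.check_numerics(l_out_x1, 'NaN/Inf in layer l3_1')
self.v = tf.check_numerics(self.v, 'NaN/Inf in critic output v')

# Or check every floating-point tensor in the graph at once:
check_op = tf.add_check_numerics_ops()
# sess.run([self.ctrain_op, check_op], feed_dict=...)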
You are using Xavier initialization for the V variables in the different layers, which indeed works well with logistic sigmoid activations (see the paper by Xavier Glorot and Yoshua Bengio), or, in other words, with tanh.
The preferred initialization strategy for the ReLU activation function (and its variants, including ELU) is He initialization. In TensorFlow it is implemented as tf.variance_scaling_initializer:
initializer = tf.variance_scaling_initializer()
v = tf.get_variable(name=var_name, initializer=initializer, ...)
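One way to wire this into your get_var_maybe_avg is to change only the 'V' branch; a sketch (note that tf.variance_scaling_initializer defaults to scale=2.0 with mode='fan_in', i.e. He initialization):

def get_var_maybe_avg(var_name, ema, trainable, shape):
    if var_name == 'V':
        # He initialization instead of Xavier; better suited to ReLU/ELU layers.
        initializer = tf.variance_scaling_initializer()
        v = tf.get_variable(name=var_name, initializer=initializer, trainable=trainable, shape=shape)
    # the 'g' and 'b' branches and the EMA handling stay as before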
You may also want to try smaller values for the b and g variables, but it is hard to say exact values just by looking at your model. If nothing helps, consider adding batch-norm layers to your model to control activation distributions.
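For concreteness, a sketch of both ideas; the constants for g and b are illustrative guesses rather than tuned values, and is_training is an assumed placeholder that does not exist in your original code:

# Smaller starting values for scale and bias (guesses, to be tuned):
initializer = tf.constant_initializer(0.1)   # for 'g', instead of 1.0
initializer = tf.constant_initializer(0.0)   # for 'b', instead of 0.1

# A batch-normalized dense layer as a fallback (TF 1.x):
def denseBN(x, name, num_units, shape, is_training, nonlinearity=None, trainable=True):
    with tf.variable_scope(name):
        V = tf.get_variable('V', shape=shape, trainable=trainable,
                            initializer=tf.variance_scaling_initializer())
        x = tf.matmul(x, V)
        # Keep the pre-activations well scaled before ReLU/ELU.
        x = tf.layers.batch_normalization(x, training=is_training)
        if nonlinearity is not None:
            x = nonlinearity(x)
        return x

# tf.layers.batch_normalization registers update ops for its moving statistics;
# run tf.get_collection(tf.GraphKeys.UPDATE_OPS) together with the train op.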