具有相同数据、相同初始状态、相同递归神经网络的不同损失值
Different loss values with same data, same initial state, same recurrent neural network
我正在编写一个递归神经网络(具体来说,是一个 ConvLSTM)。最近,我注意到一个有趣的矛盾,我不太明白。我使用 numpy(从技术上讲是 gpu 的 cupy)和一些 Chainer 行(专门用于他们的 F.convolution_2D 函数)从头开始编写这个神经网络。
当 运行 同一网络两次时,对于前 4 个左右的训练示例,损失完全相同。然而,在第 5 个训练示例附近,损失的值开始波动。
我确保每次我 运行 这个网络时,它们都从相同的初始状态文本文件中读取(因此具有相同的初始权重和偏差)。我还确保他们输入的数据完全相同。
是否与 Numpy 存在一些不一致是此问题的根源?我认为第 4 个训练示例唯一不同的地方是梯度裁剪的首次使用。 numpy 的 linalg 函数有问题吗?是否存在我不熟悉的舍入误差?我扫描了我的代码,没有使用随机数的实例。
我在下面添加了我的反向传播函数:
def bptt(x2, y2, iteration):
x = cp.asarray(x2)
y = cp.asarray(y2)
global connected_weights
global main_kernel
global bias_i
global bias_f
global bias_c
global bias_o
global bias_y
global learning_rate
# Perform forward prop
prediction, pre_sigmoid_prediction, hidden_prediction, i, f, a, c, o, h = forward_prop(x)
loss = calculate_loss(prediction, y)
print("LOSS BEFORE: ")
print(loss)
# Calculate loss with respect to final layer
dLdy_2 = loss_derivative(prediction, y)
# Calculate loss with respect to pre sigmoid layer
dLdy_1 = cp.multiply(sigmoid_derivative(pre_sigmoid_prediction), dLdy_2)
# Calculate loss with respect to last layer of lstm
dLdh = cp.zeros([T + 1, channels_hidden, M, N])
dLdh[T - 1] = cp.reshape(cp.matmul(cp.transpose(connected_weights), dLdy_1.reshape(1, M * N)), (channels_hidden, M, N)) # reshape dLdh to the appropriate size
dLdw_0 = cp.matmul(dLdy_1.reshape(1, M*N), hidden_prediction.transpose(1,0))
# Calculate loss with respect to bias y
dLdb_y = dLdy_1
#--------------------fully connected------------------
bias_y = bias_y - learning_rate*dLdb_y
connected_weights = connected_weights - learning_rate*dLdw_0
# Initialize corresponding matrices
dLdo = cp.zeros([T, channels_hidden, M, N])
dLdc = cp.zeros([T + 1, channels_hidden, M, N])
dLda = cp.zeros([T, channels_hidden, M, N])
dLdf = cp.zeros([T, channels_hidden, M, N])
dLdi = cp.zeros([T, channels_hidden, M, N])
dLdI = cp.zeros([T, channels_hidden+ channels_img, M, N])
dLdW = cp.zeros([4*channels_hidden, channels_img + channels_hidden, kernel_dimension, kernel_dimension])
# Initialize other stuff
dLdo_hat = cp.zeros([T, channels_hidden, M, N])
dLda_hat = cp.zeros([T, channels_hidden, M, N])
dLdf_hat = cp.zeros([T, channels_hidden, M, N])
dLdi_hat = cp.zeros([T, channels_hidden, M, N])
# initialize biases
dLdb_c = cp.empty([channels_hidden, M, N])
dLdb_i = cp.empty([channels_hidden, M, N])
dLdb_f = cp.empty([channels_hidden, M, N])
dLdb_o = cp.empty([channels_hidden, M, N])
for t in cp.arange(T - 1, -1, -1):
dLdo[t] = cp.multiply(dLdh[t], tanh(c[t]))
dLdc[t] += cp.multiply(cp.multiply(dLdh[t], o[t]), (cp.ones((channels_hidden, M, N)) - cp.multiply(tanh(c[t]), tanh(c[t]))))
dLdi[t] = cp.multiply(dLdc[t], a[t])
dLda[t] = cp.multiply(dLdc[t], i[t])
dLdf[t] = cp.multiply(dLdc[t], c[t - 1])
dLdc[t - 1] = cp.multiply(dLdc[t], f[t])
dLda_hat[t] = cp.multiply(dLda[t], (cp.ones((channels_hidden, M, N)) - cp.multiply(a[t], a[t])))
dLdi_hat[t] = cp.multiply(cp.multiply(dLdi[t], i[t]), cp.ones((channels_hidden, M, N)) - i[t])
dLdf_hat[t] = cp.multiply(cp.multiply(dLdf[t], f[t]), cp.ones((channels_hidden, M, N)) - f[t])
dLdo_hat[t] = cp.multiply(cp.multiply(dLdo[t], o[t]), cp.ones((channels_hidden, M, N)) - o[t])
dLdb_c += dLda_hat[t]
dLdb_i += dLdi_hat[t]
dLdb_f += dLdf_hat[t]
dLdb_o += dLdo_hat[t]
# CONCATENATE Z IN THE RIGHT ORDER SAME ORDER AS THE WEIGHTS
dLdz_hat = cp.concatenate((dLdi_hat[t], dLdf_hat[t], dLda_hat[t], dLdo_hat[t]), axis = 0)
#determine convolution derivatives
#here we will use the fact that in z = w * I, dLdW = dLdz * I
temporary = cp.concatenate((x[t], h[t - 1]), axis=0).reshape(channels_hidden + channels_img, 1, M, N)
dLdI[t] = cp.asarray(F.convolution_2d(dLdz_hat.reshape(1, 4*channels_hidden, M, N), main_kernel.transpose(1, 0, 2, 3), b=None, pad=1)[0].data) # reshape into flipped kernel dimensions
dLdW += cp.asarray((F.convolution_2d(temporary, dLdz_hat.reshape(4*channels_hidden, 1, M, N), b=None, pad=1).data).transpose(1,0,2,3)) #reshape into kernel dimensions
#gradient clipping
if cp.amax(dLdW) > 1 or cp.amin(dLdW) < -1:
dLdW = dLdW/cp.linalg.norm(dLdW)
if cp.amax(dLdb_c) > 1 or cp.amin(dLdb_c) < -1:
dLdb_c = dLdb_c/cp.linalg.norm(dLdb_c)
if cp.amax(dLdb_i) > 1 or cp.amin(dLdb_i) < -1:
dLdb_i = dLdb_i/cp.linalg.norm(dLdb_i)
if cp.amax(dLdb_f) > 1 or cp.amin(dLdb_f) < -1:
dLdb_f = dLdb_f/cp.linalg.norm(dLdb_f)
if cp.amax(dLdb_o) > 1 or cp.amin(dLdb_o) < -1:
dLdb_o = dLdb_o/cp.linalg.norm(dLdb_o)
if cp.amax(dLdw_0) > 1 or cp.amin(dLdw_0) < -1:
dLdw_0 = dLdw_0/cp.linalg.norm(dLdw_0)
if cp.amax(dLdb_y) > 1 or cp.amin(dLdb_y) < -1:
dLdb_y = dLdb_y/cp.linalg.norm(dLdb_y)
print("dLdW on step: " + str(t) + " is this: " + str(dLdW[0][0][0][0]))
#print("dLdw_0")
#print("dLdW")
#print(dLdW)
#print(str(cp.amax(dLdw_0)) + " : " + str(cp.amin(dLdw_0)))
#print("dLdW")
#print(str(cp.amax(dLdW)) + " : " + str(cp.amin(dLdW)))
#print("dLdb_c")
#print(str(cp.amax(dLdb_c)) + " : " + str(cp.amin(dLdb_c)))
dLdh[t-1] = dLdI[t][channels_img: channels_img+channels_hidden]
#.reshape(4*channels_hidden, channels_hidden+channels_img, kernel_dimension, kernel_dimension)
#update weights with convolution derivatives
#----------------------------adam optimizer code-----------------------------------
#---------------------update main kernel---------
main_kernel = main_kernel - learning_rate*dLdW
#--------------------update bias c-----------------------
bias_c = bias_c - learning_rate*dLdb_c
#--------------------update bias i-----------------------
bias_i = bias_i - learning_rate*dLdb_i
#--------------------update bias f-----------------------
bias_f = bias_f - learning_rate*dLdb_f
#--------------------update bias c-----------------------
bias_o = bias_o - learning_rate*dLdb_o
prediction2, pre_sigmoid_prediction2, hidden_prediction2, i2, f2, a2, c2, o2, h2 = forward_prop(x)
print("dLdW is: " + str(dLdW[0][0][0][0]))
loss2 = calculate_loss(prediction2, y)
print("LOSS AFTER: ")
print(loss2)
print("backpropagation complete")
哇,这花了一些时间。
如果您查看反向传播代码,请仔细查看以下行:
dLdb_c = cp.empty([channels_hidden, M, N])
dLdb_i = cp.empty([channels_hidden, M, N])
dLdb_f = cp.empty([channels_hidden, M, N])
dLdb_o = cp.empty([channels_hidden, M, N])
但是,请注意代码如何继续在这些空数组上使用 += 运算符。只需将数组更改为 cp.zeros,代码就会给出一致的损失。
我正在编写一个递归神经网络(具体来说,是一个 ConvLSTM)。最近,我注意到一个有趣的矛盾,我不太明白。我使用 numpy(从技术上讲是 gpu 的 cupy)和一些 Chainer 行(专门用于他们的 F.convolution_2D 函数)从头开始编写这个神经网络。
当 运行 同一网络两次时,对于前 4 个左右的训练示例,损失完全相同。然而,在第 5 个训练示例附近,损失的值开始波动。
我确保每次我 运行 这个网络时,它们都从相同的初始状态文本文件中读取(因此具有相同的初始权重和偏差)。我还确保他们输入的数据完全相同。
是否与 Numpy 存在一些不一致是此问题的根源?我认为第 4 个训练示例唯一不同的地方是梯度裁剪的首次使用。 numpy 的 linalg 函数有问题吗?是否存在我不熟悉的舍入误差?我扫描了我的代码,没有使用随机数的实例。
我在下面添加了我的反向传播函数:
def bptt(x2, y2, iteration):
x = cp.asarray(x2)
y = cp.asarray(y2)
global connected_weights
global main_kernel
global bias_i
global bias_f
global bias_c
global bias_o
global bias_y
global learning_rate
# Perform forward prop
prediction, pre_sigmoid_prediction, hidden_prediction, i, f, a, c, o, h = forward_prop(x)
loss = calculate_loss(prediction, y)
print("LOSS BEFORE: ")
print(loss)
# Calculate loss with respect to final layer
dLdy_2 = loss_derivative(prediction, y)
# Calculate loss with respect to pre sigmoid layer
dLdy_1 = cp.multiply(sigmoid_derivative(pre_sigmoid_prediction), dLdy_2)
# Calculate loss with respect to last layer of lstm
dLdh = cp.zeros([T + 1, channels_hidden, M, N])
dLdh[T - 1] = cp.reshape(cp.matmul(cp.transpose(connected_weights), dLdy_1.reshape(1, M * N)), (channels_hidden, M, N)) # reshape dLdh to the appropriate size
dLdw_0 = cp.matmul(dLdy_1.reshape(1, M*N), hidden_prediction.transpose(1,0))
# Calculate loss with respect to bias y
dLdb_y = dLdy_1
#--------------------fully connected------------------
bias_y = bias_y - learning_rate*dLdb_y
connected_weights = connected_weights - learning_rate*dLdw_0
# Initialize corresponding matrices
dLdo = cp.zeros([T, channels_hidden, M, N])
dLdc = cp.zeros([T + 1, channels_hidden, M, N])
dLda = cp.zeros([T, channels_hidden, M, N])
dLdf = cp.zeros([T, channels_hidden, M, N])
dLdi = cp.zeros([T, channels_hidden, M, N])
dLdI = cp.zeros([T, channels_hidden+ channels_img, M, N])
dLdW = cp.zeros([4*channels_hidden, channels_img + channels_hidden, kernel_dimension, kernel_dimension])
# Initialize other stuff
dLdo_hat = cp.zeros([T, channels_hidden, M, N])
dLda_hat = cp.zeros([T, channels_hidden, M, N])
dLdf_hat = cp.zeros([T, channels_hidden, M, N])
dLdi_hat = cp.zeros([T, channels_hidden, M, N])
# initialize biases
dLdb_c = cp.empty([channels_hidden, M, N])
dLdb_i = cp.empty([channels_hidden, M, N])
dLdb_f = cp.empty([channels_hidden, M, N])
dLdb_o = cp.empty([channels_hidden, M, N])
for t in cp.arange(T - 1, -1, -1):
dLdo[t] = cp.multiply(dLdh[t], tanh(c[t]))
dLdc[t] += cp.multiply(cp.multiply(dLdh[t], o[t]), (cp.ones((channels_hidden, M, N)) - cp.multiply(tanh(c[t]), tanh(c[t]))))
dLdi[t] = cp.multiply(dLdc[t], a[t])
dLda[t] = cp.multiply(dLdc[t], i[t])
dLdf[t] = cp.multiply(dLdc[t], c[t - 1])
dLdc[t - 1] = cp.multiply(dLdc[t], f[t])
dLda_hat[t] = cp.multiply(dLda[t], (cp.ones((channels_hidden, M, N)) - cp.multiply(a[t], a[t])))
dLdi_hat[t] = cp.multiply(cp.multiply(dLdi[t], i[t]), cp.ones((channels_hidden, M, N)) - i[t])
dLdf_hat[t] = cp.multiply(cp.multiply(dLdf[t], f[t]), cp.ones((channels_hidden, M, N)) - f[t])
dLdo_hat[t] = cp.multiply(cp.multiply(dLdo[t], o[t]), cp.ones((channels_hidden, M, N)) - o[t])
dLdb_c += dLda_hat[t]
dLdb_i += dLdi_hat[t]
dLdb_f += dLdf_hat[t]
dLdb_o += dLdo_hat[t]
# CONCATENATE Z IN THE RIGHT ORDER SAME ORDER AS THE WEIGHTS
dLdz_hat = cp.concatenate((dLdi_hat[t], dLdf_hat[t], dLda_hat[t], dLdo_hat[t]), axis = 0)
#determine convolution derivatives
#here we will use the fact that in z = w * I, dLdW = dLdz * I
temporary = cp.concatenate((x[t], h[t - 1]), axis=0).reshape(channels_hidden + channels_img, 1, M, N)
dLdI[t] = cp.asarray(F.convolution_2d(dLdz_hat.reshape(1, 4*channels_hidden, M, N), main_kernel.transpose(1, 0, 2, 3), b=None, pad=1)[0].data) # reshape into flipped kernel dimensions
dLdW += cp.asarray((F.convolution_2d(temporary, dLdz_hat.reshape(4*channels_hidden, 1, M, N), b=None, pad=1).data).transpose(1,0,2,3)) #reshape into kernel dimensions
#gradient clipping
if cp.amax(dLdW) > 1 or cp.amin(dLdW) < -1:
dLdW = dLdW/cp.linalg.norm(dLdW)
if cp.amax(dLdb_c) > 1 or cp.amin(dLdb_c) < -1:
dLdb_c = dLdb_c/cp.linalg.norm(dLdb_c)
if cp.amax(dLdb_i) > 1 or cp.amin(dLdb_i) < -1:
dLdb_i = dLdb_i/cp.linalg.norm(dLdb_i)
if cp.amax(dLdb_f) > 1 or cp.amin(dLdb_f) < -1:
dLdb_f = dLdb_f/cp.linalg.norm(dLdb_f)
if cp.amax(dLdb_o) > 1 or cp.amin(dLdb_o) < -1:
dLdb_o = dLdb_o/cp.linalg.norm(dLdb_o)
if cp.amax(dLdw_0) > 1 or cp.amin(dLdw_0) < -1:
dLdw_0 = dLdw_0/cp.linalg.norm(dLdw_0)
if cp.amax(dLdb_y) > 1 or cp.amin(dLdb_y) < -1:
dLdb_y = dLdb_y/cp.linalg.norm(dLdb_y)
print("dLdW on step: " + str(t) + " is this: " + str(dLdW[0][0][0][0]))
#print("dLdw_0")
#print("dLdW")
#print(dLdW)
#print(str(cp.amax(dLdw_0)) + " : " + str(cp.amin(dLdw_0)))
#print("dLdW")
#print(str(cp.amax(dLdW)) + " : " + str(cp.amin(dLdW)))
#print("dLdb_c")
#print(str(cp.amax(dLdb_c)) + " : " + str(cp.amin(dLdb_c)))
dLdh[t-1] = dLdI[t][channels_img: channels_img+channels_hidden]
#.reshape(4*channels_hidden, channels_hidden+channels_img, kernel_dimension, kernel_dimension)
#update weights with convolution derivatives
#----------------------------adam optimizer code-----------------------------------
#---------------------update main kernel---------
main_kernel = main_kernel - learning_rate*dLdW
#--------------------update bias c-----------------------
bias_c = bias_c - learning_rate*dLdb_c
#--------------------update bias i-----------------------
bias_i = bias_i - learning_rate*dLdb_i
#--------------------update bias f-----------------------
bias_f = bias_f - learning_rate*dLdb_f
#--------------------update bias c-----------------------
bias_o = bias_o - learning_rate*dLdb_o
prediction2, pre_sigmoid_prediction2, hidden_prediction2, i2, f2, a2, c2, o2, h2 = forward_prop(x)
print("dLdW is: " + str(dLdW[0][0][0][0]))
loss2 = calculate_loss(prediction2, y)
print("LOSS AFTER: ")
print(loss2)
print("backpropagation complete")
哇,这花了一些时间。
如果您查看反向传播代码,请仔细查看以下行:
dLdb_c = cp.empty([channels_hidden, M, N])
dLdb_i = cp.empty([channels_hidden, M, N])
dLdb_f = cp.empty([channels_hidden, M, N])
dLdb_o = cp.empty([channels_hidden, M, N])
但是,请注意代码如何继续在这些空数组上使用 += 运算符。只需将数组更改为 cp.zeros,代码就会给出一致的损失。