ROI pooling and backpropagation

I have implemented ROI pooling in my graph. The code is given below:

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

# config, CTCUtils and the custom roi_pooling op are project-specific modules,
# imported as in the original code.

def __init__(self, fatness, image_shape, vocab, r_vocab, num_classes, rnn_cells_num):
    CTCUtils.vocab = vocab
    CTCUtils.r_vocab = r_vocab

    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.input_labels = tf.placeholder(dtype=tf.string,
                                       shape=(config.train.input_labels_size,))
    self.input_dat = tf.placeholder(dtype=tf.float32,
                                    shape=(None,
                                           config.train.extracted_feature_height,
                                           config.train.extracted_feature_width,
                                           512))
    self.in_boxes = tf.placeholder(dtype=tf.float32,
                                   shape=(config.train.input_labels_size, 5))
    self.num_classes = num_classes
    self.rnn_cells_num = rnn_cells_num
    self.poolheight = 1
    self.poolwidth = 32
    self.sess = tf.Session(graph=tf.get_default_graph())

    with slim.arg_scope([slim.conv2d, slim.max_pool2d]):
        # Convolutional layers before ROI pooling
        net = slim.repeat(self.input_dat, 4, slim.conv2d, fatness, [3, 3],
                          padding='SAME', scope='conv6',
                          weights_regularizer=slim.l2_regularizer(config.weight_decay),
                          weights_initializer=tf.contrib.layers.xavier_initializer(),
                          biases_initializer=tf.zeros_initializer(),
                          activation_fn=tf.nn.relu)

        self.in_boxes = tf.dtypes.cast(self.in_boxes, tf.int32)

        # ROI pooling layer; output shape is (1, 20, 256, 1, 32)
        rec_fmap_clone = roi_pooling(net, self.in_boxes,
                                     pool_height=self.poolheight,
                                     pool_width=self.poolwidth)

        if rec_fmap_clone.get_shape() is not None:
            self.rec_fmap = tf.identity(rec_fmap_clone)
            shape = self.rec_fmap.get_shape().as_list()
            # drop the leading singleton dimension and put channels last
            self.rec_fmap = tf.reshape(self.rec_fmap, shape[1:])
            self.rec_fmap = tf.transpose(self.rec_fmap, perm=[0, 2, 3, 1])
        else:
            # dummy tensor used when the pooled shape is unknown
            self.rec_fmap = tf.ones([config.train.input_labels_size, 1, 32, 256],
                                    tf.float32)

    with slim.arg_scope([slim.conv2d],
                        normalizer_fn=slim.batch_norm,
                        weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
                        weights_regularizer=slim.l2_regularizer(0.0005)):
        classes = slim.conv2d(self.rec_fmap, self.num_classes, [1, 13])
        pattern = slim.fully_connected(slim.flatten(classes), self.rnn_cells_num)  # patterns number

        width = int(self.rec_fmap.get_shape()[2])
        pattern = tf.reshape(pattern, (-1, 1, 1, self.rnn_cells_num))
        pattern = tf.tile(pattern, [1, 1, width, 1])

        inf = tf.concat(axis=3, values=[classes, pattern])  # skip connection over RNN
        inf = slim.conv2d(inf, self.num_classes, [1, 1],
                          normalizer_fn=None, activation_fn=None)  # fully convolutional linear activation
        inf = tf.squeeze(inf, [1])

        prob = tf.transpose(inf, (1, 0, 2))  # prepare for CTC
        data_length = tf.fill([tf.shape(prob)[1]], tf.shape(prob)[0])  # input seq length, batch size

        ctc = tf.py_func(CTCUtils.compute_ctc_from_labels, [self.input_labels],
                         [tf.int64, tf.int64, tf.int64])
        ctc_labels = tf.to_int32(tf.SparseTensor(ctc[0], ctc[1], ctc[2]))

        predictions = tf.to_int32(
            tf.nn.ctc_beam_search_decoder(prob, data_length,
                                          merge_repeated=False, beam_width=10)[0][0])
        tf.sparse_tensor_to_dense(predictions, default_value=-1, name='d_predictions')
        tf.reduce_mean(tf.edit_distance(predictions, ctc_labels, normalize=False),
                       name='error_rate')

        self.loss = tf.reduce_mean(
            tf.compat.v1.nn.ctc_loss(inputs=prob, labels=ctc_labels,
                                     sequence_length=data_length,
                                     ctc_merge_repeated=True),
            name='loss')
        self.learning_rate = tf.train.piecewise_constant(
            self.global_step, [150000, 200000],
            [config.train.learning_rate,
             0.1 * config.train.learning_rate,
             0.01 * config.train.learning_rate])
        self.opt_loss = tf.contrib.layers.optimize_loss(
            self.loss, self.global_step, self.learning_rate,
            config.train.opt_type, config.train.grad_noise_scale,
            name='train_step')

    # Create the saver after the graph is built so that all variables are captured
    self.saver = tf.train.Saver()
    self.sess.run(tf.global_variables_initializer())

The graph has several convolutional layers before ROI pooling, and a CTC loss is used for optimization.

My concern is whether the convolutional layers before ROI pooling are actually optimized during backpropagation.

According to the discussion here, the ROI pooling layer itself is differentiable.
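One quick way to verify that for the custom roi_pooling op is to ask TensorFlow for the gradient of the pooled output with respect to its input. This is only a sketch: it assumes the tensors net and rec_fmap_clone from __init__ above are exposed (for example stored on self), which the posted code does not do.

import tensorflow as tf

grads = tf.gradients(rec_fmap_clone, net)
# grads == [None] means no gradient is registered for roi_pooling,
# so nothing before it (the conv6 block) can receive gradients.
print(grads)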

However, when the graph is plotted in TensorBoard, it appears disconnected after the ROI pooling layer.

How can I check and make sure that the conv layers before ROI pooling are updated during optimization?
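The kind of check I have in mind looks like the sketch below (it assumes model is an instance of the class above and that batch_images, batch_boxes and batch_labels hold one batch of real training data):

import numpy as np
import tensorflow as tf

# 1) Static check: a None gradient means the loss is disconnected from conv6.
conv6_vars = [v for v in tf.trainable_variables() if 'conv6' in v.name]
grads = tf.gradients(model.loss, conv6_vars)
print([g is None for g in grads])

# 2) Dynamic check: weights that never change across training steps are not being updated.
before = model.sess.run(conv6_vars[0])
model.sess.run(model.opt_loss, feed_dict={model.input_dat: batch_images,
                                          model.in_boxes: batch_boxes,
                                          model.input_labels: batch_labels})
after = model.sess.run(conv6_vars[0])
print('conv6 weights changed:', not np.allclose(before, after))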

I solved the problem by placing the conv layers after RoiPooling.

The first graph is used only for feature extraction with RoiPooling, with the RoiPooling output size set to a larger value. These outputs are then used as the input to a second graph, where the conv layers are placed. This way the weights are actually optimized.
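A minimal sketch of that two-graph idea is shown below (the names and the stand-in loss are illustrative only, not my actual code):

import numpy as np
import tensorflow as tf
slim = tf.contrib.slim

# Graph 1 (feature extraction): run the backbone + roi_pooling once and fetch the
# pooled feature maps as numpy arrays, e.g.
#   pooled = sess1.run(rec_fmap, feed_dict={input_dat: images, in_boxes: boxes})

# Graph 2 (training): the trainable conv layers live entirely in this graph,
# so their gradients are computed and applied without going through roi_pooling.
graph2 = tf.Graph()
with graph2.as_default():
    pooled_input = tf.placeholder(tf.float32, shape=(None, 1, 32, 256), name='pooled_features')
    net = slim.conv2d(pooled_input, 256, [1, 3], scope='post_roi_conv1')
    net = slim.conv2d(net, 256, [1, 3], scope='post_roi_conv2')
    # ... CTC head and ctc_loss as in the original graph; a stand-in loss is used here ...
    loss = tf.reduce_mean(net)
    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)

with tf.Session(graph=graph2) as sess2:
    sess2.run(tf.global_variables_initializer())
    pooled = np.random.rand(4, 1, 32, 256).astype(np.float32)  # would come from graph 1
    sess2.run(train_op, feed_dict={pooled_input: pooled})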

The modified graph looks like this: