How to implement inverting gradient in Tensorflow?
I am trying to implement DDPG in Tensorflow. The action space is continuous, with an upper bound P_max and a lower bound P_min. Based on this paper, inverting gradients is a good approach for a bounded continuous action space. However, I got stuck when updating the actor network. I present my code below.
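For reference, the inverting-gradients rule from the paper rescales the critic gradient depending on which bound the action is approaching; written in the question's notation with bounds P_min and P_max:
\nabla_a =
\begin{cases}
\dfrac{\partial Q}{\partial a}\cdot\dfrac{P_{\max}-a}{P_{\max}-P_{\min}} & \text{if } \dfrac{\partial Q}{\partial a} \text{ suggests increasing } a,\\[4pt]
\dfrac{\partial Q}{\partial a}\cdot\dfrac{a-P_{\min}}{P_{\max}-P_{\min}} & \text{otherwise.}
\end{cases}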
First, I build placeholders for the state, the next state, and the reward, where S_DIM is the state dimension:
self.S = tf.placeholder(tf.float32, [None, S_DIM], name='state')
self.S_ = tf.placeholder(tf.float32, [None, S_DIM], name='next_state')
self.R = tf.placeholder(tf.float32, [None, 1], name='reward')
Next, build the neural networks for the actor and the critic, where A_DIM is the action dimension:
def build_a(self, s, scope, trainable):
    with tf.variable_scope('actor'):
        with tf.variable_scope(scope):
            l1 = tf.layers.dense(s, 100, tf.nn.relu, trainable=trainable)
            a = tf.layers.dense(l1, A_DIM, trainable=trainable)
            return a
def build_c(self, s, a, scope, trainable):
    with tf.variable_scope('critic'):
        with tf.variable_scope(scope):
            concat_layer = tf.concat([s, a], axis=1)
            l1 = tf.layers.dense(concat_layer, 100, tf.nn.relu, trainable=trainable)
            q = tf.layers.dense(l1, 1, trainable=trainable)
            return q
self.a = self.build_a(self.S, scope='evaluation', trainable=True)
self.a_ = self.build_a(self.S_, scope='target', trainable=False)
self.q = self.build_c(self.S, self.a, scope='evaluation', trainable=True)
self.q_ = self.build_c(self.S_, self.a_, scope='target', trainable=False)
Collect the network parameters for later use:
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor/evaluation')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic/evaluation')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic/target')
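(The target-network parameters at_params and ct_params collected above are typically used for DDPG's soft target updates. A minimal sketch, assuming a soft-update rate TAU, e.g. 0.01, which is not part of the original code:)
# Hypothetical soft-update ops; TAU is an assumed hyperparameter, not from the original code
self.soft_update = [tf.assign(t, (1 - TAU) * t + TAU * e)
                    for t, e in zip(self.at_params + self.ct_params,
                                    self.ae_params + self.ce_params)]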
Then, update the critic with the temporal-difference Bellman equation by minimizing the difference between q_target and q, where GAMMA is the discount factor (e.g. 0.99):
q_target = self.R + GAMMA * self.q_
self.c_loss = tf.losses.mean_squared_error(q_target, self.q)
self.ctrain = tf.train.AdamOptimizer(0.001).minimize(self.c_loss, var_list=self.ce_params)
Finally, update the actor (this is where I am stuck):
dq_da = tf.gradients(self.q, self.a)[0]  # partial Q, partial a
upper_method = lambda: dq_da * (upper - self.a) / (upper - lower)
lower_method = lambda: dq_da * (self.a - lower) / (upper - lower)
# if gradient suggests increasing action, apply upper method
# else, lower method
adjust_dq_da = tf.cond(tf.greater(dq_da, 0), upper_method, lower_method)
grad = tf.gradients(self.a, self.ae_params, grad_ys=adjust_dq_da)
# apply gradient to the parameters in actor network
self.atrain = tf.train.AdamOptimizer(-0.0001).apply_gradients(zip(grad, self.ae_params))
I get the following error:
ValueError: Shape must be rank 0 but is rank 2 for 'actor_gradient/cond/Switch' (op: 'Switch') with input shapes: [?,1], [?,1].
Is there any way to fix this?
In the end, I solved the problem by creating a placeholder for the inverting gradients:
inverting_gradients_placeholder = tf.placeholder(tf.float32, shape=[None, 1], name='inverting_gradients')
First, I compute the gradient of the Q value w.r.t. the action a:
dq_da = tf.gradients(q, a)[0]
Second, I take the gradient of the action a w.r.t. the actor parameters a_params, and define the actor update operation at the same time:
grad = tf.gradients(a, a_params, grad_ys=inverting_gradients_placeholder)
train_actor = tf.train.AdamOptimizer(learning_rate_actor).apply_gradients(zip(grad, a_params))
Finally, compute the inverting gradients and update the actor:
# get dq/da array, action array
dq_das, actions = sess.run([dq_da, a], feed_dict={state_placeholder: batch_state})
# inverting gradients, if dq_da >= 0, apply upper method, else lower method
inverting_gradients = []
for dq_da, action in zip(dq_das, actions):
    if dq_da >= 0.0:
        inverting_gradients.append(dq_da * (upper - action) / (upper - lower))
    else:
        inverting_gradients.append(dq_da * (action - lower) / (upper - lower))
inverting_gradients = np.array(inverting_gradients).reshape(-1, 1)
# update actor
sess.run(train_actor, feed_dict={state_placeholder: batch_state, inverting_gradients_placeholder: inverting_gradients})
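As a side note, the original error happens because tf.cond expects a scalar boolean predicate, while dq_da has shape [batch_size, 1]. If you would rather keep the branching inside the graph instead of feeding a placeholder, an element-wise tf.where should also work. A sketch using the same names as above (upper and lower are the action bounds):
# Element-wise alternative inside the graph (a sketch, not the approach used above)
dq_da = tf.gradients(q, a)[0]
scale = tf.where(dq_da >= 0,
                 (upper - a) / (upper - lower),
                 (a - lower) / (upper - lower))
grad = tf.gradients(a, a_params, grad_ys=dq_da * scale)
train_actor = tf.train.AdamOptimizer(learning_rate_actor).apply_gradients(zip(grad, a_params))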