tf_agents doesn't properly learn a simple environment
I successfully followed this official TensorFlow tutorial for training an agent to solve the 'CartPole-v0' gym environment. The only place I diverged from the tutorial is that I did not use Reverb, because it isn't supported on Windows. I tried to adapt the example to train an agent on my own (extremely simple) environment, but it fails to converge to a solution after 10,000 iterations, which I feel should be more than enough.
I've tried tweaking the number of training iterations, the learning rate, the batch size, the discount, and everything else I could think of. None of it had any effect on the result.
I expected the agent to converge to a policy that always gets the +1 reward (ideally within just a few hundred iterations, since this environment is so simple), rather than one that occasionally drops to -1. Instead, this is a plot of the actual result:
(The text is small, so to spell it out: orange is the episode length in steps, blue is the average reward. The x-axis is training iterations, from 0 to 10,000.)
Code
Everything here is run top to bottom, but I've split it into separate code blocks to make it easier to read/debug.
Imports
import numpy as np
import tf_agents as tfa
import tensorflow as tf
# for reproducibility
np.random.seed(100)
tf.random.set_seed(100)
The environment. You can probably skip reading the whole class; it passes validate_py_environment just fine, so I don't think the problem is here.
# This environment is very simple. You start at position = 0
# and at every step you can either move left, right, or do nothing
# if you move right 3 times (and get to position = 3) then you win.
# otherwise, if you go left to position = -3, then you lose.
# you also lose if it takes more than 10 steps.
# losing gives you -1 reward, winning gives you +1
class SimpleGame(tfa.environments.py_environment.PyEnvironment):
def __init__(self):
# 0 - move left
# 1 - do nothing
# 2 - move right
self._action_spec = tfa.specs.array_spec.BoundedArraySpec(
shape = (),
dtype = np.int32,
minimum = 0,
maximum = 2,
name = 'action'
)
self._observation_spec = tfa.specs.array_spec.BoundedArraySpec(
shape = (1,),
dtype = np.int32,
minimum = -3,
maximum = 3,
name = 'observation'
)
self._position = 0
self._step_counter = 0
def action_spec(self):
return self._action_spec
def observation_spec(self):
return self._observation_spec
def _observe(self):
return np.array([self._position], dtype = np.int32)
def _reset(self):
self._position = 0
self._step_counter = 0
return tfa.trajectories.time_step.restart(self._observe())
def _step(self, action):
if abs(self._position) >= 3 or self._step_counter >= 10:
return self.reset()
self._step_counter += 1
if action == 0:
self._position -= 1
elif action == 1:
pass
elif action == 2:
self._position += 1
else:
raise ValueError('`action` should be 0 (left), 1 (do nothing) or 2 (right). You gave `%s`' % action)
reward = 0
if self._position >= 3:
reward = 1
elif self._position <= -3 or self._step_counter >= 10:
reward = -1
if reward != 0:
return tfa.trajectories.time_step.termination(
self._observe(),
reward
)
else: # this game isn't over yet
return tfa.trajectories.time_step.transition(
self._observe(),
reward = 0,
discount = 1.0
)
# no issue here:
tfa.environments.utils.validate_py_environment(SimpleGame(), episodes=10)
Environment instances
train_py_env = SimpleGame()
test_py_env = SimpleGame()
train_env = tfa.environments.tf_py_environment.TFPyEnvironment(train_py_env)
test_env = tfa.environments.tf_py_environment.TFPyEnvironment(test_py_env)
Agent creation
q_network = tfa.networks.sequential.Sequential([
tf.keras.layers.Dense(16, activation = 'relu'),
tf.keras.layers.Dense(3, activation = None)
])
agent = tfa.agents.dqn.dqn_agent.DqnAgent(
train_env.time_step_spec(),
train_env.action_spec(),
q_network = q_network,
optimizer = tf.keras.optimizers.Adam(),
td_errors_loss_fn = tfa.utils.common.element_wise_squared_loss,
n_step_update = 1
)
agent.initialize()
agent.train = tfa.utils.common.function(agent.train)
Policy evaluator. You can probably skip reading this; I believe it's correct.
# simulate some episodes by following the given policy
# return the average reward and episode length
def evaluate_policy(env, policy, episodes = 10):
total_reward = 0.0
total_steps = 0
for ep in range(episodes):
time_step = env.reset()
# this will always just add 0, but kept it for completion
total_reward += time_step.reward.numpy()[0]
while not time_step.is_last():
action_step = policy.action(time_step)
action_tensor = action_step.action
action = action_tensor.numpy()[0]
time_step = env.step(action)
total_reward += time_step.reward.numpy()[0]
total_steps += 1
average_reward = total_reward / episodes
average_ep_length = total_steps / episodes
return average_reward, average_ep_length
# evaluate policy before any training
avg_reward, avg_length = evaluate_policy(test_env, agent.policy)
print("initial policy gives average reward of %.2f after an average %d steps" % (avg_reward, avg_length))
#> initial policy gives average reward of -1.00 after an average 10 steps
Replay buffer
replay_buffer = tfa.replay_buffers.tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec = agent.collect_data_spec,
batch_size = train_env.batch_size,
max_length = 10000
)
replay_dataset = replay_buffer.as_dataset(
num_parallel_calls = 3,
sample_batch_size = 64,
num_steps = 2
).prefetch(3)
def record_experience(buffer, time_step, action_step, next_time_step):
buffer.add_batch(
tfa.trajectories.trajectory.from_transition(time_step, action_step, next_time_step)
)
replay_dataset_iterator = iter(replay_dataset)
Training loop
time_step = train_env.reset()
episode_length_history = []
reward_history = []
for step in range(10000 + 1): # +1 just so 10000 is included in the plot
for _ in range(10):
action_step = agent.collect_policy.action(time_step)
action_tensor = action_step.action
action = action_tensor.numpy()[0]
new_time_step = train_env.step(action)
reward = new_time_step.reward.numpy()[0]
record_experience(replay_buffer, time_step, action_step, new_time_step)
time_step = new_time_step
training_experience, unused_diagnostics_info = next(replay_dataset_iterator)
train_step = agent.train(training_experience)
loss = train_step.loss
print("step: %d, loss: %d" % (step, loss))
if step % 100 == 0:
avg_reward, avg_length = evaluate_policy(test_env, agent.policy)
print("average reward: %.2f average steps: %d" % (avg_reward, avg_length))
# record for the plot
reward_history.append(avg_reward)
episode_length_history.append(avg_length)
Plotting (unrelated to the problem, just included for completeness)
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.set_xlabel('train iterations')
fig.subplots_adjust(right=0.8)
reward_ax = ax
length_ax = ax.twinx()
length_ax.set_frame_on(True)
length_ax.patch.set_visible(False)
length_ax.set_ylabel('episode length', color = "orange")
length_ax.tick_params(axis = 'y', colors = "orange")
reward_ax.set_ylabel('reward', color = "blue")
reward_ax.tick_params(axis = 'y', colors = "blue")
train_iterations = [i * 100 for i in range(len(reward_history))]
reward_ax.plot(train_iterations, reward_history, color = "blue")
length_ax.plot(train_iterations, episode_length_history, color = "orange")
plt.show()
The cause of the problem is that the agent has no incentive to solve the game quickly, because going right and winning after 10 steps yields exactly the same reward as winning after 3 steps. And since the step counter is not part of the observation, the agent has no way to associate taking too long with losing; so it would occasionally take more than 10 steps, lose, and be unable to learn anything from that experience.
I fixed this by giving a -0.1 reward on every step, which incentivizes the agent to solve the environment in as few steps as possible (so that it never runs into the 10-step loss rule).
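A minimal sketch of how the end of _step could look with that change (not necessarily my exact final code; note that termination can no longer be detected via `reward != 0`, since every step now has a non-zero reward):
# inside SimpleGame._step, after self._position and self._step_counter are updated
won = self._position >= 3
lost = self._position <= -3 or self._step_counter >= 10
if won:
    return tfa.trajectories.time_step.termination(self._observe(), reward = 1.0)
elif lost:
    return tfa.trajectories.time_step.termination(self._observe(), reward = -1.0)
else:
    # small per-step penalty: finishing sooner is now strictly better than wandering
    return tfa.trajectories.time_step.transition(
        self._observe(),
        reward = -0.1,
        discount = 1.0
    )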
I also sped up learning by increasing the epsilon_greedy parameter of the DqnAgent constructor to 0.5 (the default is 0.1), so that it explores the whole environment more quickly.
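For reference, the agent construction then looks something like this (same arguments as before, just with the extra epsilon_greedy parameter, which controls how often collect_policy takes a random action):
agent = tfa.agents.dqn.dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network = q_network,
    optimizer = tf.keras.optimizers.Adam(),
    td_errors_loss_fn = tfa.utils.common.element_wise_squared_loss,
    n_step_update = 1,
    epsilon_greedy = 0.5 # default is 0.1; higher means more random exploration during collection
)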