tf_agents doesn't properly learn a simple environment

I successfully followed this official tensorflow tutorial for training an agent to solve the 'CartPole-v0' gym environment. The only place I diverged from the tutorial is that I did not use reverb, because it is not supported on Windows. I tried modifying the example to train an agent to solve my own (extremely simple) environment, but it fails to converge to a solution after 10,000 training iterations, which I feel should be more than enough.

I have tried tweaking the number of training iterations, the learning rate, the batch size, the discount, and everything else I could think of. None of it had any effect on the result.

I expected the agent to converge on a policy that always gets a +1 reward (ideally within just a few hundred iterations, since this environment is so simple), rather than one that occasionally dips to -1. Instead, this is a plot of the actual result:

(The text is small, so to clarify: orange is the episode length in steps and blue is the average reward. The x-axis is the number of training iterations, from 0 to 10,000.)

The code

Everything here is run top to bottom, but I've split it into separate code blocks to make it easier to read/debug.

Imports

import numpy as np
import tf_agents as tfa
import tensorflow as tf

# for reproducibility
np.random.seed(100)
tf.random.set_seed(100)

The environment. You can probably skip reading the whole class; it passes validate_py_environment just fine, so I don't think the problem is here.

# This environment is very simple. You start at position = 0
# and at every step you can either move left, right, or do nothing
# if you move right 3 times (and get to position = 3) then you win.
# otherwise, if you go left to position = -3, then you lose.
# you also lose if it takes more than 10 steps.
# losing gives you -1 reward, winning gives you +1
class SimpleGame(tfa.environments.py_environment.PyEnvironment):
    def __init__(self):
        # 0 - move left
        # 1 - do nothing
        # 2 - move right
        self._action_spec = tfa.specs.array_spec.BoundedArraySpec(
            shape = (),
            dtype = np.int32,
            minimum = 0,
            maximum = 2,
            name = 'action'
        )
        
        self._observation_spec = tfa.specs.array_spec.BoundedArraySpec(
            shape = (1,),
            dtype = np.int32,
            minimum = -3,
            maximum = 3,
            name = 'observation'
        )
        
        self._position = 0
        
        self._step_counter = 0

    def action_spec(self):
        return self._action_spec
    
    def observation_spec(self):
        return self._observation_spec
    
    def _observe(self):
        return np.array([self._position], dtype = np.int32)
    
    def _reset(self):
        self._position = 0
        self._step_counter = 0
        return tfa.trajectories.time_step.restart(self._observe())
    
    def _step(self, action):
        if abs(self._position) >= 3 or self._step_counter >= 10:
            return self.reset()
        
        self._step_counter += 1
        
        if action == 0:
            self._position -= 1
        elif action == 1:
            pass
        elif action == 2:
            self._position += 1
        else:
            raise ValueError('`action` should be 0 (left), 1 (do nothing) or 2 (right). You gave `%s`' % action)
        
        reward = 0
        if self._position >= 3:
            reward = 1
        elif self._position <= -3 or self._step_counter >= 10:
            reward = -1
        
        if reward != 0:
            return tfa.trajectories.time_step.termination(
                self._observe(),
                reward
            )
        else: # this game isn't over yet
            return tfa.trajectories.time_step.transition(
                self._observe(),
                reward = 0, 
                discount = 1.0 
            )

# no issue here:
tfa.environments.utils.validate_py_environment(SimpleGame(), episodes=10)

Environment instances

train_py_env = SimpleGame()
test_py_env = SimpleGame()
train_env = tfa.environments.tf_py_environment.TFPyEnvironment(train_py_env)
test_env = tfa.environments.tf_py_environment.TFPyEnvironment(test_py_env)

Agent creation

q_network = tfa.networks.sequential.Sequential([
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(3, activation = None) 
])

agent = tfa.agents.dqn.dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network = q_network,
    optimizer = tf.keras.optimizers.Adam(),
    td_errors_loss_fn = tfa.utils.common.element_wise_squared_loss,
    n_step_update = 1
)

agent.initialize()

agent.train = tfa.utils.common.function(agent.train)

Policy evaluator. You can probably skip reading this too; I'm fairly confident it is correct.

# simulate some episodes by following the given policy
# return the average reward and episode length
def evaluate_policy(env, policy, episodes = 10):
    total_reward = 0.0
    total_steps = 0
    
    for ep in range(episodes):
        time_step = env.reset()
        
        # this will always just add 0, but I kept it for completeness
        total_reward += time_step.reward.numpy()[0]
        
        while not time_step.is_last(): 
            action_step = policy.action(time_step)
            action_tensor = action_step.action
            action = action_tensor.numpy()[0]
            
            time_step = env.step(action)
            
            total_reward += time_step.reward.numpy()[0]
            total_steps += 1
        
    average_reward = total_reward / episodes
    average_ep_length = total_steps / episodes
    return average_reward, average_ep_length

# evaluate policy before any training
avg_reward, avg_length = evaluate_policy(test_env, agent.policy)
print("initial policy gives average reward of %.2f after an average %d steps" % (avg_reward, avg_length))
#> initial policy gives average reward of -1.00 after an average 10 steps

Replay buffer

replay_buffer = tfa.replay_buffers.tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec = agent.collect_data_spec,
    batch_size = train_env.batch_size,
    max_length = 10000
)

replay_dataset = replay_buffer.as_dataset(
    num_parallel_calls = 3, 
    sample_batch_size = 64,
    num_steps = 2
).prefetch(3)

def record_experience(buffer, time_step, action_step, next_time_step):
    buffer.add_batch(
        tfa.trajectories.trajectory.from_transition(time_step, action_step, next_time_step)
    )

replay_dataset_iterator = iter(replay_dataset)

Training process

time_step = train_env.reset()

episode_length_history = []
reward_history = []

for step in range(10000 + 1): # +1 just so 10000 is included in the plot
    
    for _ in range(10):
        action_step = agent.collect_policy.action(time_step)
        
        action_tensor = action_step.action
        action = action_tensor.numpy()[0]
        new_time_step = train_env.step(action)
        
        reward = new_time_step.reward.numpy()[0]
        
        record_experience(replay_buffer, time_step, action_step, new_time_step)
        
        time_step = new_time_step
    
    training_experience, unused_diagnostics_info = next(replay_dataset_iterator)
    
    train_step = agent.train(training_experience)
    loss = train_step.loss
    
    print("step: %d, loss: %d" % (step, loss))
    
    if step % 100 == 0:
        avg_reward, avg_length = evaluate_policy(test_env, agent.policy)
        print("average reward: %.2f average steps: %d" % (avg_reward, avg_length))
        
        # record for the plot
        reward_history.append(avg_reward)
        episode_length_history.append(avg_length)

Plotting (not related to the issue, just included for completeness)

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.set_xlabel('train iterations')

fig.subplots_adjust(right=0.8)

reward_ax = ax
length_ax = ax.twinx()

length_ax.set_frame_on(True)
length_ax.patch.set_visible(False)
length_ax.set_ylabel('episode length', color = "orange")
length_ax.tick_params(axis = 'y', colors = "orange")

reward_ax.set_ylabel('reward', color = "blue")
reward_ax.tick_params(axis = 'y', colors = "blue")

train_iterations = [i * 100 for i in range(len(reward_history))]
reward_ax.plot(train_iterations, reward_history, color = "blue")
length_ax.plot(train_iterations, episode_length_history, color = "orange")

plt.show()

The cause of the problem was that the agent had no incentive to solve the environment quickly: winning by moving right on the 10th step yields exactly the same reward as winning on the 3rd step. And because the step counter is not part of the observation, the agent cannot possibly associate taking too long with losing; so it occasionally runs past 10 steps, loses, and is unable to learn anything from that experience.
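
(For illustration only, since it is not the fix I went with: the invisible time limit could also be addressed by exposing the step counter in the observation, so the agent can in principle learn that time is running out. A hypothetical sketch, reusing the SimpleGame class from above; the class name is made up for this example:)

# Hypothetical alternative, NOT the fix described below: make the step
# counter visible to the agent by adding it to the observation.
class SimpleGameWithClock(SimpleGame):
    def __init__(self):
        super().__init__()
        # observation is now [position, step_counter]
        self._observation_spec = tfa.specs.array_spec.BoundedArraySpec(
            shape = (2,),
            dtype = np.int32,
            minimum = [-3, 0],
            maximum = [3, 10],
            name = 'observation'
        )

    def _observe(self):
        return np.array([self._position, self._step_counter], dtype = np.int32)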

I fixed this by giving a -0.1 reward on every step, which incentivizes the agent to solve the environment in as few steps as possible (so that it never runs into the 10-step losing rule).
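
To make that concrete, here is a minimal sketch of what the changed method could look like; it mirrors the _step above, and only the non-terminal reward changes from 0 to -0.1:

    # replacement for SimpleGame._step; everything else in the class stays the same
    def _step(self, action):
        if abs(self._position) >= 3 or self._step_counter >= 10:
            return self.reset()

        self._step_counter += 1

        if action == 0:
            self._position -= 1
        elif action == 2:
            self._position += 1
        elif action != 1:
            raise ValueError('`action` should be 0 (left), 1 (do nothing) or 2 (right). You gave `%s`' % action)

        if self._position >= 3:
            return tfa.trajectories.time_step.termination(self._observe(), reward = 1)
        elif self._position <= -3 or self._step_counter >= 10:
            return tfa.trajectories.time_step.termination(self._observe(), reward = -1)
        else:
            # small penalty on every non-terminal step, so shorter episodes
            # collect more total reward
            return tfa.trajectories.time_step.transition(
                self._observe(),
                reward = -0.1,
                discount = 1.0
            )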

I also sped up learning by increasing the epsilon_greedy parameter of the DqnAgent constructor to 0.5 (the default is 0.1), so that the agent explores the whole environment more quickly.
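
For reference, the agent construction with the higher exploration rate would look like this; only the epsilon_greedy argument is new compared to the code above:

agent = tfa.agents.dqn.dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network = q_network,
    optimizer = tf.keras.optimizers.Adam(),
    td_errors_loss_fn = tfa.utils.common.element_wise_squared_loss,
    n_step_update = 1,
    epsilon_greedy = 0.5  # default is 0.1; higher means the collect policy acts randomly more often
)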