DQN not working properly
I am trying to write my own DQN in Python using Keras. I think my logic is correct. I am trying it on the CartPole environment, but the reward does not increase even after 50,000 episodes. Any help would be appreciated. For now I am not focusing on the Dueling or Double DQN parts.
import random
from collections import deque

import gym
import numpy as np


class ReplayBuffer:
    def __init__(self, size=100000):
        self.buffer = deque(maxlen=size)

    def sample(self, sample_size):
        return random.sample(self.buffer, sample_size)

    def add_to_buffer(self, experience):
        self.buffer.append(experience)


def generator(number):
    return (i for i in range(number))


def epsilon_greedy_policy(q_values, epsilon):
    # Epsilon-greedy: spread epsilon uniformly, put the remaining mass on the greedy action.
    number_of_actions = len(q_values)
    action_probabilites = np.ones(number_of_actions, dtype=float) * epsilon / number_of_actions
    best_action = np.argmax(q_values)
    action_probabilites[best_action] += (1 - epsilon)
    return np.random.choice(number_of_actions, p=action_probabilites)


class DQNAgent:
    def __init__(self, env, model, gamma):
        self.env = env
        self.model = model
        self.replay_buffer = ReplayBuffer()
        self.gamma = gamma
        self.state_dim = env.observation_space.shape[0]

    def train_model(self, training_data, training_label):
        self.model.fit(training_data, training_label, batch_size=32, verbose=0)

    def predict_one(self, state):
        return self.model.predict(state.reshape(1, self.state_dim)).flatten()

    def experience_replay(self, experiences):
        # import pdb; pdb.set_trace()
        states, actions, rewards, next_states = zip(
            *[[experience[0], experience[1], experience[2], experience[3]] for experience in experiences])
        states = np.asarray(states)
        place_holder_state = np.zeros(self.state_dim)
        next_states_ = np.asarray(
            [(place_holder_state if next_state is None else next_state) for next_state in next_states])
        q_values_for_states = self.model.predict(states)
        q_values_for_next_states = self.model.predict(next_states_)
        for x in generator(len(experiences)):
            y_true = rewards[x]
            if next_states[x].any():
                y_true += self.gamma * (np.amax(q_values_for_next_states[x]))
            q_values_for_states[x][actions[x]] = y_true
        self.train_model(states, q_values_for_states)

    def fit(self, number_of_episodes, batch_size):
        for _ in generator(number_of_episodes):
            total_reward = 0
            state = self.env.reset()
            while True:
                # self.env.render()
                q_values_for_state = self.predict_one(state)
                action = epsilon_greedy_policy(q_values_for_state, 0.1)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.add_to_buffer([state, action, reward, next_state])
                state = next_state
                total_reward += reward
                if len(self.replay_buffer.buffer) > 50:
                    experience = self.replay_buffer.sample(batch_size)
                    self.experience_replay(experience)
                if done:
                    break
            print("Total reward:", total_reward)


env = gym.make('CartPole-v0')
model = create_model(env.observation_space.shape[0], env.action_space.n)
agent = DQNAgent(env, model, 0.99)
agent.fit(100000, 32)
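(create_model is not shown in the post; for completeness, a minimal Keras sketch that matches the call above could look like the following. The architecture and hyperparameters are assumptions, not the original code.)

from keras.models import Sequential
from keras.layers import Dense

def create_model(state_dim, number_of_actions):
    # A small MLP mapping a state to one Q-value per action (assumed architecture).
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=state_dim))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(number_of_actions, activation='linear'))
    model.compile(optimizer='adam', loss='mse')
    return model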
The error is in these two lines:
q_values_for_states=self.model.predict(states)
q_values_for_next_states=self.model.predict(next_states_)
You are using the same network for Q and for its target. In the DQN paper, the authors use two separate networks and update the target network every X steps by copying the weights of the Q network into it.
The correct equations are (in pseudocode):
T = R + gamma * max(QT(next_state)) # target
E = T - Q(state) # error
So your lines should be:
q_values_for_states=self.model.predict(states)
q_values_for_next_states=self.target_model.predict(next_states_)
You then update target_model every X steps by copying the weights of model, as described above.
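A minimal sketch of what that could look like with Keras, assuming target_model has the same architecture as model (clone_model copies only the architecture, so the weights are copied explicitly; the helper names are illustrative, not from the original post):

from keras.models import clone_model

def make_target_model(model):
    # Build a target network with the same architecture and the same initial
    # weights as the online Q-network.
    target_model = clone_model(model)
    target_model.set_weights(model.get_weights())
    return target_model

def update_target_model(model, target_model):
    # Hard update: copy the online weights into the target network.
    # Call this every X environment steps, not after every gradient update.
    target_model.set_weights(model.get_weights())

With self.target_model created this way in __init__, experience_replay would use it only for q_values_for_next_states, while training still updates self.model.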
In more recent papers (e.g., the DDPG paper), instead of copying the weights every X steps, a soft update is performed at every step, i.e.
QT_weights = tau*Q_weights + (1-tau)*QT_weights
What you are doing instead amounts to updating the target network at every step. This makes the algorithm very unstable, as the authors of DQN state in their paper.
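A sketch of that soft (Polyak) update applied to the Keras weight lists, with tau = 0.001 as in the DDPG paper (the function name is illustrative):

def soft_update_target_model(model, target_model, tau=0.001):
    # QT_weights = tau * Q_weights + (1 - tau) * QT_weights, layer by layer.
    new_weights = [tau * w + (1.0 - tau) * w_target
                   for w, w_target in zip(model.get_weights(), target_model.get_weights())]
    target_model.set_weights(new_weights)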
Also, I would increase the minimum number of samples collected before learning starts. You start learning when only 50 samples have been collected, which is far too few. The paper uses many more; for CartPole I would wait until 1000 samples have been collected (considering that you should be able to balance the pole for at least around 1000 steps).
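Concretely, that only means raising the warm-up threshold in fit, for example:

# Start learning only once the buffer holds enough transitions
# (1000 as suggested above; this is a tunable hyperparameter).
if len(self.replay_buffer.buffer) > 1000:
    experience = self.replay_buffer.sample(batch_size)
    self.experience_replay(experience)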
In the fit function I had to add:
if done:
    next_state = None
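Note that with terminal transitions stored as next_state = None, the existing check next_states[x].any() would raise on None, so the terminal test in experience_replay has to check for None instead, for example:

# Terminal transitions are stored with next_state = None, so test for None
# rather than calling .any() on the stored array.
if next_states[x] is not None:
    y_true += self.gamma * np.amax(q_values_for_next_states[x])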