How to use AttentionMechanism with MultiRNNCell and dynamic_decode?
I want to create a multi-layer, dynamic-RNN-based decoder that uses an attention mechanism. To do this, I first create an attention mechanism:
attention_mechanism = BahdanauAttention(num_units=ATTENTION_UNITS,
                                        memory=encoder_outputs,
                                        normalize=True)
I then use AttentionWrapper to wrap an LSTM cell with the attention mechanism:
attention_wrapper = AttentionWrapper(cell=self._create_lstm_cell(DECODER_SIZE),
                                     attention_mechanism=attention_mechanism,
                                     output_attention=False,
                                     alignment_history=True,
                                     attention_layer_size=ATTENTION_LAYER_SIZE)
where self._create_lstm_cell is defined as follows:
@staticmethod
def _create_lstm_cell(cell_size):
    return BasicLSTMCell(cell_size)
I then do some bookkeeping (e.g. creating my MultiRNNCell, creating the initial state, creating the TrainingHelper, etc.):
attention_zero = attention_wrapper.zero_state(batch_size=tf.flags.FLAGS.batch_size, dtype=tf.float32)

# define initial state
initial_state = attention_zero.clone(cell_state=encoder_final_states[0])

training_helper = TrainingHelper(inputs=self.y,                   # feed in ground truth
                                 sequence_length=self.y_lengths)  # feed in sequence lengths

layered_cell = MultiRNNCell(
    [attention_wrapper] + [ResidualWrapper(self._create_lstm_cell(cell_size=DECODER_SIZE))
                           for _ in range(NUMBER_OF_DECODER_LAYERS - 1)])

decoder = BasicDecoder(cell=layered_cell,
                       helper=training_helper,
                       initial_state=initial_state)

decoder_outputs, decoder_final_state, decoder_final_sequence_lengths = dynamic_decode(
    decoder=decoder,
    maximum_iterations=tf.flags.FLAGS.max_number_of_scans // 12,
    impute_finished=True)
But I get the following error: AttributeError: 'LSTMStateTuple' object has no attribute 'attention'.

What is the correct way to add an attention mechanism to a multi-layer MultiRNNCell decoder used with dynamic_decode?
Have you tried the attention wrapper provided by tf.contrib? Here is an example that uses both an attention wrapper and dropout:
cells = []
for i in range(n_layers):
    cell = tf.contrib.rnn.LSTMCell(n_hidden, state_is_tuple=True)
    cell = tf.contrib.rnn.AttentionCellWrapper(
        cell, attn_length=40, state_is_tuple=True)
    cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=0.5)
    cells.append(cell)

cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
init_state = cell.zero_state(batch_size, tf.float32)
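As a rough usage sketch continuing from the snippet above (assuming inputs is a [batch, time, features] tensor and seq_lengths holds the per-example lengths; both are placeholders introduced only for illustration), the stacked cell can then be driven with tf.nn.dynamic_rnn:

# assumed placeholders for the sketch: a batch of sequences and their lengths
inputs = tf.placeholder(tf.float32, [batch_size, None, n_hidden])
seq_lengths = tf.placeholder(tf.int32, [batch_size])

# run the stacked, attention-wrapped cell over the whole input sequence,
# starting from the zero state built above
outputs, final_state = tf.nn.dynamic_rnn(cell,
                                         inputs,
                                         sequence_length=seq_lengths,
                                         initial_state=init_state)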
What you need to do is create the multi-layer cell first and then wrap it with the AttentionWrapper. Here is an example:
def decoding_layer(dec_input, encoder_state,
                   target_sequence_length, max_target_sequence_length,
                   rnn_size,
                   num_layers, target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, decoding_embedding_size, encoder_outputs):
    """
    Create decoding layer
    :param dec_input: Decoder input
    :param encoder_state: Encoder state
    :param target_sequence_length: The lengths of each sequence in the target batch
    :param max_target_sequence_length: Maximum length of target sequences
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param target_vocab_size: Size of target vocabulary
    :param batch_size: The size of the batch
    :param keep_prob: Dropout keep probability
    :param decoding_embedding_size: Decoding embedding size
    :param encoder_outputs: Encoder outputs (the memory for the attention mechanism)
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    # 1. Decoder embedding
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

    # 2. Construct the multi-layer decoder cell
    def create_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        drop = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
        return drop

    dec_cell = tf.contrib.rnn.MultiRNNCell([create_cell(rnn_size) for _ in range(num_layers)])

    # 3. Attention: wrap the whole multi-layer cell, not a single layer
    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=rnn_size, memory=encoder_outputs)
    attn_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attention_mechanism,
                                                    attention_layer_size=rnn_size // 2)

    # 4. Initial state: clone the wrapper's zero state with the encoder's final state
    attn_zero = attn_cell.zero_state(batch_size, tf.float32)
    attn_zero = attn_zero.clone(cell_state=encoder_state)

    # 5. Output projection onto the target vocabulary
    # (Dense comes from tensorflow.python.layers.core)
    output_layer = Dense(target_vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    with tf.variable_scope("decode"):
        train_decoder_out = decoding_layer_train(attn_zero, attn_cell, dec_embed_input,
                                                 target_sequence_length, max_target_sequence_length,
                                                 output_layer, keep_prob)

    with tf.variable_scope("decode", reuse=True):
        infer_decoder_out = decoding_layer_infer(attn_zero, attn_cell, dec_embeddings,
                                                 target_vocab_to_int['<GO>'], target_vocab_to_int['<EOS>'],
                                                 max_target_sequence_length,
                                                 target_vocab_size, output_layer, batch_size, keep_prob)

    return (train_decoder_out, infer_decoder_out)
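decoding_layer_train and decoding_layer_infer are not shown above; as a hedged sketch of what the training branch might look like (the function name and argument order are simply taken from the call above), it boils down to a TrainingHelper, a BasicDecoder, and dynamic_decode:

def decoding_layer_train(initial_state, dec_cell, dec_embed_input,
                         target_sequence_length, max_target_sequence_length,
                         output_layer, keep_prob):
    # keep_prob is unused here; dropout is already applied inside the cell via DropoutWrapper.
    # Feed ground-truth embeddings at each step (teacher forcing).
    helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                               sequence_length=target_sequence_length)

    # `initial_state` is the AttentionWrapperState cloned from the encoder state above.
    decoder = tf.contrib.seq2seq.BasicDecoder(cell=dec_cell,
                                              helper=helper,
                                              initial_state=initial_state,
                                              output_layer=output_layer)

    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder,
                                                      impute_finished=True,
                                                      maximum_iterations=max_target_sequence_length)
    return outputs

The essential difference from the code in the question is that the AttentionWrapper now sits outside the MultiRNNCell, so the decoder's state is an AttentionWrapperState whose cell_state holds the per-layer LSTM states; nesting the wrapper inside the MultiRNNCell hands it a plain LSTMStateTuple, which is what triggered the original AttributeError.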