Zero padding not performed properly, I think
I seem to be running into some problems using the implementation provided here.
My situation is somewhat similar to the original poster's: I am trying to map an input to an output. The input is the samples of an audio file, and the output is a feature vector of length 14 (the length is fixed). The sequence length is variable because the audio files differ in length, so the vectors holding the samples end up with different lengths as well.
I am not solving a classification problem but a regression problem, so the task is a bit different.
My code looks like this:
import tensorflow as tf
from tensorflow.models.rnn import rnn_cell
from tensorflow.models.rnn import rnn
import numpy as np
import librosa
import glob
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
import os
from os import walk
from os.path import splitext
from os.path import join
import time
rng = np.random
np.set_printoptions(threshold=np.nan)
import functools
start_time = time.time()
print "Preprocessing"
def lazy_property(function):
    attribute = '_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper
## Class definition ##
class VariableSequenceLabelling:

    def __init__(self, data, target, num_hidden=200, num_layers=3):
        self.data = data
        self.target = target
        self._num_hidden = num_hidden
        self._num_layers = num_layers
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def length(self):
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    @lazy_property
    def prediction(self):
        # Recurrent network.
        output, _ = tf.nn.dynamic_rnn(
            rnn_cell.GRUCell(self._num_hidden),
            self.data,
            dtype=tf.float32,
            sequence_length=self.length,
        )
        # Softmax layer.
        max_length = int(self.target.get_shape()[1])
        num_classes = int(self.target.get_shape()[2])
        weight, bias = self._weight_and_bias(self._num_hidden, num_classes)
        # Flatten to apply same weights to all time steps.
        output = tf.reshape(output, [-1, self._num_hidden])
        prediction = tf.nn.softmax(tf.matmul(output, weight) + bias)
        prediction = tf.reshape(prediction, [-1, max_length, num_classes])
        return prediction

    @lazy_property
    def cost(self):
        # Compute cross entropy for each frame.
        cross_entropy = self.target * tf.log(self.prediction)
        cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        cross_entropy *= mask
        # Average over actual sequence lengths.
        cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
        cross_entropy /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(cross_entropy)

    @lazy_property
    def optimize(self):
        learning_rate = 0.0003
        optimizer = tf.train.AdamOptimizer(learning_rate)
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        mistakes = tf.not_equal(
            tf.argmax(self.target, 2), tf.argmax(self.prediction, 2))
        mistakes = tf.cast(mistakes, tf.float32)
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        mistakes *= mask
        # Average over actual sequence lengths.
        mistakes = tf.reduce_sum(mistakes, reduction_indices=1)
        mistakes /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(mistakes)

    @staticmethod
    def _weight_and_bias(in_size, out_size):
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)
#######################
#Converting file to .wav from .sph file format... God dammit!!!
#with open(train_filelist, 'r') as train_filelist, open(test_filelist, 'r') as test_filelist:
#    train_mylist = train_filelist.read().splitlines()
#    test_mylist = test_filelist.read().splitlines()
#    for line in train_mylist:
#        new_line = ' '.join(reversed(line))
#        index_start = new_line.find('h')
#        index_end = new_line.find('/')
#        edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
#        new_file = edited_line + 'wav'
#        os.system(line + ' >> ' + dnn_train + new_file)
#    for line in test_mylist:
#        new_line = ' '.join(reversed(line))
#        index_start = new_line.find('h')
#        index_end = new_line.find('/')
#        edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
#        new_file = edited_line + 'wav'
#        os.system(line + ' >> ' + dnn_test + new_file)
path_train = "/home/JoeS/kaldi-trunk/egs/start/s5/data/train"
path_test = "/home/JoeS/kaldi-trunk/egs/start/s5/data/test"
dnn_train = "/home/JoeS/kaldi-trunk/dnn/train/"
dnn_test = "/home/JoeS/kaldi-trunk/dnn/test/"
dnn = "/home/JoeS/kaldi-trunk/dnn/"
path = "/home/JoeS/kaldi-trunk/egs/start/s5/data/"
MFCC_dir = "/home/JoeS/kaldi-trunk/egs/start/s5/mfcc/raw_mfcc_train.txt"
train_filelist = path_train+"/wav_train.txt"
test_filelist = path_test+"/wav_test.txt"
os.chdir(path)
def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1: return
        yield start
        start += len(sub)  # use start += 1 to find overlapping matches
def load_sound_files(file_paths, names_input, data_input):
    raw_sounds = []
    names_output = []
    data_output = []
    class_output = []
    for fp in file_paths:
        X, sr = librosa.load(fp)
        raw_sounds.append(X)
        index = list(find_all(fp, '-'))
        input_index = names_input.index(fp[index[1]+1:index[2]])
        names_output.append(names_input[input_index])
        data_output.append(data_input[input_index])
        class_output.append(binify(data_input[input_index][0]))
    return raw_sounds, names_output, data_output, class_output
def generate_list_of_names_data(file_path):
    # Preprocess: extract name and data
    name = []
    data = []
    with open(MFCC_dir) as mfcc_feature_list:
        content = [x.strip('\n') for x in mfcc_feature_list.readlines()]  # remove endlines
        start_index_data = 0
        end_index_data = 2
        for number in range(0, 42):
            start = list(find_all(content[start_index_data], '['))[0]
            end = list(find_all(content[end_index_data], ']'))[0]
            end_name = list(find_all(content[start_index_data], ' '))[0]
            substring_data = content[start_index_data][start+1:] + content[end_index_data][:end]
            substring_name = content[start_index_data][:end_name]
            arr = np.array(substring_data.split(), dtype=float)
            data.append(arr)
            name.append(substring_name)
            start_index_data = start_index_data + 3
            end_index_data = end_index_data + 3
    return name, data
files_train_path = [dnn_train+f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_path = [dnn_test+f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]
files_train_name = [f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_name = [f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]
os.chdir(dnn_train)
train_name,train_data = generate_list_of_names_data(files_train_path)
train_data, train_names, train_output_data, train_class_output = load_sound_files(files_train_path,train_name,train_data)
max_length = 0  ## Used for variable sequence input
for element in train_data:
    if element.size > max_length:
        max_length = element.size
NUM_EXAMPLES = len(train_data)/2
test_data = train_data[NUM_EXAMPLES:]
test_output = train_output_data[NUM_EXAMPLES:]
train_data = train_data[:NUM_EXAMPLES]
train_output = train_output_data[:NUM_EXAMPLES]
print("--- %s seconds ---" % (time.time() - start_time))
##-------------------MAIN----------------------------##
if __name__ == '__main__':
    data = tf.placeholder(tf.float32, [None, max_length, 1])
    target = tf.placeholder(tf.float32, [None, 14, 1])
    model = VariableSequenceLabelling(data, target)
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    for epoch in range(10):
        for sample_set in range(100):
            batch_train = train_data[sample_set]
            batch_target = train_output[sample_set]
            sess.run(model.optimize, {data: batch_train, target: batch_target})
        test_set = test_data[epoch]
        test_set_output = test_output[epoch]
        error = sess.run(model.error, {data: test_set, target: test_set_output})
        print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))
The error message is
Traceback (most recent call last):
  File "tensorflow_datapreprocess_mfcc_extraction_rnn.py", line 239, in <module>
    sess.run(model.optimize, {data: batch_train, target: batch_target})
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 340, in run
    run_metadata_ptr)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 553, in _run
    % (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
ValueError: Cannot feed value of shape (63945,) for Tensor u'Placeholder:0', which has shape '(?, 138915, 1)'
As far as I can tell, the error message I am getting comes from the use of max_length together with an input of the wrong size, which would mean the input is not being zero-padded correctly? Or am I wrong? If so, how do I fix it? The solution I am looking for does not seem to come from TensorFlow itself; is there another framework that handles this natively, and given the missing functionality, would switching to a different framework be advisable?
TensorFlow placeholders must match the shape of the data fed into them. Here you are trying to feed a tensor of shape [63945] into a placeholder of shape [?, 138915, 1]. These shapes are incompatible, and TensorFlow complains.
You have to pad your input tensor to the required shape with numpy before feeding it to TensorFlow; I would suggest numpy.pad. (Note also that the number of dimensions has to match; you can fix that with numpy.reshape, or by changing the placeholder shape and reshaping inside TensorFlow.)
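For example, a minimal sketch (assuming your batch_train is a 1-D numpy array of audio samples and max_length is the placeholder's time dimension, 138915 here; the helper name is made up for illustration):

import numpy as np

# Hypothetical helper, not from the original code: pad a 1-D sample array with
# trailing zeros up to the placeholder's time dimension and add the batch and
# feature axes, i.e. (n,) -> (1, max_length, 1).
def pad_for_placeholder(samples, max_length):
    padded = np.pad(samples, (0, max_length - len(samples)),
                    mode='constant', constant_values=0.0)
    return padded.reshape(1, max_length, 1)

# Usage along the lines of the question's training loop:
# feed_train = pad_for_placeholder(batch_train, 138915)
# feed_target = np.reshape(batch_target, (1, 14, 1))
# sess.run(model.optimize, {data: feed_train, target: feed_target})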
When working with long sequences, padding everything to a common sequence length often leads to memory problems. The standard workaround is to group the sequences into buckets of similar length, as sketched below. The TensorFlow seq2seq example may be a useful source of inspiration: https://www.tensorflow.org/versions/r0.11/tutorials/seq2seq/index.html
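As a rough illustration of that idea (a sketch only, not the seq2seq tutorial's API; the bucket boundaries are arbitrary example values), you could group the 1-D sample arrays by length and zero-pad only within each bucket:

import numpy as np

# Hypothetical bucketing sketch: boundaries are made-up example values.
BUCKET_BOUNDARIES = [20000, 60000, 140000]

def bucket_and_pad(sequences):
    buckets = dict((b, []) for b in BUCKET_BOUNDARIES)
    for seq in sequences:
        for boundary in BUCKET_BOUNDARIES:
            if len(seq) <= boundary:
                buckets[boundary].append(seq)
                break
        # sequences longer than the largest boundary are skipped in this sketch
    padded = {}
    for boundary, seqs in buckets.items():
        if not seqs:
            continue
        # zero-pad every sequence in the bucket to that bucket's length
        batch = np.zeros((len(seqs), boundary, 1), dtype=np.float32)
        for i, seq in enumerate(seqs):
            batch[i, :len(seq), 0] = seq
        padded[boundary] = batch
    return padded

Each bucket can then be fed through a placeholder sized to that bucket, so short files are never padded all the way out to the longest sequence in the data set.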