我认为零填充未正确执行

Zero padding not performed properly I think

我在使用提供的实现时似乎遇到了一些问题 here。 我和发帖的人有点相似的情况,我正在尝试映射和输入到输出。输入是音频文件的样本,输出是长度为 14(长度是静态的)的特征向量。序列长度是可变的,因为音频文件的长度不同,使得包含样本的向量也变得不同长度。

我不是在解决分类问题,而是在解决回归问题,所以任务有点不同。

我的代码如下所示:

import tensorflow as tf
from tensorflow.models.rnn import rnn_cell
from tensorflow.models.rnn import rnn
import numpy as np
import librosa
import glob
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
import os
from os import walk
from os.path import splitext
from os.path import join
import time
rng = np.random
np.set_printoptions(threshold=np.nan)
import functools

start_time = time.time()

print "Preprocessing"

def lazy_property(function):
    attribute = '_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper
## Class definition ##
class VariableSequenceLabelling:

    def __init__(self, data, target, num_hidden=200, num_layers=3):
        self.data = data
        self.target = target
        self._num_hidden = num_hidden
        self._num_layers = num_layers
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def length(self):
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    @lazy_property
    def prediction(self):
        # Recurrent network.
        output, _ = tf.nn.dynamic_rnn(
             rnn_cell.GRUCell(self._num_hidden),
            self.data,
            dtype=tf.float32,
            sequence_length=self.length,
        )
        # Softmax layer.
        max_length = int(self.target.get_shape()[1])
        num_classes = int(self.target.get_shape()[2])
        weight, bias = self._weight_and_bias(self._num_hidden, num_classes)
        # Flatten to apply same weights to all time steps.
        output = tf.reshape(output, [-1, self._num_hidden])
        prediction = tf.nn.softmax(tf.matmul(output, weight) + bias)
        prediction = tf.reshape(prediction, [-1, max_length, num_classes])
        return prediction

    @lazy_property
    def cost(self):
        # Compute cross entropy for each frame.
        cross_entropy = self.target * tf.log(self.prediction)
        cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        cross_entropy *= mask
        # Average over actual sequence lengths.
        cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
        cross_entropy /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(cross_entropy)

    @lazy_property
    def optimize(self):
        learning_rate = 0.0003
        optimizer = tf.train.AdamOptimizer(learning_rate)
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        mistakes = tf.not_equal(
            tf.argmax(self.target, 2), tf.argmax(self.prediction, 2))
        mistakes = tf.cast(mistakes, tf.float32)
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        mistakes *= mask
        # Average over actual sequence lengths.
        mistakes = tf.reduce_sum(mistakes, reduction_indices=1)
        mistakes /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(mistakes)

    @staticmethod
    def _weight_and_bias(in_size, out_size):
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)

#######################
#Converting file to .wav from .sph file format... God dammit!!!

#with open(train_filelist, 'r') as train_filelist, open(test_filelist, 'r') as test_filelist:
    #train_mylist = train_filelist.read().splitlines()
    #test_mylist = test_filelist.read().splitlines()
    #for line in train_mylist:
        #new_line = ' '.join(reversed(line))
        #index_start = new_line.find('h')
        #index_end = new_line.find('/')
        #edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
        #new_file = edited_line + 'wav'
        #os.system(line + ' >> ' + dnn_train + new_file)
    #for line in test_mylist:
        #new_line = ' '.join(reversed(line))
        #index_start = new_line.find('h')
        #index_end = new_line.find('/')
        #edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
        #new_file = edited_line + 'wav'
        #os.system(line + ' >> ' + dnn_test + new_file)


path_train =  "/home/JoeS/kaldi-trunk/egs/start/s5/data/train"
path_test =  "/home/JoeS/kaldi-trunk/egs/start/s5/data/test"
dnn_train = "/home/JoeS/kaldi-trunk/dnn/train/"
dnn_test = "/home/JoeS/kaldi-trunk/dnn/test/"
dnn = "/home/JoeS/kaldi-trunk/dnn/"
path  = "/home/JoeS/kaldi-trunk/egs/start/s5/data/"
MFCC_dir = "/home/JoeS/kaldi-trunk/egs/start/s5/mfcc/raw_mfcc_train.txt"

train_filelist = path_train+"/wav_train.txt"
test_filelist = path_test+"/wav_test.txt"

os.chdir(path)

def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1: return
        yield start
        start += len(sub) # use start += 1 to find overlapping matches

def load_sound_files(file_paths ,  names_input, data_input):
    raw_sounds = []
    names_output = []
    data_output = []
    class_output = []
    for fp in file_paths:
        X,sr = librosa.load(fp)
        raw_sounds.append(X)
        index = list(find_all(fp,'-'))
        input_index = names_input.index(fp[index[1]+1:index[2]])
        names_output.append(names_input[input_index])
        data_output.append(data_input[input_index])
        class_output.append(binify(data_input[input_index][0]))
    return raw_sounds, names_output, data_output, class_output

def generate_list_of_names_data(file_path):
    # Proprocess
    # extract name and data
    name = []
    data = []
    with open(MFCC_dir) as mfcc_feature_list:
        content = [x.strip('\n') for x in mfcc_feature_list.readlines()] # remove endlines
        start_index_data = 0
        end_index_data = 2
        for number in range(0,42):
            start = list(find_all(content[start_index_data],'['))[0]
            end = list(find_all(content[end_index_data],']'))[0]
            end_name = list(find_all(content[start_index_data],' '))[0]
            substring_data = content[start_index_data][start+1 :]+content[end_index_data][: end]
            substring_name = content[start_index_data][:end_name]
            arr = np.array(substring_data.split(), dtype = float)
            data.append(arr)
            name.append(substring_name)
            start_index_data = start_index_data + +3
            end_index_data = end_index_data +3
    return name, data

files_train_path = [dnn_train+f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_path = [dnn_test+f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]

files_train_name = [f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_name = [f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]

os.chdir(dnn_train)

train_name,train_data = generate_list_of_names_data(files_train_path)
train_data, train_names, train_output_data, train_class_output = load_sound_files(files_train_path,train_name,train_data)

max_length = 0 ## Used for variable sequence input

for element in train_data:
    if element.size > max_length:
        max_length = element.size

NUM_EXAMPLES = len(train_data)/2

test_data = train_data[NUM_EXAMPLES:]
test_output = train_output_data[NUM_EXAMPLES:]

train_data = train_data[:NUM_EXAMPLES]
train_output = train_output_data[:NUM_EXAMPLES]
print("--- %s seconds ---" % (time.time() - start_time))
##-------------------MAIN----------------------------##

if __name__ == '__main__':
    data = tf.placeholder(tf.float32, [None, max_length, 1])
    target = tf.placeholder(tf.float32, [None, 14, 1])
    model = VariableSequenceLabelling(data, target)
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    for epoch in range(10):
        for sample_set in range(100):
            batch_train = train_data[sample_set]
            batch_target = train_output[sample_set]
            sess.run(model.optimize, {data: batch_train, target: batch_target})
        test_set = test_data[epoch]
        test_set_output = test_output[epoch]
        error = sess.run(model.error, {data: test_set, target: test_set_output})
        print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))

错误信息是

  Traceback (most recent call last):
      File "tensorflow_datapreprocess_mfcc_extraction_rnn.py", line 239, in <module>
        sess.run(model.optimize, {data: batch_train, target: batch_target})
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 340, in run
        run_metadata_ptr)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 553, in _run
        % (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
    ValueError: Cannot feed value of shape (63945,) for Tensor u'Placeholder:0', which has shape '(?, 138915, 1)'

据我所知,我收到的错误消息是由于使用了 max_length,并且得到的输入大小不正确 - 这意味着输入没有正确填充零? .. 还是我错了?如果是这样,我该如何解决?我寻求的解决方案似乎并非来自 tensorflow,是否有其他框架在本地执行此操作 - 由于缺少功能,是否建议使用其他框架?

Tensorflow 占位符的形状必须与提供给它们的数据的形状相匹配。在这里,您试图将 [63945] 张量提供给 [?, 138915, 1] 形状的占位符。这些是不兼容的形状和 Tensorflow 抱怨。

在将输入张量输入 Tensorflow 之前,您必须使用 numpy 将输入张量填充到所需的形状。我建议使用 numpy.pad。 (另请注意维数必须匹配——您可以使用 numpy.reshape 来解决这个问题,或者更改占位符形状并使用 Tensorflow 重塑。)

处理长序列时,通常填充到常见的序列长度会导致内存问题。标准的解决方法是将序列分桶到相似长度的桶中。 Tensorflow seq2seq 示例可能是有用的灵感来源:https://www.tensorflow.org/versions/r0.11/tutorials/seq2seq/index.html