ResourceExhaustedError or OOM when running an LSTM in TensorFlow

I am training my LSTM network in TensorFlow with the following code:

import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn import metrics
from sklearn.model_selection import train_test_split

%matplotlib inline

sns.set(style='whitegrid', palette='muted', font_scale=1.5)

rcParams['figure.figsize'] = 14, 8

RANDOM_SEED = 42

columns = ['user','activity','timestamp', 'x-axis', 'y-axis', 'z-axis']
df = pd.read_csv('data/WISDM_ar_v1.1_raw.txt', header = None, names = columns)
df = df.dropna()

df.head()

df.info()

##df['activity'].value_counts().plot(kind='bar', title='Training examples by activity type');
##df['user'].value_counts().plot(kind='bar', title='Training examples by user');

def plot_activity(activity, df):
    data = df[df['activity'] == activity][['x-axis', 'y-axis', 'z-axis']][:200]
    axis = data.plot(subplots=True, figsize=(16, 12), 
                     title=activity)
    for ax in axis:
        ax.legend(loc='lower left', bbox_to_anchor=(1.0, 0.5))


##plot_activity("Sitting", df)
##plot_activity("Standing", df)
##plot_activity("Walking", df)
##plot_activity("Jogging", df)


N_TIME_STEPS = 200
N_FEATURES = 3
step = 20
segments = []
labels = []
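# Slide a window of N_TIME_STEPS samples over the signal, advancing `step`
# samples at a time, and label each window with its most frequent activity.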
for i in range(0, len(df) - N_TIME_STEPS, step):
    xs = df['x-axis'].values[i: i + N_TIME_STEPS]
    ys = df['y-axis'].values[i: i + N_TIME_STEPS]
    zs = df['z-axis'].values[i: i + N_TIME_STEPS]
    label = stats.mode(df['activity'][i: i + N_TIME_STEPS])[0][0]
    segments.append([xs, ys, zs])
    labels.append(label)

np.array(segments).shape

reshaped_segments = np.asarray(segments, dtype= np.float32).reshape(-1, N_TIME_STEPS, N_FEATURES)
labels = np.asarray(pd.get_dummies(labels), dtype = np.float32)

reshaped_segments.shape
labels[0]

X_train, X_test, y_train, y_test = train_test_split(
        reshaped_segments, labels, test_size=0.2, random_state=RANDOM_SEED)

len(X_train)
len(X_test)

N_CLASSES = 6
N_HIDDEN_UNITS = 64
LEARNING_RATE = 0.0025  # assumed value; used by the optimizer below but missing from the snippet


def create_LSTM_model(inputs):
    W = {
        'hidden': tf.Variable(tf.random_normal([N_FEATURES, N_HIDDEN_UNITS])),
        'output': tf.Variable(tf.random_normal([N_HIDDEN_UNITS, N_CLASSES]))
    }
    biases = {
        'hidden': tf.Variable(tf.random_normal([N_HIDDEN_UNITS], mean=1.0)),
        'output': tf.Variable(tf.random_normal([N_CLASSES]))
    }

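    # Rearrange the batch to time-major order and flatten it so one matmul
    # applies the hidden projection to every time step at once:
    # [batch, N_TIME_STEPS, N_FEATURES] -> [N_TIME_STEPS * batch, N_FEATURES].
    # tf.split then recovers a list of N_TIME_STEPS tensors, each of shape
    # [batch, N_HIDDEN_UNITS], which is the input format static_rnn expects.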
    X = tf.transpose(inputs, [1, 0, 2])
    X = tf.reshape(X, [-1, N_FEATURES])
    hidden = tf.nn.relu(tf.matmul(X, W['hidden']) + biases['hidden'])
    hidden = tf.split(hidden, N_TIME_STEPS, 0)

    # Stack 2 LSTM layers
    lstm_layers = [tf.contrib.rnn.BasicLSTMCell(N_HIDDEN_UNITS, forget_bias=1.0) for _ in range(2)]
    lstm_layers = tf.contrib.rnn.MultiRNNCell(lstm_layers)

    outputs, _ = tf.contrib.rnn.static_rnn(lstm_layers, hidden, dtype=tf.float32)

    # Get output for the last time step
    lstm_last_output = outputs[-1]

    return tf.matmul(lstm_last_output, W['output']) + biases['output']


tf.reset_default_graph()

X = tf.placeholder(tf.float32, [None, N_TIME_STEPS, N_FEATURES], name="input")
Y = tf.placeholder(tf.float32, [None, N_CLASSES])


pred_Y = create_LSTM_model(X)

pred_softmax = tf.nn.softmax(pred_Y, name="y_")

loss = -tf.reduce_sum(Y * tf.log(pred_softmax))
optimizer = tf.train.GradientDescentOptimizer(learning_rate = LEARNING_RATE).minimize(loss)

correct_prediction = tf.equal(tf.argmax(pred_softmax,1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

cost_history = np.empty(shape=[1],dtype=float)
saver = tf.train.Saver()

session = tf.Session()
session.run(tf.global_variables_initializer())

batch_size = 10
total_batches = X_train.shape[0] // batch_size


for epoch in range(8):
    for b in range(total_batches):
        offset = (b * batch_size) % (y_train.shape[0] - batch_size)
        batch_x = X_train[offset:(offset + batch_size), :]
        batch_y = y_train[offset:(offset + batch_size), :]
        _, c = session.run([optimizer, loss], feed_dict={X: batch_x, Y: batch_y})
        cost_history = np.append(cost_history, c)
    print("Epoch: ", epoch, " Training Loss: ", c, " Training Accuracy: ",
          session.run(accuracy, feed_dict={X: X_train, Y: y_train}))

The dataset I am using comes from http://www.cis.fordham.edu/wisdm/dataset.php:

WISDM_ar_v1.1_raw.txt

However, when I run it, I get a ResourceExhausted / OOM error:

Traceback (most recent call last):
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1350, in _do_call
    return fn(*args)
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1329, in _run_fn
    status, run_metadata)
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 473, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[8784000,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
     [[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Reshape, Variable/read)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

     [[Node: add_1/_15 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_9637_add_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "", line 9, in <module>
    session.run(accuracy, feed_dict={X: X_train, Y: y_train}))
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 895, in run
    run_metadata_ptr)
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1128, in _run
    feed_dict_tensor, options, run_metadata)
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1344, in _do_run
    options, run_metadata)
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1363, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[8784000,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
     [[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Reshape, Variable/read)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

     [[Node: add_1/_15 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_9637_add_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

Caused by op 'MatMul', defined at:
  File "", line 1, in <module>
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\idlelib\run.py", line 130, in main
    ret = method(*args, **kwargs)
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\idlelib\run.py", line 357, in runcode
    exec(code, self.locals)
  File "", line 1, in <module>
  File "", line 13, in create_LSTM_model
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\ops\math_ops.py", line 2022, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 2799, in _mat_mul
    name=name)
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 3160, in create_op
    op_def=op_def)
  File "C:\Users\Chaine\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[8784000,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
     [[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Reshape, Variable/read)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

     [[Node: add_1/_15 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_9637_add_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
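(The hint at the end of the trace refers to TensorFlow's tf.RunOptions proto. A minimal sketch of how it could be enabled for the run call that crashes, reusing the names from the code above:)

run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
session.run(accuracy,
            feed_dict={X: X_train, Y: y_train},
            options=run_options)  # on OOM, also reports the live tensor allocations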

What could be causing this error?

UPDATE: I ran my code on another machine, and there it did not raise the error.

There is a major problem with your code: you do not have a static graph, which means you keep adding new ops to the graph as you go around the for loop. If you trace how the loss value gets evaluated in

session.run([loss]),

you will find that you are running

pred_Y = create_LSTM_model(X)

that is, part of the graph-construction code, multiple times while the for loop executes.

You do not want to do that. You should restructure your code so that you can fetch the loss value from the graph without re-creating the graph each time.
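Concretely, the safe pattern is: build every part of the graph exactly once, up front, and inside the loop only execute the already-built ops. A minimal sketch of that structure, reusing the names from your code:

# Graph construction happens exactly once, before any training loop.
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, N_TIME_STEPS, N_FEATURES], name="input")
Y = tf.placeholder(tf.float32, [None, N_CLASSES])
pred_Y = create_LSTM_model(X)  # the only call that adds model ops to the graph
pred_softmax = tf.nn.softmax(pred_Y, name="y_")
loss = -tf.reduce_sum(Y * tf.log(pred_softmax))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE).minimize(loss)

session = tf.Session()
session.run(tf.global_variables_initializer())

# The loop only *runs* existing ops; it never creates new ones.
for epoch in range(8):
    for b in range(total_batches):
        batch_x = X_train[b * batch_size:(b + 1) * batch_size]
        batch_y = y_train[b * batch_size:(b + 1) * batch_size]
        _, c = session.run([optimizer, loss], feed_dict={X: batch_x, Y: batch_y})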

Hope this helps.