Tensorflow returns ValueError with tf.data.Dataset object, but works fine with np.array
I'm working on a digit classifier model using this Kaggle dataset: https://www.kaggle.com/c/digit-recognizer/data?select=test.csv
Fitting the model with np.array objects works fine, but I can't get it to accept a tensorflow Dataset object. Here is the code I use to build the train/validation data as Dataset objects:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from functools import partial
from sklearn import model_selection as ms

train_df = pd.read_csv('train.csv')

def prepare_data(features_df, labels_df, test_ratio=0.1, val_ratio=0.1):
    # Reshape the flat pixel columns into 28x28 images with a channel axis
    features = features_df.to_numpy().reshape(features_df.shape[0], 28, 28)
    features = features[..., np.newaxis]
    labels = labels_df.to_numpy()

    X_train, X_test, y_train, y_test = ms.train_test_split(
        features,
        labels,
        test_size=test_ratio
    )
    X_train, X_valid, y_train, y_valid = ms.train_test_split(
        X_train,
        y_train,
        test_size=val_ratio
    )

    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_ds = train_ds.shuffle(2048).repeat()
    valid_ds = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
    valid_ds = valid_ds.shuffle(512).repeat()
    test_ds = tf.data.Dataset.from_tensor_slices((
        X_test,
        y_test
    ))
    return train_ds, valid_ds, test_ds
DefaultConv2D = partial(keras.layers.Conv2D,
                        kernel_size=4, activation='relu', padding="SAME")

model = keras.models.Sequential([
    DefaultConv2D(filters=128, kernel_size=7, input_shape=[28, 28, 1]),
    keras.layers.MaxPooling2D(pool_size=2),
    DefaultConv2D(filters=128),
    keras.layers.MaxPooling2D(pool_size=2),
    DefaultConv2D(filters=256),
    keras.layers.MaxPooling2D(pool_size=2),
    keras.layers.Flatten(),
    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(units=64, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(units=10, activation='softmax'),
])
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    verbose=1,
    patience=20,
    mode='max',
    restore_best_weights=True
)

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model.fit(
    train_ds,
    epochs=100,
    validation_data=valid_ds,
    callbacks=[early_stopping,],
    steps_per_epoch=64
)
I get this error message:
ValueError: Input 0 of layer sequential_2 is incompatible with the layer: : expected min_ndim=4, found ndim=3. Full shape received: [28, 28, 1]
But if I change the code to use np.array objects instead, it works just fine:
test_ratio = 0.1
val_ratio = 0.1

features = features_df.to_numpy().reshape(features_df.shape[0], 28, 28)
features = features[..., np.newaxis]
labels = labels_df.to_numpy()

X_train, X_test, y_train, y_test = ms.train_test_split(
    features,
    labels,
    test_size=test_ratio
)
X_train, X_valid, y_train, y_valid = ms.train_test_split(
    X_train,
    y_train,
    test_size=val_ratio
)

history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping,],
    steps_per_epoch=64
)
I've looked at several similar questions, but so far nothing has worked.
It looks like you forgot to add a .batch() call at the end of your tf.data.Dataset pipeline, which is why the error refers to the batch dimension: the model expects 4-D input (batch, height, width, channels), but each element of an unbatched dataset is a single [28, 28, 1] sample. As I understand it, a tf.data.Dataset behaves much like a Python generator rather than holding the whole dataset in memory, so its cardinality (the number of data points) is unknown. When you pass a number to steps_per_epoch while using a tf.data.Dataset, the model pulls that many elements from the dataset each epoch; since the cardinality is unknown it cannot form batches for you ahead of time, and because you never batched the data it ends up taking individual samples. When you pass NumPy arrays instead, Keras knows the number of data points and batches them for you automatically (batch_size defaults to 32), which is why that version works.
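As a rough sketch of the fix (reusing the prepare_data function from the question; the batch size of 32 below is only illustrative, not something from the original code), batching each dataset before returning it gives the model the batched 4-D input it expects:

batch_size = 32  # illustrative value, not taken from the original question

# inside prepare_data, after the train/validation/test split:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.shuffle(2048).repeat().batch(batch_size)

valid_ds = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
valid_ds = valid_ds.shuffle(512).repeat().batch(batch_size)

test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)

With both datasets repeated and batched, steps_per_epoch=64 then means 64 batches per epoch, and since the validation dataset also repeats indefinitely you will likely need to pass validation_steps to model.fit as well so Keras knows when a validation pass is complete.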