How to use Tensorflow 2 Dataset API with Keras?
This question has been answered for Tensorflow 1 (e.g.: ), but that answer does not help with my use case.
Below is an example of a model with three float32 inputs and one float32 output. I have a large amount of data that does not fit into memory all at once, so it is split across separate files. I am trying to use the Dataset API to train the model by bringing in a portion of the training data at a time.
import tensorflow as tf
import tensorflow.keras.layers as layers
import numpy as np

# Create TF model of a given architecture (number of hidden layers, layer size, #outputs, activation function)
def create_model(h=2, l=64, activation='relu'):
    model = tf.keras.Sequential([
        layers.Dense(l, activation=activation, input_shape=(3,), name='input_layer'),
        *[layers.Dense(l, activation=activation) for _ in range(h)],
        layers.Dense(1, activation='linear', name='output_layer')])
    return model

# Load data (3 X variables, 1 Y variable) split into 5 files
# (for this example, just create a list of 5 numpy arrays)
list_of_training_datasets = [np.random.rand(10, 4).astype(np.float32) for _ in range(5)]
validation_dataset = np.random.rand(30, 4).astype(np.float32)

def data_generator():
    for data in list_of_training_datasets:
        x_data = data[:, 0:3]
        y_data = data[:, 3:4]
        yield (x_data, y_data)

# prepare model
model = create_model(h=2, l=64, activation='relu')
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam())

# load dataset
dataset = tf.data.Dataset.from_generator(data_generator, (np.float32, np.float32))

# fit model
model.fit(dataset, epochs=100,
          validation_data=(validation_dataset[:, 0:3], validation_dataset[:, 3:4]))
Running this, I get the error:
ValueError: Cannot take the length of shape with unknown rank.
Does anyone know how to get this working? I would also like to be able to use the batch dimension, e.g. to load two data files at a time.
You need to specify the shapes of your dataset along with the returned data types, like this:
dataset = tf.data.Dataset.from_generator(data_generator,
                                         (np.float32, np.float32),
                                         ((None, 3), (None, 1)))
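A side note: on newer TF 2 releases (2.4+), the positional output_types/output_shapes arguments are deprecated in favor of output_signature. A minimal sketch of what I believe is the equivalent call, assuming the same data_generator as above:

# Equivalent dataset construction via output_signature (TF 2.4+);
# each TensorSpec pins down both the dtype and the (rows, features) shape.
dataset = tf.data.Dataset.from_generator(
    data_generator,
    output_signature=(
        tf.TensorSpec(shape=(None, 3), dtype=tf.float32),   # x: any number of rows, 3 features
        tf.TensorSpec(shape=(None, 1), dtype=tf.float32)))  # y: matching rows, 1 target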
The following works, but I don't know whether it is the most efficient approach.
As far as I understand, if your training dataset is split into 10 pieces, you should set steps_per_epoch=10. This ensures that each epoch steps through all of the data once. As far as I understand, dataset.repeat() is needed because the dataset iterator is "used up" after the first epoch; .repeat() makes sure the iterator is created again once it has been exhausted.
import numpy as np
import tensorflow.keras.layers as layers
import tensorflow as tf

# Create TF model of a given architecture (number of hidden layers, layer size, #outputs, activation function)
def create_model(h=2, l=64, activation='relu'):
    model = tf.keras.Sequential([
        layers.Dense(l, activation=activation, input_shape=(3,), name='input_layer'),
        *[layers.Dense(l, activation=activation) for _ in range(h)],
        layers.Dense(1, activation='linear', name='output_layer')])
    return model

# Load data (3 X variables, 1 Y variable) split into 5 files
# (for this example, just create a list of 5 numpy arrays)
list_of_training_datasets = [np.random.rand(10, 4).astype(np.float32) for _ in range(5)]
steps_per_epoch = len(list_of_training_datasets)
validation_dataset = np.random.rand(30, 4).astype(np.float32)

def data_generator():
    for data in list_of_training_datasets:
        x_data = data[:, 0:3]
        y_data = data[:, 3:4]
        yield (x_data, y_data)

# prepare model
model = create_model(h=2, l=64, activation='relu')
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam())

# load dataset; each generator yield is one file-sized batch
dataset = tf.data.Dataset.from_generator(data_generator,
                                         output_types=(np.float32, np.float32),
                                         output_shapes=(tf.TensorShape([None, 3]),
                                                        tf.TensorShape([None, 1]))).repeat()

# fit model
model.fit(dataset.as_numpy_iterator(), epochs=10, steps_per_epoch=steps_per_epoch,
          validation_data=(validation_dataset[:, 0:3], validation_dataset[:, 3:4]))
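As for the follow-up about the batch dimension: one option (a sketch, not necessarily the most efficient) is to flatten the file-sized chunks with unbatch() and then re-batch to whatever size you want, e.g. 20 rows, which here corresponds to two 10-row files per step:

# Flatten the file-sized chunks into individual examples, then re-batch;
# batch(20) regroups them into fixed-size batches across file boundaries.
rebatched = (tf.data.Dataset.from_generator(data_generator,
                                            output_types=(np.float32, np.float32),
                                            output_shapes=(tf.TensorShape([None, 3]),
                                                           tf.TensorShape([None, 1])))
             .unbatch()   # (None,3)/(None,1) chunks -> single (3,)/(1,) examples
             .batch(20)   # e.g. two 10-row files' worth of examples per batch
             .repeat())

If you go this route, steps_per_epoch should be recomputed as the total number of examples divided by the batch size rather than the number of files.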