How to obtain the encoder from the make_csv_dataset?
I used this code from the tutorial:
def get_train_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=10,  # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        select_columns=CSV_COLUMNS,
        **kwargs)
    return dataset
Then I created the training set:
raw_train_data = get_train_dataset(train_file_path)
and trained the model.
The question is: how do I get the encoder that was applied to the training data, so that I can encode new text the same way?
I load the new data, but it does not use the same encoder as the training data:
raw_test_data = get_test_dataset(new_data_file_path)
How do I obtain the encoder when using tf.data.experimental.make_csv_dataset?
Edit:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np

train_file_path = "./train.csv"
test_file_path = "./test.csv"
LABEL_COLUMN = 'target'
CSV_COLUMNS = ['text', 'target']
def get_train_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=10,  # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        select_columns=CSV_COLUMNS,
        **kwargs)
    return dataset
def get_test_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=10,  # Artificially small to make examples easier to show.
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        **kwargs)
    return dataset
sample_submission = pd.read_csv("./sample_submission.csv")
raw_train_data = get_train_dataset(train_file_path)
raw_test_data = get_test_dataset(test_file_path)
def extract_train_tensor(example, label):
    print(example)
    return example['text'], label

def extract_test_tensor(example):
    print(example)
    return example['text']
test_data = raw_test_data.map(lambda ex: extract_test_tensor(ex))
test_data_size = len(list(test_data))  # note: this counts batches of 10, not rows
print("test size: ", test_data_size)
train_data_all = raw_train_data.map(lambda ex, label: extract_train_tensor(ex, label))
train_data_all = train_data_all.shuffle(10000)
print(train_data_all)
train_data_size = len(list(train_data_all))  # note: this also counts batches
print("train size: ", train_data_size)
train_size = int(0.7 * train_data_size)
val_size = int(0.3 * train_data_size)
train_data = train_data_all.take(train_size)
val_data = train_data_all.skip(train_size)
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[],
                           dtype=tf.string, trainable=True)
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_data,
                    epochs=20,
                    validation_data=val_data,
                    verbose=1)
predictions = model.predict(test_data)
predictions = np.where(predictions > 0.5, 1, 0)
sample_submission['target'] = predictions
print(predictions)
The two calls to get_train_dataset() and get_test_dataset() generate the training and test data. The training data is split into training and validation sets, and the accuracy is high. However, the accuracy on the test data is very low. Both datasets are strings of text, and I did not do any encoding.
tf.data.experimental.make_csv_dataset does not do any encoding.
From its documentation:
Reads CSV files into a dataset, where each element is a (features, labels) tuple that corresponds to a batch of CSV rows. The features dictionary maps feature column names to Tensors containing the corresponding feature data, and labels is a Tensor containing the batch's label data.
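For example, peeking at a single batch of the raw_train_data defined in the question shows that the 'text' feature is still a tensor of raw, unencoded strings; a minimal sketch:

# Take one batch and inspect it: make_csv_dataset yields
# (features dict, labels) tuples holding the raw CSV values.
for features, labels in raw_train_data.take(1):
    print(features['text'])  # string tensor of shape (10,), raw text
    print(labels)            # the batch's 'target' values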
So your get_test_dataset() function does not need to know anything about how get_train_dataset() produced its dataset. In your pipeline, the actual text encoding happens inside the model: the hub.KerasLayer embeds the raw strings, so exactly the same mapping is applied to the training and the test data automatically.
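If you do want an explicit encoder that is fitted on the training text and reused for new text, one option is a TextVectorization layer. This is only a sketch, assuming a TF version that ships tf.keras.layers.TextVectorization, and reusing the train_data and test_data variables from the question:

import tensorflow as tf

# Learn the vocabulary from the training text only...
encoder = tf.keras.layers.TextVectorization(max_tokens=10000)
encoder.adapt(train_data.map(lambda text, label: text))

# ...then apply the same fitted encoder to both datasets.
encoded_train = train_data.map(lambda text, label: (encoder(text), label))
encoded_test = test_data.map(lambda text: encoder(text))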
Regarding the low test performance:
You train and validate your model on samples drawn from repeated random shuffles of the same data:
train_data = train_data_all.take(train_size)
val_data = train_data_all.skip(train_size)
Because shuffle() reshuffles the dataset on every iteration by default (reshuffle_each_iteration=True), take() and skip() can draw different samples each time the datasets are traversed, so the same samples can end up in both the training and the validation set; your validation therefore does not give a true measurement of the model's accuracy.
The test set, by contrast, contains samples the model has never seen, so predictions on it are a reliable measure of its performance.
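One way to make the split leak-free is to shuffle once with a fixed seed and keep that order stable across iterations before calling take()/skip(); a minimal sketch under that assumption:

# Shuffle deterministically so take()/skip() always partition
# the same ordering into two disjoint sets.
train_data_all = raw_train_data.map(extract_train_tensor)
train_data_all = train_data_all.shuffle(
    10000, seed=42, reshuffle_each_iteration=False)

train_data = train_data_all.take(train_size)  # first 70% of the batches
val_data = train_data_all.skip(train_size)    # remaining 30%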