How to show the class distribution in Dataset object in Tensorflow
I am working on a multi-class classification task using my own images.
import tensorflow as tf

filenames = [] # a list of filenames
labels = [] # a list of labels corresponding to the filenames
full_ds = tf.data.Dataset.from_tensor_slices((filenames, labels))
The full dataset will then be shuffled and split into train, valid and test datasets:
full_ds_size = len(filenames)
full_ds = full_ds.shuffle(buffer_size=full_ds_size*2, seed=128, reshuffle_each_iteration=False) # fixed seed and no reshuffling, so the take/skip splits below stay disjoint across epochs
train_ds_size = int(0.64 * full_ds_size)
valid_ds_size = int(0.16 * full_ds_size)
train_ds = full_ds.take(train_ds_size)
remaining = full_ds.skip(train_ds_size)
valid_ds = remaining.take(valid_ds_size)
test_ds = remaining.skip(valid_ds_size)
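A quick sanity check on the resulting split sizes, assuming TF 2.3+ where Dataset.cardinality() is available:

# Each split's cardinality is statically known here, since the pipeline
# starts from from_tensor_slices and only applies shuffle/take/skip.
for name, ds in [("train", train_ds), ("valid", valid_ds), ("test", test_ds)]:
    print(name, ds.cardinality().numpy())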
Now I am struggling to see how each class is distributed across train_ds, valid_ds and test_ds. An ugly solution would be to iterate over all elements of each dataset and count the occurrences of each class. Is there a better way to solve this?
My ugly solution:
import collections

def get_class_distribution(dataset):
    class_distribution = {}
    for element in dataset.as_numpy_iterator():
        label = element[1]
        if label in class_distribution:
            class_distribution[label] += 1
        else:
            class_distribution[label] = 1  # first occurrence counts as one
    # sort the dict by key
    class_distribution = collections.OrderedDict(sorted(class_distribution.items()))
    return class_distribution
train_ds_class_dist = get_class_distribution(train_ds)
valid_ds_class_dist = get_class_distribution(valid_ds)
test_ds_class_dist = get_class_distribution(test_ds)
print(train_ds_class_dist)
print(valid_ds_class_dist)
print(test_ds_class_dist)
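A slightly tidier variant of the same brute-force count uses collections.Counter (a sketch; the helper name is hypothetical):

from collections import Counter

def get_class_distribution_counter(dataset):
    # Counter handles the first-occurrence bookkeeping automatically
    counts = Counter(label for _, label in dataset.as_numpy_iterator())
    return dict(sorted(counts.items()))

print(get_class_distribution_counter(train_ds))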
The following answer assumes:
- There are five classes.
- The labels are integers from 0 to 4.
Modify it to suit your needs.
Define a counter function:
def count_class(counts, batch, num_classes=5):
    labels = batch['label']  # for a (filename, label) tuple dataset, use batch[1] instead
    for i in range(num_classes):
        cc = tf.cast(labels == i, tf.int32)
        counts[i] += tf.reduce_sum(cc)
    return counts
Use it with a reduce operation:
initial_state = dict((i, 0) for i in range(5))
counts = train_ds.reduce(initial_state=initial_state,
                         reduce_func=count_class)
print([(k, v.numpy()) for k, v in counts.items()])
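The question's dataset yields (filename, label) tuples rather than dicts; under that assumption, a small wrapper keeps the same counter working:

counts = train_ds.reduce(
    initial_state=dict((i, 0) for i in range(5)),
    # wrap the tuple element so count_class finds the 'label' key it expects
    reduce_func=lambda counts, batch: count_class(counts, {'label': batch[1]}))
print([(k, v.numpy()) for k, v in counts.items()])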
A solution inspired by user650654's answer, using only TensorFlow primitives (tf.unique_with_counts instead of a for loop):
In theory this should perform better and scale better to large datasets, large batches, or large class counts.
num_classes = 5

@tf.function
def count_class(counts, batch):
    # tf.unique_with_counts needs a 1-D tensor, so the dataset must be batched
    y, _, c = tf.unique_with_counts(batch[1])
    return tf.tensor_scatter_nd_add(counts, tf.expand_dims(y, axis=1), c)

counts = train_ds.batch(128).reduce(  # batch the tuple dataset first; 128 is arbitrary
    initial_state=tf.zeros(num_classes, tf.int32),
    reduce_func=count_class)
print(counts.numpy())
A similar and simpler version using numpy actually performed better for my simple use-case:
import numpy as np

count = np.zeros(num_classes, dtype=np.int32)
# df_train is assumed to be a batched dataset (labels is a 1-D tensor per step),
# e.g. train_ds.batch(32); tf.unique_with_counts requires a 1-D input
for images, labels in df_train:
    y, _, c = tf.unique_with_counts(labels)
    count[y.numpy()] += c.numpy()
print(count)
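When all the labels fit comfortably in memory, a further simplification (a sketch, assuming an unbatched (filename, label) dataset) is to pull them out once and let np.bincount count them:

all_labels = np.array([label for _, label in train_ds.as_numpy_iterator()])
# minlength keeps classes with zero occurrences in the output
print(np.bincount(all_labels, minlength=num_classes))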