基于内部值的 Numpy 数组操作

Question

我正在尝试完成一项奇怪的任务。我需要在不使用 sklearn 的情况下完成以下操作，最好使用 numpy:

给定一个数据集，将数据分成 5 个相等的 "folds" 或分区
在每个分区中，将数据拆分为 "training" 和 "testing" 组，按 80/20 拆分
关键在于：您的数据集带有 class 标签。以一个包含 100 个实例的数据集为例，class A 有 33 个样本，class B 有 67 个样本。我应该创建 5 个折叠，每个折叠包含 20 个数据实例；在每个折叠中，class A 占 6 或 7 个（约 1/3），其余的属于 class B。

我的问题是：虽然我能够适当地拆分数据，但我不知道如何正确地返回每个折叠的测试集和训练集；更重要的是，我不知道如何按正确的比例划分每个 class 的元素数量。

我当前的代码如下，卡住的地方已用注释标出：

import numpy

def csv_to_array(file):
    """Load a comma-separated file of numbers into a NumPy float array.

    Args:
        file: Path of the CSV file to read.

    Returns:
        The array produced by ``numpy.loadtxt`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a number.
    """
    # The context manager closes the handle even when parsing fails.
    with open(file, 'r') as handle:
        # loadtxt already converts every field to float, so no per-row
        # conversion pass is needed afterwards.
        return numpy.loadtxt(handle, delimiter=',')

def five_cross_fold_validation(dataset):
    """Split a labelled dataset into five stratified folds, each split 80/20.

    The class label of a row is taken from its last column. Rows are shuffled
    on a copy (the caller's array is left untouched), grouped by class, and
    dealt out to the five folds in turn, so every fold receives each class in
    roughly the same proportion as the full dataset. Inside a fold the same
    dealing trick sends every fifth row to the testing set.

    Args:
        dataset: 2-D array-like with one sample per row and the class label
            in the last column.

    Returns:
        A list of five ``(fold_training, fold_testing)`` tuples of 2-D
        arrays. Across the five folds every input row appears exactly once.

    Raises:
        ValueError: If ``dataset`` is not two-dimensional or has no columns.
    """
    num_folds = 5
    test_stride = 5  # one row in five goes to testing, i.e. an 80/20 split

    dataset = numpy.asarray(dataset)
    if dataset.ndim != 2 or dataset.shape[1] == 0:
        raise ValueError("dataset must be a 2-D array with a label column")

    # permutation() returns a shuffled copy; shuffle() would reorder the
    # caller's array in place.
    shuffled = numpy.random.permutation(dataset)

    # A stable sort keeps the random order inside each class while turning
    # every class into one contiguous run of rows.
    by_class = shuffled[numpy.argsort(shuffled[:, -1], kind="stable")]

    folds = []
    for offset in range(num_folds):
        # Every fifth row of a class-sorted array: each class contributes
        # either floor or ceil of (class size / 5) rows to this fold.
        fold = by_class[offset::num_folds]

        # The fold is still class-sorted, so striding again keeps the class
        # proportions roughly equal in the training and testing parts.
        is_test = numpy.zeros(fold.shape[0], dtype=bool)
        is_test[::test_stride] = True
        folds.append((fold[~is_test], fold[is_test]))

    return folds

def confirm_size(folds):
    """Return the sum of ``len(fold)`` over every fold in ``folds``."""
    return sum(len(fold) for fold in folds)


def main():
    """Load the ecoli CSV, build the folds, and print both size figures."""
    print("BEGINNING CFV")
    dataset = csv_to_array('Classification/ecoli.csv')
    # Row count of the loaded data, for comparison with the fold total below.
    print(len(dataset))
    fold_total = confirm_size(five_cross_fold_validation(dataset))
    print(fold_total)

# Run the demo as soon as this module is loaded.
main()

此外，作为参考，我附上了我正在使用的 csv（它是 UCI Ecoli Dataset 的修改版）。这里的 classes 是最后一列中的值，即 0, 1, 2, 3, 4。需要注意的是，每个 class 的样本数量并不相等。

        0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
        0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
        0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
        0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
        0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
        0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
        0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
        0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
        0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
        0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
        0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
        0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
        0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
        0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
        0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
        0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
        0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
        0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
        0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
        0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
        0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
        0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
        0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
        0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
        0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
        0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
        0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
        0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
        0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
        0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
        0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
        0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
        0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
        0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
        0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
        0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
        0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
        0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
        0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
        0,0.38,0.48,0.5,0.42,0.48,0.55,0
        0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
        0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
        0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
        0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
        0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
        0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
        0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
        0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
        0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
        0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
        0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
        0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
        0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
        0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
        0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
        0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
        0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
        0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
        0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
        0.27,0.35,0.48,0.5,0.51,0.77,0.79,1

Answer 1

Edit 我用 A = np.random.permutation(A) 替换了 np.random.shuffle(A)，唯一的区别是它不会改变输入数组。这对这段代码没有任何影响，但总的来说更安全。

想法是使用 numpy.random.permutation 对输入进行随机打乱。一旦行被打乱，我们只需要用一个固定大小的滑动窗口（这里窗口大小是输入大小的 20%）依次取出每个测试集，对应的训练集则由其余所有元素组成。

这将保留所有子集上的原始类分布，即使我们按顺序选择它们也是如此，因为我们打乱了输入。

以下代码遍历 test/train 组组合：

import numpy as np

def csv_to_array(file):
  """Read a comma-separated numeric file and return it as a NumPy array."""
  with open(file, 'r') as handle:
    return np.loadtxt(handle, delimiter=',')

def classes_distribution(A):
  """Print the share of rows belonging to each class found in array A.

  The class label of a row is its last column. One line is printed per
  label that actually occurs in A, in ascending label order.

  Args:
    A: 2-D array with the class label in the last column.
  """
  total_size = A.shape[0]
  # Walk the labels that are really present instead of range(nb_classes):
  # a subset that lacks a class would otherwise be reported under the wrong
  # label numbers and lose its highest label.
  labels, counts = np.unique(A[:, -1], return_counts=True)
  for label, class_size in zip(labels, counts):
    # Show whole-number labels stored as floats (0.0) as plain ints (0).
    name = int(label) if float(label).is_integer() else label
    class_p = class_size/total_size
    print(f"\t P(class_{name}) = {class_p:.3f}")

def random_samples(A, test_set_p=0.2):
  """Yield successive test/train splits of a shuffled copy of A.

  The rows of A are shuffled on a copy, then a window of
  ``int(test_set_p * rows)`` rows (at least one row) slides over the
  shuffled data. Each window position is yielded as the test set together
  with all remaining rows as the training set, so every row is used as
  test data exactly once. When the row count is not a multiple of the
  window size, the final test set holds the smaller remainder.

  Args:
    A: Array whose first axis indexes the samples.
    test_set_p: Fraction of the rows to put in each test set.

  Yields:
    A dict with the keys ``"test"`` and ``"train"``.

  Raises:
    ValueError: If ``test_set_p`` is not positive.
  """
  if test_set_p <= 0:
    raise ValueError("test_set_p must be positive")
  A = np.random.permutation(A)
  n_rows = A.shape[0]
  # Clamp to one row: with few rows the truncated size would be 0, and
  # range() refuses a zero step.
  sample_size = max(1, int(test_set_p*n_rows))
  for start in range(0, n_rows, sample_size):
    end = start + sample_size
    yield {
      "test": A[start:end],
      "train": np.concatenate((A[:start], A[end:]), axis=0)
    }

def main():
  """Print the class distribution of ecoli.csv and of every training set."""
  data = csv_to_array('ecoli.csv')
  print("Input set shape: ", data.shape)
  print("Input set class distribution:")
  classes_distribution(data)
  print("Training sets class distributions:")
  for split in random_samples(data):
    # Both halves are unpacked so a reader can plug their own logic in here.
    test_set, training_set = split["test"], split["train"]
    classes_distribution(training_set)
    print("---")

# Run the demo as soon as this module is loaded.
main()

它产生以下形式的输出：

Input set shape:  (169, 8)
Input set class distribution:
     P(class_0) = 0.308
     P(class_1) = 0.213
     P(class_2) = 0.207
     P(class_3) = 0.118
     P(class_4) = 0.154
Training sets class distributions:
     P(class_0) = 0.316
     P(class_1) = 0.206
     P(class_2) = 0.199
     P(class_3) = 0.118
     P(class_4) = 0.162
...

基于内部值的 Numpy 数组操作

Numpy Array Manipulation Based off of Internal Values

python

numpy

machine-learning

python-3.x

numpy-ndarray