聚合随机采样的列以迭代更大的 bin 大小

Question

我有一个像这样的矩阵：

mat <- matrix(c(1,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,
                2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
                0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
                0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
                0,0,0,0,1,0,0,1,0,1,1,0,0,1,0,1,
                1,1,0,0,0,0,0,0,1,0,1,2,1,0,0,0), nrow=16, ncol=6)
dimnames(mat) <- list(c("a", "c", "f", "h", "i", "j", "l", "m",
                        "p", "q", "s", "t", "u", "v","x", "z"), 
                      c("1", "2", "3", "4", "5", "6"))

我想对列进行分组或分类，然后聚合每个组的数据。对大小为 x 的 bin 重复采样 n 次。对于 x+1 的 bin 大小将重复此过程。

对于第一次迭代，两个随机列被合并。我想在不替换的情况下进行采样，这样两列的组合就不会被采样两次（但是，如果一个列与不同的列配对，则可以使用两次）。聚合被定义为计算合并列的行总和。聚合结果将作为新列添加到该 bin 大小的结果矩阵中。结果矩阵中的列数将限制为随机采样的 bin 数。

Bin 大小继续变得越来越大。对于下一次迭代，bin 大小增加到 3，以便聚合 3 个随机 selected 列。汇总数据将放入不同的结果矩阵中。这个过程将一直持续到 bin 达到数据框的大小为止，在这种情况下无法进行重采样。所有结果矩阵都将放入矩阵列表中。

下面是给定上述矩阵的前两个 bin 大小的预期结果 resultList。

# Bin size =2 
# The randomly sampled columns are columns 1&2, 2&3, 3&4, 4&5, 5&6. 
mat1 <- matrix(c(3,0,0,0,1,0,1,0,0,0,0,0,0,0,2,0,
                 2,0,1,1,2,0,0,0,0,0,0,0,0,0,1,0,
                 0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
                 0,1,0,0,1,1,0,1,0,1,1,0,0,1,0,1,
                 1,1,0,0,1,0,0,1,1,1,2,2,1,1,0,1), nrow=16)
dimnames(mat1) <- list(c("a", "c", "f", "h", "i", "j", "l", "m", 
                         "p", "q", "s", "t", "u", "v","x", "z"), 
                       c("1_2", "2_3", "3_4", "4_5", "5_6"))

# Bin size= 3
# The randomly selected columns to be joined are columns 1,2&3, 
# 2,3&4, 3,4&5, 4,5&6. 
mat2 <- matrix(c(3,0,1,1,2,0,1,0,0,0,0,0,0,0,3,0,
                 2,1,1,1,2,1,0,0,0,0,0,0,0,0,1,0,
                 0,1,1,1,2,1,0,1,0,1,1,0,0,1,0,1,
                 1,2,0,0,1,1,0,1,1,1,2,2,1,1,0,1), nrow=16)
dimnames(mat2) <- list(c("a", "c", "f", "h", "i", "j", "l", "m",
                         "p", "q", "s", "t", "u", "v","x", "z"), 
                       c("1_2_3", "2_3_4", "3_4_5", "4_5_6"))
resultList <- list(mat1, mat2)

我在这里发布了一个类似的替代分箱技术问题：

这是我尝试随机合并 selected 列并将每个 bin 大小的结果放入矩阵列表中的尝试。我尝试使用 sample select j 随机列，执行 rowSums 并删除那些 selected j 配对列，这样它们就不会重复在下一次迭代中：

lapply(seq_len(ncol(mat) - 1), function(j) 
  do.call(cbind, 
          lapply(sample(ncol(mat) - j, size= ), function(i) 
            rowSums(mat[, i:(i - j)]))))

Answer 1

根据您希望在最终输出中有多少列，我们可以修改该方法，但目前这提供了所有可能的组合。

#Get column names of the matrices
all_cols <- colnames(mat)

#Select bin value from 2:ncol(mat)
total_out <- lapply(seq_len(ncol(mat))[-1], function(j) {
      #Create all combinations taking j items at a time
      temp <- combn(all_cols, j, function(x) 
              #Take rowSums for the current combination
              #Also paste column names to assign column names later
              list(rowSums(mat[, x]), paste0(x, collapse = "_")), simplify = FALSE)
      #Combine rowSums matrix 
      new_mat <- sapply(temp, `[[`, 1)
      #Assign column names
      colnames(new_mat) <- sapply(temp, `[[`, 2)
      #Return new matrix
      new_mat
})

当前输出看起来像

total_out
#[[1]]
#  1_2 1_3 1_4 1_5 1_6 2_3 2_4 2_5 2_6 3_4 3_5 3_6 4_5 4_6 5_6
#a   3   1   1   1   2   2   2   2   3   0   0   1   0   1   1
#c   0   0   1   0   1   0   1   0   1   1   0   1   1   2   1
#f   0   1   0   0   0   1   0   0   0   1   1   1   0   0   0
#h   0   1   0   0   0   1   0   0   0   1   1   1   0   0   0
#i   1   1   0   1   0   2   1   2   1   1   2   1   1   0   1
#j   0   0   1   0   0   0   1   0   0   1   0   0   1   1   0
#l   1   1   1   1   1   0   0   0   0   0   0   0   0   0   0
#m   0   0   0   1   0   0   0   1   0   0   1   0   1   0   1
#p   0   0   0   0   1   0   0   0   1   0   0   1   0   1   1
#q   0   0   0   1   0   0   0   1   0   0   1   0   1   0   1
#s   0   0   0   1   1   0   0   1   1   0   1   1   1   1   2
#t   0   0   0   0   2   0   0   0   2   0   0   2   0   2   2
#u   0   0   0   0   1   0   0   0   1   0   0   1   0   1   1
#v   0   0   0   1   0   0   0   1   0   0   1   0   1   0   1
#x   3   2   2   2   2   1   1   1   1   0   0   0   0   0   0
#z   0   0   0   1   0   0   0   1   0   0   1   0   1   0   1
#...
#....
#....
#[[5]]
#  1_2_3_4_5_6
#a           4
#c           2
#f           1
#h           1
#i           3
#j           1
#l           1
#m           1
#p           1
#q           1
#s           2
#t           2
#u           1
#v           1
#x           3
#z           1

请注意，total_out 中共有 5 (ncol - 1) 个矩阵，列数为

length(total_out)
#[1] 5

sapply(total_out, ncol)
#[1] 15 20 15  6  1

因为我们知道列表中的最后一个元素将是一个单列矩阵，所以我们可以删除它们并从剩余矩阵中删除 select 随机 nc/2 列。

total_out <- total_out[-length(total_out)]

lapply(total_out, function(x) {
     nc <- ncol(x)
     x[, sample(nc, ceiling(nc/2))]
})

Answer 2

此函数包括选项，用于选择 prob 中组合的百分比或直接选择每个列采样的最大数量 cols。

f_combos <- function(k, prob = 1L, cols = NULL) {
## k is the number of columns used in each combination

## prob is 0L < prob < 1L which is the percentage of each
## combination that will be included in the final output

## cols is the number of columns of each combination that
## will be included in the final output.

## If cols is passed, it is prioritized over any prob.


#Returns original matrix if you send k = 1 (i.e., one column) or
# will send rowSums of original matrix if k == ncols. 
  if (k == 1){
    return(mat) 
  } else if (k == ncol(mat)) { 
    return(matrix(rowSums(mat),
                  dimnames = list(rownames(mat),
                                  paste(colnames(mat),
                                        collapse = '_'))
    )
    )
  } 

 #create all unique combinations based on k columns selected
  all_combos <- combn(ncol(mat), k)


 #determines how many combos will be used
  if  (!is.null(cols)){
    n <- ifelse(cols < ncol(all_combos), cols, ncol(all_combos))
    } else if (prob < 1L) {
      n <- ceiling(prob * ncol(all_combos))
      } else {
        n <- ncol(all_combos)
        }

 #resamples the combos if necessary 
  if (n < ncol(all_combos)){
    all_combos <- all_combos[, sample(ncol(all_combos), n)]
    if ( n == 1) {
      return(matrix(rowSums(mat[, all_combos]),
                    dimnames = list(rownames(mat),
                                    paste(all_combos,
                                          collapse = '_'))
      )
      )
    } 
  }

 #this subsets the matrix all at once. 
 ##Then, array() creates a M x k x N array.  
 ## The array is then transposed with aperm() to more efficiently calculate everything
 ###with colSums
  colSums(
    aperm(
      array(
        mat[, as.vector(all_combos)],
        dim = c(nrow(mat), k, ncol(all_combos)),
        dimnames = list(rownames(mat), NULL, apply(all_combos, 2, paste0, collapse = '_'))
      ),
      perm = c(2,1,3)
    )
  )
}

此函数在很大程度上简化为（使用 RcppAlgos）：

library(RcppAlgos)
lapply(seq_len(ncol(mat)-1)[-1],
       function(k) {
         n_combos <- ceiling(comboCount(ncol(mat), k) / 2)

         all_combos <- comboSample(ncol(mat), k, n = n_combos)

         colSums(
           aperm(
             array(
               mat[, as.vector(t(all_combos))],
               dim = c(nrow(mat), k, nrow(all_combos)),
               dimnames = list(rownames(mat), NULL, apply(all_combos, 1, paste0, collapse = '_'))
             ),
             perm = c(2,1,3)
           )
         )
       }
)

该函数适用于：

lapply(seq_len(ncol(mat)), f_combos)

如果包含f_combos(k = 1)，第一个列表输出将是原始矩阵。所有其他 returns 将基于可选参数 prob 和 cols.

实际操作：仅获取 1 列：

lapply(seq_len(ncol(mat)-1)[-1], f_combos, , 1)
#all truncated to the first 3 rows
[[1]]
  2_5
a   2
c   0
f   0

[[2]]
  1_2_4
a     3
c     1
f     0

[[3]]
  1_2_3_4
a       3
c       1
f       1

[[4]]
  1_2_3_4_5
a         3
c         1
f         1

一半的列：

lapply(seq_len(ncol(mat)), f_combos, 0.5)
#all truncated to the first 3 rows
[[1]]
  1_5 1_2 1_3 3_5 3_4 5_6 2_3 2_6
a   1   3   1   0   0   1   2   3
c   0   0   0   0   1   1   0   1
f   0   0   1   1   1   0   1   0

[[2]]
  1_4_5 2_3_5 1_5_6 3_5_6 1_2_4 2_3_4 2_5_6 2_4_5 1_3_6 1_4_6
a     1     2     2     1     3     2     3     2     2     2
c     1     0     1     1     1     1     1     1     1     2
f     0     1     0     1     0     1     0     0     1     0

[[3]]
  1_4_5_6 1_2_4_5 2_3_5_6 1_2_5_6 1_2_4_6 1_2_3_4 2_4_5_6 1_2_3_6
a       2       3       3       4       4       3       3       4
c       2       1       1       1       2       1       2       1
f       0       0       1       0       0       1       0       1

[[4]]
  2_3_4_5_6 1_2_3_5_6 1_2_4_5_6
a         3         4         4
c         2         1         2
f         1         1         0

Answer 3

您正在查找大小 x 的 ncol(mat) 的所有组合 cx，其中 x 从 2 增长到 ncol(mat) - 1。然后，您需要样本 cx.s 个大小为每个样本 cx 一半的样本。从 mat 开始，您希望每个 cx.s 表示的列的 rowSums() 每个 x.

此解决方案使用 RcppAlgos::comboGeneral()，比 utils::combn() 快得多。

library(RcppAlgos)
set.seed(42)  ## for sake of reproducibility
res <- lapply(2L:(ncol(mat) - 1), function(x) {
  ## matrix 'cx' of all combinations of size 'x'
  cx <- comboGeneral(ncol(mat), x)
  ## sample of 'cx' of size 'ncol(cx)/2' (automatically adjusts downwards)
  cx.s <- cx[sort(sample(nrow(cx), nrow(cx)/2)), ]
  ## apply aggregation on sampled columns of 'mat'
  out <- lapply(1L:nrow(cx.s), function(k) {
    ## sample 'mat.cx.k' of 'mat'
    mat.cx.k <- mat[, cx.s[k, ]]
    ## apply aggregation function and
    ## set attribute 's.cols' with the sampled col numbers
    return(`attr<-`(
      rowSums(mat.cx.k),  ## <-- aggregation function here
      "s.cols", paste(cx.s[k, ], collapse="_")))
  })
  ## cbind to matrix and set colnames from attributes
  return(`colnames<-`(do.call(cbind, out), Map(attr, out, "s.cols")))
})

更新（重新更新）

正如我们从 @Cole 中了解到的那样，RcppAlgos 包含一个 comboSample() 函数，该函数已经处理了组合的采样，因此我们能够编码更有效和简洁！

我从您对的评论中了解到，您正在处理 600 列，使用 "sampling-half-of-possible-combinations-approach"...

comboCount(600, 600/2)
# Big Integer ('bigz') :
#   [1] 135107941996194268514474877978504530397233945449193479925965721786474150408005716961950480198274469818673334131365837249043900490761151591695308427048536947621976068789875968372656

...因此，我在函数中实现了一种带有 max.comb 的安全性-switch()。

FUN <- function(mat, max.comb=20) {
  out <- lapply(2L:(ncol(mat) - 1), function(x) {
    ## all combinations of 'ncol(mat)' of size 'x' with half size (adjusts downwards)
    do.call(cbind, {
      n.comb <- comboCount(ncol(mat), x)/2L
      comboSample(ncol(mat), x, n=switch((n.comb > max.comb)+1, floor(n.comb), max.comb),
                  FUN=function(i) 
                    `colnames<-`(as.matrix(
                      rowSums(mat[, i])),  ## <-- aggregation FUN here
                      paste(i, collapse="_")))
      })
  })
  # out <- Map(function(x) x[, order(colnames(x))],  # un-comment for ordered columns
  #            out)                                  # (slower)
  return(out)
} 

set.seed(42)  ## for sake of reproducibility
res2 <- FUN(mat, max.comb=20)

stopifnot(all.equal(res, res2))  # with ordered columns

结果

res
# [[1]]
#   1_2 1_3 1_5 1_6 2_6 3_4 5_6
# a   3   1   1   2   3   0   1
# c   0   0   0   1   1   1   1
# f   0   1   0   0   0   1   0
# h   0   1   0   0   0   1   0
# i   1   1   1   0   1   1   1
# j   0   0   0   0   0   1   0
# l   1   1   1   1   0   0   0
# m   0   0   1   0   0   0   1
# p   0   0   0   1   1   0   1
# q   0   0   1   0   0   0   1
# s   0   0   1   1   1   0   2
# t   0   0   0   2   2   0   2
# u   0   0   0   1   1   0   1
# v   0   0   1   0   0   0   1
# x   3   2   2   2   1   0   0
# z   0   0   1   0   0   0   1
# 
# [[2]]
#   1_2_4 1_2_6 1_3_4 1_3_6 1_4_6 1_5_6 2_4_5 2_4_6 3_4_5 4_5_6
# a     3     4     1     2     2     2     2     3     0     1
# c     1     1     1     1     2     1     1     2     1     2
# f     0     0     1     1     0     0     0     0     1     0
# h     0     0     1     1     0     0     0     0     1     0
# i     1     1     1     1     0     1     2     1     2     1
# j     1     0     1     0     1     0     1     1     1     1
# l     1     1     1     1     1     1     0     0     0     0
# m     0     0     0     0     0     1     1     0     1     1
# p     0     1     0     1     1     1     0     1     0     1
# q     0     0     0     0     0     1     1     0     1     1
# s     0     1     0     1     1     2     1     1     1     2
# t     0     2     0     2     2     2     0     2     0     2
# u     0     1     0     1     1     1     0     1     0     1
# v     0     0     0     0     0     1     1     0     1     1
# x     3     3     2     2     2     2     1     1     0     0
# z     0     0     0     0     0     1     1     0     1     1
# 
# [[3]]
#   1_2_3_6 1_2_4_5 1_2_4_6 1_3_5_6 2_3_4_5 2_3_5_6 3_4_5_6
# a       4       3       4       2       2       3       1
# c       1       1       2       1       1       1       2
# f       1       0       0       1       1       1       1
# h       1       0       0       1       1       1       1
# i       2       2       1       2       3       3       2
# j       0       1       1       0       1       0       1
# l       1       1       1       1       0       0       0
# m       0       1       0       1       1       1       1
# p       1       0       1       1       0       1       1
# q       0       1       0       1       1       1       1
# s       1       1       1       2       1       2       2
# t       2       0       2       2       0       2       2
# u       1       0       1       1       0       1       1
# v       0       1       0       1       1       1       1
# x       3       3       3       2       1       1       0
# z       0       1       0       1       1       1       1
# 
# [[4]]
#   1_2_4_5_6 1_3_4_5_6 2_3_4_5_6
# a         4         2         3
# c         2         2         2
# f         0         1         1
# h         0         1         1
# i         2         2         3
# j         1         1         1
# l         1         1         0
# m         1         1         1
# p         1         1         1
# q         1         1         1
# s         2         2         2
# t         2         2         2
# u         1         1         1
# v         1         1         1
# x         3         2         1
# z         1         1         1

在 16x600 矩阵上测试给出了非常有用的时序结果：

system.time(FUN(mat[, sample(ncol(mat), 600, replace=TRUE)], max.comb=20))
# user  system elapsed 
# 2.91    0.00    2.95

@Combs array 方法可能会带来额外的 % 性能。

如有必要，您还可以将未抽样的mat的行总和与append()相加。

res <- append(res, 
              list(`colnames<-`(as.matrix(rowSums(mat)), paste(1:ncol(mat), collapse="_"))))

数据

mat <- structure(c(1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 
1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2, 1, 0, 0, 0), .Dim = c(16L, 
6L), .Dimnames = list(c("a", "c", "f", "h", "i", "j", "l", "m", 
"p", "q", "s", "t", "u", "v", "x", "z"), c("1", "2", "3", "4", 
"5", "6")))

聚合随机采样的列以迭代更大的 bin 大小

Aggregate randomly sampled columns for iteratively larger bin sizes

r

sample

lapply

更新（重新更新）

结果

数据