聚合随机采样的列以迭代更大的 bin 大小
Aggregate randomly sampled columns for iteratively larger bin sizes
我有一个像这样的矩阵:
mat <- matrix(c(1,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,
2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,1,0,1,1,0,0,1,0,1,
1,1,0,0,0,0,0,0,1,0,1,2,1,0,0,0), nrow=16, ncol=6)
dimnames(mat) <- list(c("a", "c", "f", "h", "i", "j", "l", "m",
"p", "q", "s", "t", "u", "v","x", "z"),
c("1", "2", "3", "4", "5", "6"))
我想对列进行分组或分类,然后聚合每个组的数据。对大小为 x 的 bin 重复采样 n 次。对于 x+1 的 bin 大小将重复此过程。
对于第一次迭代,两个随机列被合并。我想在不替换的情况下进行采样,这样两列的组合就不会被采样两次(但是,如果一个列与不同的列配对,则可以使用两次)。聚合被定义为计算合并列的行总和。聚合结果将作为新列添加到该 bin 大小的结果矩阵中。结果矩阵中的列数将限制为随机采样的 bin 数。
Bin 大小继续变得越来越大。对于下一次迭代,bin 大小增加到 3,以便聚合 3 个随机 selected 列。汇总数据将放入不同的结果矩阵中。这个过程将一直持续到 bin 达到数据框的大小为止,在这种情况下无法进行重采样。所有结果矩阵都将放入矩阵列表中。
下面是给定上述矩阵的前两个 bin 大小的预期结果 resultList
。
# Bin size =2
# The randomly sampled columns are columns 1&2, 2&3, 3&4, 4&5, 5&6.
mat1 <- matrix(c(3,0,0,0,1,0,1,0,0,0,0,0,0,0,2,0,
2,0,1,1,2,0,0,0,0,0,0,0,0,0,1,0,
0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
0,1,0,0,1,1,0,1,0,1,1,0,0,1,0,1,
1,1,0,0,1,0,0,1,1,1,2,2,1,1,0,1), nrow=16)
dimnames(mat1) <- list(c("a", "c", "f", "h", "i", "j", "l", "m",
"p", "q", "s", "t", "u", "v","x", "z"),
c("1_2", "2_3", "3_4", "4_5", "5_6"))
# Bin size= 3
# The randomly selected columns to be joined are columns 1,2&3,
# 2,3&4, 3,4&5, 4,5&6.
mat2 <- matrix(c(3,0,1,1,2,0,1,0,0,0,0,0,0,0,3,0,
2,1,1,1,2,1,0,0,0,0,0,0,0,0,1,0,
0,1,1,1,2,1,0,1,0,1,1,0,0,1,0,1,
1,2,0,0,1,1,0,1,1,1,2,2,1,1,0,1), nrow=16)
dimnames(mat2) <- list(c("a", "c", "f", "h", "i", "j", "l", "m",
"p", "q", "s", "t", "u", "v","x", "z"),
c("1_2_3", "2_3_4", "3_4_5", "4_5_6"))
resultList <- list(mat1, mat2)
我在这里发布了一个类似的替代分箱技术问题:
这是我尝试随机合并 selected 列并将每个 bin 大小的结果放入矩阵列表中的尝试。我尝试使用 sample
select j
随机列,执行 rowSums
并删除那些 selected j
配对列,这样它们就不会重复在下一次迭代中:
lapply(seq_len(ncol(mat) - 1), function(j)
do.call(cbind,
lapply(sample(ncol(mat) - j, size= ), function(i)
rowSums(mat[, i:(i - j)]))))
根据您希望在最终输出中有多少列,我们可以修改该方法,但目前这提供了所有可能的组合。
#Get column names of the matrices
all_cols <- colnames(mat)
#Select bin value from 2:ncol(mat)
total_out <- lapply(seq_len(ncol(mat))[-1], function(j) {
#Create all combinations taking j items at a time
temp <- combn(all_cols, j, function(x)
#Take rowSums for the current combination
#Also paste column names to assign column names later
list(rowSums(mat[, x]), paste0(x, collapse = "_")), simplify = FALSE)
#Combine rowSums matrix
new_mat <- sapply(temp, `[[`, 1)
#Assign column names
colnames(new_mat) <- sapply(temp, `[[`, 2)
#Return new matrix
new_mat
})
当前输出看起来像
total_out
#[[1]]
# 1_2 1_3 1_4 1_5 1_6 2_3 2_4 2_5 2_6 3_4 3_5 3_6 4_5 4_6 5_6
#a 3 1 1 1 2 2 2 2 3 0 0 1 0 1 1
#c 0 0 1 0 1 0 1 0 1 1 0 1 1 2 1
#f 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0
#h 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0
#i 1 1 0 1 0 2 1 2 1 1 2 1 1 0 1
#j 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0
#l 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
#m 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1
#p 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1
#q 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1
#s 0 0 0 1 1 0 0 1 1 0 1 1 1 1 2
#t 0 0 0 0 2 0 0 0 2 0 0 2 0 2 2
#u 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1
#v 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1
#x 3 2 2 2 2 1 1 1 1 0 0 0 0 0 0
#z 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1
#...
#....
#....
#[[5]]
# 1_2_3_4_5_6
#a 4
#c 2
#f 1
#h 1
#i 3
#j 1
#l 1
#m 1
#p 1
#q 1
#s 2
#t 2
#u 1
#v 1
#x 3
#z 1
请注意,total_out
中共有 5 (ncol - 1
) 个矩阵,列数为
length(total_out)
#[1] 5
sapply(total_out, ncol)
#[1] 15 20 15 6 1
因为我们知道列表中的最后一个元素将是一个单列矩阵,所以我们可以删除它们并从剩余矩阵中删除 select 随机 nc/2
列。
total_out <- total_out[-length(total_out)]
lapply(total_out, function(x) {
nc <- ncol(x)
x[, sample(nc, ceiling(nc/2))]
})
此函数包括选项,用于选择 prob
中组合的百分比或直接选择每个列采样的最大数量 cols
。
f_combos <- function(k, prob = 1L, cols = NULL) {
## k is the number of columns used in each combination
## prob is 0L < prob < 1L which is the percentage of each
## combination that will be included in the final output
## cols is the number of columns of each combination that
## will be included in the final output.
## If cols is passed, it is prioritized over any prob.
#Returns original matrix if you send k = 1 (i.e., one column) or
# will send rowSums of original matrix if k == ncols.
if (k == 1){
return(mat)
} else if (k == ncol(mat)) {
return(matrix(rowSums(mat),
dimnames = list(rownames(mat),
paste(colnames(mat),
collapse = '_'))
)
)
}
#create all unique combinations based on k columns selected
all_combos <- combn(ncol(mat), k)
#determines how many combos will be used
if (!is.null(cols)){
n <- ifelse(cols < ncol(all_combos), cols, ncol(all_combos))
} else if (prob < 1L) {
n <- ceiling(prob * ncol(all_combos))
} else {
n <- ncol(all_combos)
}
#resamples the combos if necessary
if (n < ncol(all_combos)){
all_combos <- all_combos[, sample(ncol(all_combos), n)]
if ( n == 1) {
return(matrix(rowSums(mat[, all_combos]),
dimnames = list(rownames(mat),
paste(all_combos,
collapse = '_'))
)
)
}
}
#this subsets the matrix all at once.
##Then, array() creates a M x k x N array.
## The array is then transposed with aperm() to more efficiently calculate everything
###with colSums
colSums(
aperm(
array(
mat[, as.vector(all_combos)],
dim = c(nrow(mat), k, ncol(all_combos)),
dimnames = list(rownames(mat), NULL, apply(all_combos, 2, paste0, collapse = '_'))
),
perm = c(2,1,3)
)
)
}
此函数在很大程度上简化为(使用 RcppAlgos
):
library(RcppAlgos)
lapply(seq_len(ncol(mat)-1)[-1],
function(k) {
n_combos <- ceiling(comboCount(ncol(mat), k) / 2)
all_combos <- comboSample(ncol(mat), k, n = n_combos)
colSums(
aperm(
array(
mat[, as.vector(t(all_combos))],
dim = c(nrow(mat), k, nrow(all_combos)),
dimnames = list(rownames(mat), NULL, apply(all_combos, 1, paste0, collapse = '_'))
),
perm = c(2,1,3)
)
)
}
)
该函数适用于:
lapply(seq_len(ncol(mat)), f_combos)
如果包含f_combos(k = 1)
,第一个列表输出将是原始矩阵。所有其他 returns 将基于可选参数 prob
和 cols
.
实际操作:仅获取 1 列:
lapply(seq_len(ncol(mat)-1)[-1], f_combos, , 1)
#all truncated to the first 3 rows
[[1]]
2_5
a 2
c 0
f 0
[[2]]
1_2_4
a 3
c 1
f 0
[[3]]
1_2_3_4
a 3
c 1
f 1
[[4]]
1_2_3_4_5
a 3
c 1
f 1
一半的列:
lapply(seq_len(ncol(mat)), f_combos, 0.5)
#all truncated to the first 3 rows
[[1]]
1_5 1_2 1_3 3_5 3_4 5_6 2_3 2_6
a 1 3 1 0 0 1 2 3
c 0 0 0 0 1 1 0 1
f 0 0 1 1 1 0 1 0
[[2]]
1_4_5 2_3_5 1_5_6 3_5_6 1_2_4 2_3_4 2_5_6 2_4_5 1_3_6 1_4_6
a 1 2 2 1 3 2 3 2 2 2
c 1 0 1 1 1 1 1 1 1 2
f 0 1 0 1 0 1 0 0 1 0
[[3]]
1_4_5_6 1_2_4_5 2_3_5_6 1_2_5_6 1_2_4_6 1_2_3_4 2_4_5_6 1_2_3_6
a 2 3 3 4 4 3 3 4
c 2 1 1 1 2 1 2 1
f 0 0 1 0 0 1 0 1
[[4]]
2_3_4_5_6 1_2_3_5_6 1_2_4_5_6
a 3 4 4
c 2 1 2
f 1 1 0
您正在查找大小 x
的 ncol(mat)
的所有组合 cx
,其中 x
从 2
增长到 ncol(mat) - 1
。然后,您需要样本 cx.s
个大小为每个样本 cx
一半的样本。从 mat
开始,您希望每个 cx.s
表示的列的 rowSums()
每个 x
.
此解决方案使用 RcppAlgos::comboGeneral()
,比 utils::combn()
快得多。
library(RcppAlgos)
set.seed(42) ## for sake of reproducibility
res <- lapply(2L:(ncol(mat) - 1), function(x) {
## matrix 'cx' of all combinations of size 'x'
cx <- comboGeneral(ncol(mat), x)
## sample of 'cx' of size 'ncol(cx)/2' (automatically adjusts downwards)
cx.s <- cx[sort(sample(nrow(cx), nrow(cx)/2)), ]
## apply aggregation on sampled columns of 'mat'
out <- lapply(1L:nrow(cx.s), function(k) {
## sample 'mat.cx.k' of 'mat'
mat.cx.k <- mat[, cx.s[k, ]]
## apply aggregation function and
## set attribute 's.cols' with the sampled col numbers
return(`attr<-`(
rowSums(mat.cx.k), ## <-- aggregation function here
"s.cols", paste(cx.s[k, ], collapse="_")))
})
## cbind to matrix and set colnames from attributes
return(`colnames<-`(do.call(cbind, out), Map(attr, out, "s.cols")))
})
更新(重新更新)
正如我们从 @Cole 中了解到的那样,RcppAlgos
包含一个 comboSample()
函数,该函数已经处理了组合的采样,因此我们能够编码更有效和简洁!
我从您对 的评论中了解到,您正在处理 600 列,使用 "sampling-half-of-possible-combinations-approach"...
comboCount(600, 600/2)
# Big Integer ('bigz') :
# [1] 135107941996194268514474877978504530397233945449193479925965721786474150408005716961950480198274469818673334131365837249043900490761151591695308427048536947621976068789875968372656
...因此,我在函数中实现了一种带有 max.comb
的安全性-switch()
。
FUN <- function(mat, max.comb=20) {
out <- lapply(2L:(ncol(mat) - 1), function(x) {
## all combinations of 'ncol(mat)' of size 'x' with half size (adjusts downwards)
do.call(cbind, {
n.comb <- comboCount(ncol(mat), x)/2L
comboSample(ncol(mat), x, n=switch((n.comb > max.comb)+1, floor(n.comb), max.comb),
FUN=function(i)
`colnames<-`(as.matrix(
rowSums(mat[, i])), ## <-- aggregation FUN here
paste(i, collapse="_")))
})
})
# out <- Map(function(x) x[, order(colnames(x))], # un-comment for ordered columns
# out) # (slower)
return(out)
}
set.seed(42) ## for sake of reproducibility
res2 <- FUN(mat, max.comb=20)
stopifnot(all.equal(res, res2)) # with ordered columns
结果
res
# [[1]]
# 1_2 1_3 1_5 1_6 2_6 3_4 5_6
# a 3 1 1 2 3 0 1
# c 0 0 0 1 1 1 1
# f 0 1 0 0 0 1 0
# h 0 1 0 0 0 1 0
# i 1 1 1 0 1 1 1
# j 0 0 0 0 0 1 0
# l 1 1 1 1 0 0 0
# m 0 0 1 0 0 0 1
# p 0 0 0 1 1 0 1
# q 0 0 1 0 0 0 1
# s 0 0 1 1 1 0 2
# t 0 0 0 2 2 0 2
# u 0 0 0 1 1 0 1
# v 0 0 1 0 0 0 1
# x 3 2 2 2 1 0 0
# z 0 0 1 0 0 0 1
#
# [[2]]
# 1_2_4 1_2_6 1_3_4 1_3_6 1_4_6 1_5_6 2_4_5 2_4_6 3_4_5 4_5_6
# a 3 4 1 2 2 2 2 3 0 1
# c 1 1 1 1 2 1 1 2 1 2
# f 0 0 1 1 0 0 0 0 1 0
# h 0 0 1 1 0 0 0 0 1 0
# i 1 1 1 1 0 1 2 1 2 1
# j 1 0 1 0 1 0 1 1 1 1
# l 1 1 1 1 1 1 0 0 0 0
# m 0 0 0 0 0 1 1 0 1 1
# p 0 1 0 1 1 1 0 1 0 1
# q 0 0 0 0 0 1 1 0 1 1
# s 0 1 0 1 1 2 1 1 1 2
# t 0 2 0 2 2 2 0 2 0 2
# u 0 1 0 1 1 1 0 1 0 1
# v 0 0 0 0 0 1 1 0 1 1
# x 3 3 2 2 2 2 1 1 0 0
# z 0 0 0 0 0 1 1 0 1 1
#
# [[3]]
# 1_2_3_6 1_2_4_5 1_2_4_6 1_3_5_6 2_3_4_5 2_3_5_6 3_4_5_6
# a 4 3 4 2 2 3 1
# c 1 1 2 1 1 1 2
# f 1 0 0 1 1 1 1
# h 1 0 0 1 1 1 1
# i 2 2 1 2 3 3 2
# j 0 1 1 0 1 0 1
# l 1 1 1 1 0 0 0
# m 0 1 0 1 1 1 1
# p 1 0 1 1 0 1 1
# q 0 1 0 1 1 1 1
# s 1 1 1 2 1 2 2
# t 2 0 2 2 0 2 2
# u 1 0 1 1 0 1 1
# v 0 1 0 1 1 1 1
# x 3 3 3 2 1 1 0
# z 0 1 0 1 1 1 1
#
# [[4]]
# 1_2_4_5_6 1_3_4_5_6 2_3_4_5_6
# a 4 2 3
# c 2 2 2
# f 0 1 1
# h 0 1 1
# i 2 2 3
# j 1 1 1
# l 1 1 0
# m 1 1 1
# p 1 1 1
# q 1 1 1
# s 2 2 2
# t 2 2 2
# u 1 1 1
# v 1 1 1
# x 3 2 1
# z 1 1 1
在 16x600 矩阵上测试给出了非常有用的时序结果:
system.time(FUN(mat[, sample(ncol(mat), 600, replace=TRUE)], max.comb=20))
# user system elapsed
# 2.91 0.00 2.95
@Combs array
方法可能会带来额外的 % 性能。
如有必要,您还可以将未抽样的mat
的行总和与append()
相加。
res <- append(res,
list(`colnames<-`(as.matrix(rowSums(mat)), paste(1:ncol(mat), collapse="_"))))
数据
mat <- structure(c(1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2, 1, 0, 0, 0), .Dim = c(16L,
6L), .Dimnames = list(c("a", "c", "f", "h", "i", "j", "l", "m",
"p", "q", "s", "t", "u", "v", "x", "z"), c("1", "2", "3", "4",
"5", "6")))
我有一个像这样的矩阵:
mat <- matrix(c(1,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,
2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,1,0,1,1,0,0,1,0,1,
1,1,0,0,0,0,0,0,1,0,1,2,1,0,0,0), nrow=16, ncol=6)
dimnames(mat) <- list(c("a", "c", "f", "h", "i", "j", "l", "m",
"p", "q", "s", "t", "u", "v","x", "z"),
c("1", "2", "3", "4", "5", "6"))
我想对列进行分组或分类,然后聚合每个组的数据。对大小为 x 的 bin 重复采样 n 次。对于 x+1 的 bin 大小将重复此过程。
对于第一次迭代,两个随机列被合并。我想在不替换的情况下进行采样,这样两列的组合就不会被采样两次(但是,如果一个列与不同的列配对,则可以使用两次)。聚合被定义为计算合并列的行总和。聚合结果将作为新列添加到该 bin 大小的结果矩阵中。结果矩阵中的列数将限制为随机采样的 bin 数。
Bin 大小继续变得越来越大。对于下一次迭代,bin 大小增加到 3,以便聚合 3 个随机 selected 列。汇总数据将放入不同的结果矩阵中。这个过程将一直持续到 bin 达到数据框的大小为止,在这种情况下无法进行重采样。所有结果矩阵都将放入矩阵列表中。
下面是给定上述矩阵的前两个 bin 大小的预期结果 resultList
。
# Bin size =2
# The randomly sampled columns are columns 1&2, 2&3, 3&4, 4&5, 5&6.
mat1 <- matrix(c(3,0,0,0,1,0,1,0,0,0,0,0,0,0,2,0,
2,0,1,1,2,0,0,0,0,0,0,0,0,0,1,0,
0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
0,1,0,0,1,1,0,1,0,1,1,0,0,1,0,1,
1,1,0,0,1,0,0,1,1,1,2,2,1,1,0,1), nrow=16)
dimnames(mat1) <- list(c("a", "c", "f", "h", "i", "j", "l", "m",
"p", "q", "s", "t", "u", "v","x", "z"),
c("1_2", "2_3", "3_4", "4_5", "5_6"))
# Bin size= 3
# The randomly selected columns to be joined are columns 1,2&3,
# 2,3&4, 3,4&5, 4,5&6.
mat2 <- matrix(c(3,0,1,1,2,0,1,0,0,0,0,0,0,0,3,0,
2,1,1,1,2,1,0,0,0,0,0,0,0,0,1,0,
0,1,1,1,2,1,0,1,0,1,1,0,0,1,0,1,
1,2,0,0,1,1,0,1,1,1,2,2,1,1,0,1), nrow=16)
dimnames(mat2) <- list(c("a", "c", "f", "h", "i", "j", "l", "m",
"p", "q", "s", "t", "u", "v","x", "z"),
c("1_2_3", "2_3_4", "3_4_5", "4_5_6"))
resultList <- list(mat1, mat2)
我在这里发布了一个类似的替代分箱技术问题:
这是我尝试随机合并 selected 列并将每个 bin 大小的结果放入矩阵列表中的尝试。我尝试使用 sample
select j
随机列,执行 rowSums
并删除那些 selected j
配对列,这样它们就不会重复在下一次迭代中:
lapply(seq_len(ncol(mat) - 1), function(j)
do.call(cbind,
lapply(sample(ncol(mat) - j, size= ), function(i)
rowSums(mat[, i:(i - j)]))))
根据您希望在最终输出中有多少列,我们可以修改该方法,但目前这提供了所有可能的组合。
#Get column names of the matrices
all_cols <- colnames(mat)
#Select bin value from 2:ncol(mat)
total_out <- lapply(seq_len(ncol(mat))[-1], function(j) {
#Create all combinations taking j items at a time
temp <- combn(all_cols, j, function(x)
#Take rowSums for the current combination
#Also paste column names to assign column names later
list(rowSums(mat[, x]), paste0(x, collapse = "_")), simplify = FALSE)
#Combine rowSums matrix
new_mat <- sapply(temp, `[[`, 1)
#Assign column names
colnames(new_mat) <- sapply(temp, `[[`, 2)
#Return new matrix
new_mat
})
当前输出看起来像
total_out
#[[1]]
# 1_2 1_3 1_4 1_5 1_6 2_3 2_4 2_5 2_6 3_4 3_5 3_6 4_5 4_6 5_6
#a 3 1 1 1 2 2 2 2 3 0 0 1 0 1 1
#c 0 0 1 0 1 0 1 0 1 1 0 1 1 2 1
#f 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0
#h 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0
#i 1 1 0 1 0 2 1 2 1 1 2 1 1 0 1
#j 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0
#l 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
#m 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1
#p 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1
#q 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1
#s 0 0 0 1 1 0 0 1 1 0 1 1 1 1 2
#t 0 0 0 0 2 0 0 0 2 0 0 2 0 2 2
#u 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1
#v 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1
#x 3 2 2 2 2 1 1 1 1 0 0 0 0 0 0
#z 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1
#...
#....
#....
#[[5]]
# 1_2_3_4_5_6
#a 4
#c 2
#f 1
#h 1
#i 3
#j 1
#l 1
#m 1
#p 1
#q 1
#s 2
#t 2
#u 1
#v 1
#x 3
#z 1
请注意,total_out
中共有 5 (ncol - 1
) 个矩阵,列数为
length(total_out)
#[1] 5
sapply(total_out, ncol)
#[1] 15 20 15 6 1
因为我们知道列表中的最后一个元素将是一个单列矩阵,所以我们可以删除它们并从剩余矩阵中删除 select 随机 nc/2
列。
total_out <- total_out[-length(total_out)]
lapply(total_out, function(x) {
nc <- ncol(x)
x[, sample(nc, ceiling(nc/2))]
})
此函数包括选项,用于选择 prob
中组合的百分比或直接选择每个列采样的最大数量 cols
。
f_combos <- function(k, prob = 1L, cols = NULL) {
## k is the number of columns used in each combination
## prob is 0L < prob < 1L which is the percentage of each
## combination that will be included in the final output
## cols is the number of columns of each combination that
## will be included in the final output.
## If cols is passed, it is prioritized over any prob.
#Returns original matrix if you send k = 1 (i.e., one column) or
# will send rowSums of original matrix if k == ncols.
if (k == 1){
return(mat)
} else if (k == ncol(mat)) {
return(matrix(rowSums(mat),
dimnames = list(rownames(mat),
paste(colnames(mat),
collapse = '_'))
)
)
}
#create all unique combinations based on k columns selected
all_combos <- combn(ncol(mat), k)
#determines how many combos will be used
if (!is.null(cols)){
n <- ifelse(cols < ncol(all_combos), cols, ncol(all_combos))
} else if (prob < 1L) {
n <- ceiling(prob * ncol(all_combos))
} else {
n <- ncol(all_combos)
}
#resamples the combos if necessary
if (n < ncol(all_combos)){
all_combos <- all_combos[, sample(ncol(all_combos), n)]
if ( n == 1) {
return(matrix(rowSums(mat[, all_combos]),
dimnames = list(rownames(mat),
paste(all_combos,
collapse = '_'))
)
)
}
}
#this subsets the matrix all at once.
##Then, array() creates a M x k x N array.
## The array is then transposed with aperm() to more efficiently calculate everything
###with colSums
colSums(
aperm(
array(
mat[, as.vector(all_combos)],
dim = c(nrow(mat), k, ncol(all_combos)),
dimnames = list(rownames(mat), NULL, apply(all_combos, 2, paste0, collapse = '_'))
),
perm = c(2,1,3)
)
)
}
此函数在很大程度上简化为(使用 RcppAlgos
):
library(RcppAlgos)
lapply(seq_len(ncol(mat)-1)[-1],
function(k) {
n_combos <- ceiling(comboCount(ncol(mat), k) / 2)
all_combos <- comboSample(ncol(mat), k, n = n_combos)
colSums(
aperm(
array(
mat[, as.vector(t(all_combos))],
dim = c(nrow(mat), k, nrow(all_combos)),
dimnames = list(rownames(mat), NULL, apply(all_combos, 1, paste0, collapse = '_'))
),
perm = c(2,1,3)
)
)
}
)
该函数适用于:
lapply(seq_len(ncol(mat)), f_combos)
如果包含f_combos(k = 1)
,第一个列表输出将是原始矩阵。所有其他 returns 将基于可选参数 prob
和 cols
.
实际操作:仅获取 1 列:
lapply(seq_len(ncol(mat)-1)[-1], f_combos, , 1)
#all truncated to the first 3 rows
[[1]]
2_5
a 2
c 0
f 0
[[2]]
1_2_4
a 3
c 1
f 0
[[3]]
1_2_3_4
a 3
c 1
f 1
[[4]]
1_2_3_4_5
a 3
c 1
f 1
一半的列:
lapply(seq_len(ncol(mat)), f_combos, 0.5)
#all truncated to the first 3 rows
[[1]]
1_5 1_2 1_3 3_5 3_4 5_6 2_3 2_6
a 1 3 1 0 0 1 2 3
c 0 0 0 0 1 1 0 1
f 0 0 1 1 1 0 1 0
[[2]]
1_4_5 2_3_5 1_5_6 3_5_6 1_2_4 2_3_4 2_5_6 2_4_5 1_3_6 1_4_6
a 1 2 2 1 3 2 3 2 2 2
c 1 0 1 1 1 1 1 1 1 2
f 0 1 0 1 0 1 0 0 1 0
[[3]]
1_4_5_6 1_2_4_5 2_3_5_6 1_2_5_6 1_2_4_6 1_2_3_4 2_4_5_6 1_2_3_6
a 2 3 3 4 4 3 3 4
c 2 1 1 1 2 1 2 1
f 0 0 1 0 0 1 0 1
[[4]]
2_3_4_5_6 1_2_3_5_6 1_2_4_5_6
a 3 4 4
c 2 1 2
f 1 1 0
您正在查找大小 x
的 ncol(mat)
的所有组合 cx
,其中 x
从 2
增长到 ncol(mat) - 1
。然后,您需要样本 cx.s
个大小为每个样本 cx
一半的样本。从 mat
开始,您希望每个 cx.s
表示的列的 rowSums()
每个 x
.
此解决方案使用 RcppAlgos::comboGeneral()
,比 utils::combn()
快得多。
library(RcppAlgos)
set.seed(42) ## for sake of reproducibility
res <- lapply(2L:(ncol(mat) - 1), function(x) {
## matrix 'cx' of all combinations of size 'x'
cx <- comboGeneral(ncol(mat), x)
## sample of 'cx' of size 'ncol(cx)/2' (automatically adjusts downwards)
cx.s <- cx[sort(sample(nrow(cx), nrow(cx)/2)), ]
## apply aggregation on sampled columns of 'mat'
out <- lapply(1L:nrow(cx.s), function(k) {
## sample 'mat.cx.k' of 'mat'
mat.cx.k <- mat[, cx.s[k, ]]
## apply aggregation function and
## set attribute 's.cols' with the sampled col numbers
return(`attr<-`(
rowSums(mat.cx.k), ## <-- aggregation function here
"s.cols", paste(cx.s[k, ], collapse="_")))
})
## cbind to matrix and set colnames from attributes
return(`colnames<-`(do.call(cbind, out), Map(attr, out, "s.cols")))
})
更新(重新更新)
正如我们从 @Cole 中了解到的那样,RcppAlgos
包含一个 comboSample()
函数,该函数已经处理了组合的采样,因此我们能够编码更有效和简洁!
我从您对
comboCount(600, 600/2)
# Big Integer ('bigz') :
# [1] 135107941996194268514474877978504530397233945449193479925965721786474150408005716961950480198274469818673334131365837249043900490761151591695308427048536947621976068789875968372656
...因此,我在函数中实现了一种带有 max.comb
的安全性-switch()
。
FUN <- function(mat, max.comb=20) {
out <- lapply(2L:(ncol(mat) - 1), function(x) {
## all combinations of 'ncol(mat)' of size 'x' with half size (adjusts downwards)
do.call(cbind, {
n.comb <- comboCount(ncol(mat), x)/2L
comboSample(ncol(mat), x, n=switch((n.comb > max.comb)+1, floor(n.comb), max.comb),
FUN=function(i)
`colnames<-`(as.matrix(
rowSums(mat[, i])), ## <-- aggregation FUN here
paste(i, collapse="_")))
})
})
# out <- Map(function(x) x[, order(colnames(x))], # un-comment for ordered columns
# out) # (slower)
return(out)
}
set.seed(42) ## for sake of reproducibility
res2 <- FUN(mat, max.comb=20)
stopifnot(all.equal(res, res2)) # with ordered columns
结果
res
# [[1]]
# 1_2 1_3 1_5 1_6 2_6 3_4 5_6
# a 3 1 1 2 3 0 1
# c 0 0 0 1 1 1 1
# f 0 1 0 0 0 1 0
# h 0 1 0 0 0 1 0
# i 1 1 1 0 1 1 1
# j 0 0 0 0 0 1 0
# l 1 1 1 1 0 0 0
# m 0 0 1 0 0 0 1
# p 0 0 0 1 1 0 1
# q 0 0 1 0 0 0 1
# s 0 0 1 1 1 0 2
# t 0 0 0 2 2 0 2
# u 0 0 0 1 1 0 1
# v 0 0 1 0 0 0 1
# x 3 2 2 2 1 0 0
# z 0 0 1 0 0 0 1
#
# [[2]]
# 1_2_4 1_2_6 1_3_4 1_3_6 1_4_6 1_5_6 2_4_5 2_4_6 3_4_5 4_5_6
# a 3 4 1 2 2 2 2 3 0 1
# c 1 1 1 1 2 1 1 2 1 2
# f 0 0 1 1 0 0 0 0 1 0
# h 0 0 1 1 0 0 0 0 1 0
# i 1 1 1 1 0 1 2 1 2 1
# j 1 0 1 0 1 0 1 1 1 1
# l 1 1 1 1 1 1 0 0 0 0
# m 0 0 0 0 0 1 1 0 1 1
# p 0 1 0 1 1 1 0 1 0 1
# q 0 0 0 0 0 1 1 0 1 1
# s 0 1 0 1 1 2 1 1 1 2
# t 0 2 0 2 2 2 0 2 0 2
# u 0 1 0 1 1 1 0 1 0 1
# v 0 0 0 0 0 1 1 0 1 1
# x 3 3 2 2 2 2 1 1 0 0
# z 0 0 0 0 0 1 1 0 1 1
#
# [[3]]
# 1_2_3_6 1_2_4_5 1_2_4_6 1_3_5_6 2_3_4_5 2_3_5_6 3_4_5_6
# a 4 3 4 2 2 3 1
# c 1 1 2 1 1 1 2
# f 1 0 0 1 1 1 1
# h 1 0 0 1 1 1 1
# i 2 2 1 2 3 3 2
# j 0 1 1 0 1 0 1
# l 1 1 1 1 0 0 0
# m 0 1 0 1 1 1 1
# p 1 0 1 1 0 1 1
# q 0 1 0 1 1 1 1
# s 1 1 1 2 1 2 2
# t 2 0 2 2 0 2 2
# u 1 0 1 1 0 1 1
# v 0 1 0 1 1 1 1
# x 3 3 3 2 1 1 0
# z 0 1 0 1 1 1 1
#
# [[4]]
# 1_2_4_5_6 1_3_4_5_6 2_3_4_5_6
# a 4 2 3
# c 2 2 2
# f 0 1 1
# h 0 1 1
# i 2 2 3
# j 1 1 1
# l 1 1 0
# m 1 1 1
# p 1 1 1
# q 1 1 1
# s 2 2 2
# t 2 2 2
# u 1 1 1
# v 1 1 1
# x 3 2 1
# z 1 1 1
在 16x600 矩阵上测试给出了非常有用的时序结果:
system.time(FUN(mat[, sample(ncol(mat), 600, replace=TRUE)], max.comb=20))
# user system elapsed
# 2.91 0.00 2.95
@Combs array
方法可能会带来额外的 % 性能。
如有必要,您还可以将未抽样的mat
的行总和与append()
相加。
res <- append(res,
list(`colnames<-`(as.matrix(rowSums(mat)), paste(1:ncol(mat), collapse="_"))))
数据
mat <- structure(c(1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2, 1, 0, 0, 0), .Dim = c(16L,
6L), .Dimnames = list(c("a", "c", "f", "h", "i", "j", "l", "m",
"p", "q", "s", "t", "u", "v", "x", "z"), c("1", "2", "3", "4",
"5", "6")))