计算R中矩阵中有序对的数量
Counting the number of ordered pairs in matrix in R
给定矩阵m
如下(1-5的逐行排列):
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 5 2 4 3
# [2,] 2 1 4 3 5
# [3,] 3 4 1 2 5
# [4,] 4 1 3 2 5
# [5,] 4 3 1 2 5
# [6,] 1 4 2 3 5
# [7,] 4 3 2 5 1
# [8,] 4 1 3 5 2
# [9,] 1 2 3 4 5
# [10,] 4 3 2 1 5
我想知道每行中每个元素 1-5 在另一个元素之前的次数(即考虑所有可能的对)
例如,对于 (1, 5) 对,1
在 5
之前,在所有行中有 9 次。另一个例子,对于 (3, 1) 对,3
在 1
之前,在所有行中出现了 4 次。我希望所有行中所有可能的对都得到相同的结果。也就是说,
# (1, 2), (1, 3), (1, 4), (1, 5)
# (2, 1), (2, 3), (2, 4), (2, 5)
# (3, 1), (3, 2), (3, 4), (3, 5)
# (4, 1), (4, 2), (4, 3), (4, 5)
# (5, 1), (5, 2), (5, 3), (5, 4)
m <- structure(c(1L, 2L, 3L, 4L, 4L, 1L, 4L, 4L, 1L, 4L, 5L, 1L, 4L,
1L, 3L, 4L, 3L, 1L, 2L, 3L, 2L, 4L, 1L, 3L, 1L, 2L, 2L, 3L, 3L,
2L, 4L, 3L, 2L, 2L, 2L, 3L, 5L, 5L, 4L, 1L, 3L, 5L, 5L, 5L, 5L,
5L, 1L, 2L, 5L, 5L), .Dim = c(10L, 5L))
如何在 R 中高效地做到这一点?
编辑
你会如何为这个矩阵做同样的事情?
# [,1] [,2] [,3] [,4] [,5]
# [1,] 3 4 1 5 0
# [2,] 1 2 5 3 0
# [3,] 3 5 0 0 0
# [4,] 4 5 0 0 0
# [5,] 3 4 1 5 2
# [6,] 3 1 2 0 0
# [7,] 4 1 5 2 0
# [8,] 4 3 5 2 0
# [9,] 5 2 0 0 0
# [10,] 5 4 2 0 0
m <- structure(c(3, 1, 3, 4, 3, 3, 4, 4, 5, 5, 4, 2, 5, 5, 4, 1, 1,
3, 2, 4, 1, 5, 0, 0, 1, 2, 5, 5, 0, 2, 5, 3, 0, 0, 5, 0, 2, 2,
0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0), .Dim = c(10L, 5L))
首先,我们如何使用硬编码数字来实现:
apply(m, 1, function(r) { which(r == 1) < which(r == 5) })
# [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
sum(apply(m, 1, function(r) { which(r == 1) < which(r == 5) }))
# [1] 9
要针对 1:5
的所有组合(相同除外)自动执行此操作,这里是一个包含所有对的 data.frame:
df <- expand.grid(a = 1:5, b = 1:5)
df <- df[ df$a != df$b, ]
head(df)
# a b
# 2 2 1
# 3 3 1
# 4 4 1
# 5 5 1
# 6 1 2
# 8 3 2
现在我们只需要迭代每一行(我想我们可以使用另一个矩阵 apply
):
df$seqs <- sapply(seq_len(nrow(df)), function(i) {
sum(apply(m, 1, function(r) which(r == df$a[i]) < which(r == df$b[i])))
})
head(df)
# a b seqs
# 2 2 1 3
# 3 3 1 4
# 4 4 1 6
# 5 5 1 1
# 6 1 2 7
# 8 3 2 6
或者,我认为现在是使用 mapply
:
的好时机
myfunc <- function(a, b, m) sum(apply(m, 1, function(r) which(r == a) < which(r == b)))
df$seqs <- mapply(myfunc, df$a, df$b, list(m))
head(df)
# a b seqs
# 2 2 1 3
# 3 3 1 4
# 4 4 1 6
# 5 5 1 1
# 6 1 2 7
# 8 3 2 6
当然,我使用了正式函数代替匿名函数(也可以在上面这样做),但这展示了这种方法的一些优雅。
编辑:新约束,现在 m
中可能没有匹配项。上面的失败是因为 which
returns logical(0)
当没有匹配时,导致 sapply
到 return 一个异构列表。修复它的一种方法是使用快速辅助函数:
apply(m, 1, function(r) which(r == a) < which(r == b))
# [[1]]
# logical(0)
# [[2]]
# [1] FALSE
# ...
emptyF <- function(x) sapply(x, function(y) if (! length(y)) FALSE else y)
emptyF(apply(m, 1, function(r) which(r == a) < which(r == b)))
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
现在 myfunc
变成:
myfunc <- function(a, b, m) sum(emptyF(apply(m, 1, function(r) which(r == a) < which(r == b))))
(注意:我更喜欢 ,因为它是矢量化的,因此可能更快。它也会从这个和类似的补救措施中受益,而不是不匹配。))
这是一个向量化的解决方案,没有 apply
:
func <- function(a,b) sum((which(!t(m-b)) - which(!t(m-a)))>0)
#> func(1,5)
#[1] 9
#> func(5,1)
#[1] 1
要生成所有想要的组合,您只需执行以下操作:
N = combn(1:5, 2)
cbind(N, N[nrow(N):1,])
然后您只需要一个循环来遍历列并应用该函数。
试试这个
library(plyr)
combns <- expand.grid(unique(as.vector(m)),unique(as.vector(m)))
combns <- combns[combns$Var1!=combns$Var2,]
combns <- combns[with(combns,order(Var1)),]
combns$count <- sapply(1:nrow(combns),function(u) sum(unlist(apply(apply(m,1,function(t) match(t,combns[u,])),2,function(s) na.exclude(count(unlist(sapply(seq(length(s)),function(t) diff(s,lag=t))))$freq[count(unlist(sapply(seq(length(s)),function(t) diff(s,lag=t))))$x==1]))),na.rm = T))
知道 (1) 每行没有重复,(2) 每行的 0 聚集在末尾,(3) nrow(m)
比 [=15 大 2-3 个数量级=],我们可以遍历列来搜索特定数字的出现,从而在达到 0 时减少不必要的计算:
ff = function(x, a, b)
{
ia = rep_len(NA_integer_, nrow(x)) # positions of 'a' in each row
ib = rep_len(NA_integer_, nrow(x)) # -//- of 'b'
notfound0 = seq_len(nrow(x)) # rows that have not, yet, a 0
for(j in seq_len(ncol(x))) {
xj = x[notfound0, j]
if(!length(xj)) break
ia[notfound0[xj == a]] = j
ib[notfound0[xj == b]] = j
notfound0 = notfound0[xj != 0L] # check if any more rows have 0 now on
}
i = ia < ib ## is 'a' before 'b'?
## return both a - b and b - a; no need to repeat computations
data.frame(a = c(a, b),
b = c(b, a),
n = c(sum(i, na.rm = TRUE), sum(!i, na.rm = TRUE)))
}
在编辑后 m
:
ff(m, 3, 2)
# a b n
#1 3 2 3
#2 2 3 1
ff(m, 5, 1)
# a b n
#1 5 1 0
#2 1 5 4
所有对:
xtabs(n ~ a + b,
do.call(rbind,
combn(5, 2, function(x) ff(m, x[1], x[2]),
simplify = FALSE)))
# b
#a 1 2 3 4 5
# 1 0 4 1 0 4
# 2 0 0 1 0 1
# 3 3 3 0 2 4
# 4 3 4 1 0 5
# 5 0 5 1 1 0
而且,在更大范围内似乎也可以容忍:
set.seed(007)
MAT = do.call(rbind, combinat::permn(8))[sample(1e4), ]
MAT[sample(length(MAT), length(MAT)*0.4)] = 0L #40% 0s
MAT = t(apply(MAT, 1, function(x) c(x[x != 0L], rep_len(0L, sum(x == 0L)))))
dim(MAT)
#[1] 10000 8
## including colonel's answer for a quick comparison
colonel = function(x, a, b)
{
i = (which(!t(x - b)) - which(!t(x - a))) > 0L
data.frame(a = c(a, b), b = c(b, a), n = c(sum(i), sum(!i)))
}
microbenchmark::microbenchmark(ff(MAT, 7, 2), colonel(MAT, 7, 2))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# ff(MAT, 7, 2) 3.795003 3.908802 4.500453 3.972138 4.096377 45.926679 100 b
# colonel(MAT, 7, 2) 2.156941 2.231587 2.423053 2.295794 2.404894 3.775516 100 a
#There were 50 or more warnings (use warnings() to see the first 50)
因此,事实证明,只需将方法简单地转换为循环就足够有效了。更多的 0 也应该进一步减少计算时间。
给定矩阵m
如下(1-5的逐行排列):
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 5 2 4 3
# [2,] 2 1 4 3 5
# [3,] 3 4 1 2 5
# [4,] 4 1 3 2 5
# [5,] 4 3 1 2 5
# [6,] 1 4 2 3 5
# [7,] 4 3 2 5 1
# [8,] 4 1 3 5 2
# [9,] 1 2 3 4 5
# [10,] 4 3 2 1 5
我想知道每行中每个元素 1-5 在另一个元素之前的次数(即考虑所有可能的对)
例如,对于 (1, 5) 对,1
在 5
之前,在所有行中有 9 次。另一个例子,对于 (3, 1) 对,3
在 1
之前,在所有行中出现了 4 次。我希望所有行中所有可能的对都得到相同的结果。也就是说,
# (1, 2), (1, 3), (1, 4), (1, 5)
# (2, 1), (2, 3), (2, 4), (2, 5)
# (3, 1), (3, 2), (3, 4), (3, 5)
# (4, 1), (4, 2), (4, 3), (4, 5)
# (5, 1), (5, 2), (5, 3), (5, 4)
m <- structure(c(1L, 2L, 3L, 4L, 4L, 1L, 4L, 4L, 1L, 4L, 5L, 1L, 4L,
1L, 3L, 4L, 3L, 1L, 2L, 3L, 2L, 4L, 1L, 3L, 1L, 2L, 2L, 3L, 3L,
2L, 4L, 3L, 2L, 2L, 2L, 3L, 5L, 5L, 4L, 1L, 3L, 5L, 5L, 5L, 5L,
5L, 1L, 2L, 5L, 5L), .Dim = c(10L, 5L))
如何在 R 中高效地做到这一点?
编辑
你会如何为这个矩阵做同样的事情?
# [,1] [,2] [,3] [,4] [,5]
# [1,] 3 4 1 5 0
# [2,] 1 2 5 3 0
# [3,] 3 5 0 0 0
# [4,] 4 5 0 0 0
# [5,] 3 4 1 5 2
# [6,] 3 1 2 0 0
# [7,] 4 1 5 2 0
# [8,] 4 3 5 2 0
# [9,] 5 2 0 0 0
# [10,] 5 4 2 0 0
m <- structure(c(3, 1, 3, 4, 3, 3, 4, 4, 5, 5, 4, 2, 5, 5, 4, 1, 1,
3, 2, 4, 1, 5, 0, 0, 1, 2, 5, 5, 0, 2, 5, 3, 0, 0, 5, 0, 2, 2,
0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0), .Dim = c(10L, 5L))
首先,我们如何使用硬编码数字来实现:
apply(m, 1, function(r) { which(r == 1) < which(r == 5) })
# [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
sum(apply(m, 1, function(r) { which(r == 1) < which(r == 5) }))
# [1] 9
要针对 1:5
的所有组合(相同除外)自动执行此操作,这里是一个包含所有对的 data.frame:
df <- expand.grid(a = 1:5, b = 1:5)
df <- df[ df$a != df$b, ]
head(df)
# a b
# 2 2 1
# 3 3 1
# 4 4 1
# 5 5 1
# 6 1 2
# 8 3 2
现在我们只需要迭代每一行(我想我们可以使用另一个矩阵 apply
):
df$seqs <- sapply(seq_len(nrow(df)), function(i) {
sum(apply(m, 1, function(r) which(r == df$a[i]) < which(r == df$b[i])))
})
head(df)
# a b seqs
# 2 2 1 3
# 3 3 1 4
# 4 4 1 6
# 5 5 1 1
# 6 1 2 7
# 8 3 2 6
或者,我认为现在是使用 mapply
:
myfunc <- function(a, b, m) sum(apply(m, 1, function(r) which(r == a) < which(r == b)))
df$seqs <- mapply(myfunc, df$a, df$b, list(m))
head(df)
# a b seqs
# 2 2 1 3
# 3 3 1 4
# 4 4 1 6
# 5 5 1 1
# 6 1 2 7
# 8 3 2 6
当然,我使用了正式函数代替匿名函数(也可以在上面这样做),但这展示了这种方法的一些优雅。
编辑:新约束,现在 m
中可能没有匹配项。上面的失败是因为 which
returns logical(0)
当没有匹配时,导致 sapply
到 return 一个异构列表。修复它的一种方法是使用快速辅助函数:
apply(m, 1, function(r) which(r == a) < which(r == b))
# [[1]]
# logical(0)
# [[2]]
# [1] FALSE
# ...
emptyF <- function(x) sapply(x, function(y) if (! length(y)) FALSE else y)
emptyF(apply(m, 1, function(r) which(r == a) < which(r == b)))
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
现在 myfunc
变成:
myfunc <- function(a, b, m) sum(emptyF(apply(m, 1, function(r) which(r == a) < which(r == b))))
(注意:我更喜欢
这是一个向量化的解决方案,没有 apply
:
func <- function(a,b) sum((which(!t(m-b)) - which(!t(m-a)))>0)
#> func(1,5)
#[1] 9
#> func(5,1)
#[1] 1
要生成所有想要的组合,您只需执行以下操作:
N = combn(1:5, 2)
cbind(N, N[nrow(N):1,])
然后您只需要一个循环来遍历列并应用该函数。
试试这个
library(plyr)
combns <- expand.grid(unique(as.vector(m)),unique(as.vector(m)))
combns <- combns[combns$Var1!=combns$Var2,]
combns <- combns[with(combns,order(Var1)),]
combns$count <- sapply(1:nrow(combns),function(u) sum(unlist(apply(apply(m,1,function(t) match(t,combns[u,])),2,function(s) na.exclude(count(unlist(sapply(seq(length(s)),function(t) diff(s,lag=t))))$freq[count(unlist(sapply(seq(length(s)),function(t) diff(s,lag=t))))$x==1]))),na.rm = T))
知道 (1) 每行没有重复,(2) 每行的 0 聚集在末尾,(3) nrow(m)
比 [=15 大 2-3 个数量级=],我们可以遍历列来搜索特定数字的出现,从而在达到 0 时减少不必要的计算:
ff = function(x, a, b)
{
ia = rep_len(NA_integer_, nrow(x)) # positions of 'a' in each row
ib = rep_len(NA_integer_, nrow(x)) # -//- of 'b'
notfound0 = seq_len(nrow(x)) # rows that have not, yet, a 0
for(j in seq_len(ncol(x))) {
xj = x[notfound0, j]
if(!length(xj)) break
ia[notfound0[xj == a]] = j
ib[notfound0[xj == b]] = j
notfound0 = notfound0[xj != 0L] # check if any more rows have 0 now on
}
i = ia < ib ## is 'a' before 'b'?
## return both a - b and b - a; no need to repeat computations
data.frame(a = c(a, b),
b = c(b, a),
n = c(sum(i, na.rm = TRUE), sum(!i, na.rm = TRUE)))
}
在编辑后 m
:
ff(m, 3, 2)
# a b n
#1 3 2 3
#2 2 3 1
ff(m, 5, 1)
# a b n
#1 5 1 0
#2 1 5 4
所有对:
xtabs(n ~ a + b,
do.call(rbind,
combn(5, 2, function(x) ff(m, x[1], x[2]),
simplify = FALSE)))
# b
#a 1 2 3 4 5
# 1 0 4 1 0 4
# 2 0 0 1 0 1
# 3 3 3 0 2 4
# 4 3 4 1 0 5
# 5 0 5 1 1 0
而且,在更大范围内似乎也可以容忍:
set.seed(007)
MAT = do.call(rbind, combinat::permn(8))[sample(1e4), ]
MAT[sample(length(MAT), length(MAT)*0.4)] = 0L #40% 0s
MAT = t(apply(MAT, 1, function(x) c(x[x != 0L], rep_len(0L, sum(x == 0L)))))
dim(MAT)
#[1] 10000 8
## including colonel's answer for a quick comparison
colonel = function(x, a, b)
{
i = (which(!t(x - b)) - which(!t(x - a))) > 0L
data.frame(a = c(a, b), b = c(b, a), n = c(sum(i), sum(!i)))
}
microbenchmark::microbenchmark(ff(MAT, 7, 2), colonel(MAT, 7, 2))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# ff(MAT, 7, 2) 3.795003 3.908802 4.500453 3.972138 4.096377 45.926679 100 b
# colonel(MAT, 7, 2) 2.156941 2.231587 2.423053 2.295794 2.404894 3.775516 100 a
#There were 50 or more warnings (use warnings() to see the first 50)
因此,事实证明,只需将方法简单地转换为循环就足够有效了。更多的 0 也应该进一步减少计算时间。