通过 R 中另一列的成对组合计算一列的唯一值
Count unique values of a column by pairwise combinations of another column in R
假设我有以下数据框:
ID Code
1 1 A
2 1 B
3 1 C
4 2 B
5 2 C
6 2 D
7 3 C
8 3 A
9 3 D
10 3 B
11 4 D
12 4 B
我想通过列 "Code":
的成对组合来获取列 "ID" 的唯一值的计数
Code.Combinations Count.of.ID
1 A, B 2
2 A, C 2
3 A, D 1
4 B, C 3
5 B, D 3
6 C, D 2
我已经在网上搜索了解决方案,目前还没有达到预期的效果。
任何帮助,将不胜感激。谢谢!
假设您的 data.frame 被命名为 df
并使用 dplyr
df %>% full_join(df, by="ID") %>% group_by(Code.x,Code.y) %>% summarise(length(unique(ID))) %>% filter(Code.x!=Code.y)
自己加入df然后按组数
这是解决问题的data.table
方法。使用 combn
函数获取所有可能的代码组合,然后计算每个唯一的 ID CodeComb
:
library(data.table)
setDT(df)[, .(CodeComb = sapply(combn(Code, 2, simplify = F),
function(cmb) paste(sort(cmb), collapse = ", "))), .(ID)]
# list all combinations of Code for each ID
[, .(IdCount = .N), .(CodeComb)]
# count number of unique id for each code combination
# CodeComb IdCount
# 1: A, B 2
# 2: A, C 2
# 3: B, C 3
# 4: B, D 3
# 5: C, D 2
# 6: A, D 1
下面使用了 gtools
包中的 combinations
以及 plyr
包中的 count
。
library(gtools)
library(plyr)
PairWiseCombo <- function(df) {
myID <- df$ID
BreakDown <- rle(myID)
Unis <- BreakDown$values
numUnis <- BreakDown$lengths
Len <- length(Unis)
e <- cumsum(numUnis)
s <- c(1L, e + 1L)
## more efficient to generate outside of the "do.call(c, lapply(.."
## below. This allows me to reference a particular combination
## rather than re-generating the same combination multiple times
myCombs <- lapply(2:max(numUnis), function(x) combinations(x,2L))
tempDF <- plyr::count(do.call(c, lapply(1:Len, function(i) {
myRange <- s[i]:e[i]
combs <- myCombs[[numUnis[i]-1L]]
vapply(1:nrow(combs), function(j) paste(sort(df$Code[myRange[combs[j,]]]), collapse = ","), "A,D")
})))
names(tempDF) <- c("Code.Combinations", "Count.of.ID")
tempDF
}
以下是一些指标。我没有测试@Carl 的解决方案,因为它给出的结果与其他解决方案不同。
set.seed(537)
ID <- do.call(c, lapply(1:100, function(x) rep(x, sample(2:26,1))))
temp <- rle(ID)
Code <- do.call(c, lapply(1:100, function(x) LETTERS[sample(temp$lengths[x])]))
TestDF <- data.frame(ID, Code, stringsAsFactors = FALSE)
system.time(t1 <- Noah(TestDF))
user system elapsed
97.05 0.31 97.42
system.time(t2 <- DTSolution(TestDF))
user system elapsed
0.43 0.00 0.42
system.time(t3 <- PairWiseCombo(TestDF))
user system elapsed
0.42 0.00 0.42
identical(sort(t3[,2]),sort(t2$IdCount))
TRUE
identical(sort(t3[,2]),sort(t1[,2]))
TRUE
使用 microbenchmark
我们有:
library(microbenchmark)
microbenchmark(Joseph = PairWiseCombo(TestDF), Psidom = DTSolution(TestDF), times = 10L)
Unit: milliseconds
expr min lq mean median uq max neval
Joseph 420.1090 433.9471 442.0133 446.4880 450.4420 452.7852 10
Psidom 396.8444 413.4933 416.3315 418.5573 420.9669 423.6303 10
总体而言,@Psidom 提供的 data.table
解决方案是最快的(不足为奇)。我的解决方案和 data.table
解决方案在非常大的示例上表现相似。然而,@Noah 提供的解决方案非常占用内存,无法在更大的数据帧上进行测试。
sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
更新
在调整@Carl 的解决方案之后,dplyr
方法是迄今为止最快的。下面是代码(你会看到我改变了哪些部分):
DPLYRSolution <- function(df) {
df <- df %>% full_join(df, by="ID") %>% group_by(Code.x,Code.y) %>% summarise(length(unique(ID))) %>% filter(Code.x!=Code.y)
## These two lines were added by me to remove "duplicate" rows
df <- mutate(df, Code=ifelse(Code.x < Code.y, paste(Code.x, Code.y), paste(Code.y, Code.x)))
df[which(!duplicated(df$Code)), ]
}
以下是新指标:
system.time(t4 <- DPLYRSolution(TestDF))
user system elapsed
0.03 0.00 0.03 ### Wow!!! really fast
microbenchmark(Joseph = PairWiseCombo(TestDF), Psidom = DTSolution(TestDF),
Carl = DPLYRSolution(TestDF), times = 10L)
Unit: milliseconds
expr min lq mean median uq max neval
Joseph 437.87235 442.7348 450.91085 452.77204 457.09465 461.85035 10
Psidom 407.81519 416.9444 422.62793 425.26041 429.02064 434.38881 10
Carl 44.33698 44.8066 48.39051 45.35073 54.06513 59.35653 10
## Equality Check
identical(sort(c(t4[,3])[[1]]), sort(t1[,2]))
[1] TRUE
仅使用基础:
df <- data.frame(ID=c(1,1,1,2,2,2,3,3,3,3,4,4),
code=c("A", "B", "C", "B", "C", "D", "C", "A", "D", "B", "D", "B"), stringsAsFactors =FALSE)
# Create data.frame of unique combinations of codes
e <- expand.grid(df$code, df$code)
e <- e[e[,1]!=e[,2],]
e1 <- as.data.frame(unique(t(apply(e, 1, sort))), stringsAsFactors = FALSE)
# Count the occurrence of each code combination across IDs
e1$count <- apply(e1, 1, function(y)
sum(sapply(unique(df$ID), function(x)
sum(y[1] %in% df$code[df$ID==x] & y[2] %in% df$code[df$ID==x]))))
# Turn the codes into a string and print output
out <- data.frame(Code.Combinations=do.call(paste, c(e1[,1:2], sep=", ")),
Count.of.ID=e1$count, stringsAsFactors = FALSE)
out
# Code.Combinations Count.of.ID
# 1 A, B 2
# 2 A, C 2
# 3 A, D 1
# 4 B, C 3
# 5 B, D 3
# 6 C, D 2
假设我有以下数据框:
ID Code
1 1 A
2 1 B
3 1 C
4 2 B
5 2 C
6 2 D
7 3 C
8 3 A
9 3 D
10 3 B
11 4 D
12 4 B
我想通过列 "Code":
的成对组合来获取列 "ID" 的唯一值的计数 Code.Combinations Count.of.ID
1 A, B 2
2 A, C 2
3 A, D 1
4 B, C 3
5 B, D 3
6 C, D 2
我已经在网上搜索了解决方案,目前还没有达到预期的效果。 任何帮助,将不胜感激。谢谢!
假设您的 data.frame 被命名为 df
并使用 dplyr
df %>% full_join(df, by="ID") %>% group_by(Code.x,Code.y) %>% summarise(length(unique(ID))) %>% filter(Code.x!=Code.y)
自己加入df然后按组数
这是解决问题的data.table
方法。使用 combn
函数获取所有可能的代码组合,然后计算每个唯一的 ID CodeComb
:
library(data.table)
setDT(df)[, .(CodeComb = sapply(combn(Code, 2, simplify = F),
function(cmb) paste(sort(cmb), collapse = ", "))), .(ID)]
# list all combinations of Code for each ID
[, .(IdCount = .N), .(CodeComb)]
# count number of unique id for each code combination
# CodeComb IdCount
# 1: A, B 2
# 2: A, C 2
# 3: B, C 3
# 4: B, D 3
# 5: C, D 2
# 6: A, D 1
下面使用了 gtools
包中的 combinations
以及 plyr
包中的 count
。
library(gtools)
library(plyr)
PairWiseCombo <- function(df) {
myID <- df$ID
BreakDown <- rle(myID)
Unis <- BreakDown$values
numUnis <- BreakDown$lengths
Len <- length(Unis)
e <- cumsum(numUnis)
s <- c(1L, e + 1L)
## more efficient to generate outside of the "do.call(c, lapply(.."
## below. This allows me to reference a particular combination
## rather than re-generating the same combination multiple times
myCombs <- lapply(2:max(numUnis), function(x) combinations(x,2L))
tempDF <- plyr::count(do.call(c, lapply(1:Len, function(i) {
myRange <- s[i]:e[i]
combs <- myCombs[[numUnis[i]-1L]]
vapply(1:nrow(combs), function(j) paste(sort(df$Code[myRange[combs[j,]]]), collapse = ","), "A,D")
})))
names(tempDF) <- c("Code.Combinations", "Count.of.ID")
tempDF
}
以下是一些指标。我没有测试@Carl 的解决方案,因为它给出的结果与其他解决方案不同。
set.seed(537)
ID <- do.call(c, lapply(1:100, function(x) rep(x, sample(2:26,1))))
temp <- rle(ID)
Code <- do.call(c, lapply(1:100, function(x) LETTERS[sample(temp$lengths[x])]))
TestDF <- data.frame(ID, Code, stringsAsFactors = FALSE)
system.time(t1 <- Noah(TestDF))
user system elapsed
97.05 0.31 97.42
system.time(t2 <- DTSolution(TestDF))
user system elapsed
0.43 0.00 0.42
system.time(t3 <- PairWiseCombo(TestDF))
user system elapsed
0.42 0.00 0.42
identical(sort(t3[,2]),sort(t2$IdCount))
TRUE
identical(sort(t3[,2]),sort(t1[,2]))
TRUE
使用 microbenchmark
我们有:
library(microbenchmark)
microbenchmark(Joseph = PairWiseCombo(TestDF), Psidom = DTSolution(TestDF), times = 10L)
Unit: milliseconds
expr min lq mean median uq max neval
Joseph 420.1090 433.9471 442.0133 446.4880 450.4420 452.7852 10
Psidom 396.8444 413.4933 416.3315 418.5573 420.9669 423.6303 10
总体而言,@Psidom 提供的 data.table
解决方案是最快的(不足为奇)。我的解决方案和 data.table
解决方案在非常大的示例上表现相似。然而,@Noah 提供的解决方案非常占用内存,无法在更大的数据帧上进行测试。
sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
更新
在调整@Carl 的解决方案之后,dplyr
方法是迄今为止最快的。下面是代码(你会看到我改变了哪些部分):
DPLYRSolution <- function(df) {
df <- df %>% full_join(df, by="ID") %>% group_by(Code.x,Code.y) %>% summarise(length(unique(ID))) %>% filter(Code.x!=Code.y)
## These two lines were added by me to remove "duplicate" rows
df <- mutate(df, Code=ifelse(Code.x < Code.y, paste(Code.x, Code.y), paste(Code.y, Code.x)))
df[which(!duplicated(df$Code)), ]
}
以下是新指标:
system.time(t4 <- DPLYRSolution(TestDF))
user system elapsed
0.03 0.00 0.03 ### Wow!!! really fast
microbenchmark(Joseph = PairWiseCombo(TestDF), Psidom = DTSolution(TestDF),
Carl = DPLYRSolution(TestDF), times = 10L)
Unit: milliseconds
expr min lq mean median uq max neval
Joseph 437.87235 442.7348 450.91085 452.77204 457.09465 461.85035 10
Psidom 407.81519 416.9444 422.62793 425.26041 429.02064 434.38881 10
Carl 44.33698 44.8066 48.39051 45.35073 54.06513 59.35653 10
## Equality Check
identical(sort(c(t4[,3])[[1]]), sort(t1[,2]))
[1] TRUE
仅使用基础:
df <- data.frame(ID=c(1,1,1,2,2,2,3,3,3,3,4,4),
code=c("A", "B", "C", "B", "C", "D", "C", "A", "D", "B", "D", "B"), stringsAsFactors =FALSE)
# Create data.frame of unique combinations of codes
e <- expand.grid(df$code, df$code)
e <- e[e[,1]!=e[,2],]
e1 <- as.data.frame(unique(t(apply(e, 1, sort))), stringsAsFactors = FALSE)
# Count the occurrence of each code combination across IDs
e1$count <- apply(e1, 1, function(y)
sum(sapply(unique(df$ID), function(x)
sum(y[1] %in% df$code[df$ID==x] & y[2] %in% df$code[df$ID==x]))))
# Turn the codes into a string and print output
out <- data.frame(Code.Combinations=do.call(paste, c(e1[,1:2], sep=", ")),
Count.of.ID=e1$count, stringsAsFactors = FALSE)
out
# Code.Combinations Count.of.ID
# 1 A, B 2
# 2 A, C 2
# 3 A, D 1
# 4 B, C 3
# 5 B, D 3
# 6 C, D 2