数据框的行频率忽略 R 中的列顺序
Row frequency of a data frame ignoring column order in R
我想为数据框的行建立频率表。
我已经找到了一些方法，但它们都会考虑列的顺序。我希望得到忽略列顺序的行频率。
例如:
0 A B
1 B A
2 C D
3 D C
4 C D
我希望获得:
A B 2
C D 3
提前致谢。
首先用 sort 对每一行内的值进行排序，然后按所有列分组并计算行数。
library(dplyr)

# Sort the values within each row (every column except the first) so that
# rows differing only in column order become identical.
# `na.last = TRUE` keeps missing values; plain `sort()` drops them, which
# would yield rows of unequal length and stop `apply()` returning a matrix.
df1 <- data.frame(t(apply(df[-1], 1, sort, na.last = TRUE)))

# Group by every column and count the rows in each group. `ungroup()` makes
# sure the summary is not left grouped for later operations.
df1 %>%
  group_by_all() %>%
  summarise(Freq = n()) %>%
  ungroup()
# X1 X2 Freq
# <fct> <fct> <int>
#1 A B 2
#2 C D 3
数据
# Example data: integer column V1 and two factor columns V2 and V3 that
# share the levels "A", "B", "C", "D".
df <- data.frame(
  V1 = 0:4,
  V2 = factor(c("A", "B", "C", "D", "C"), levels = c("A", "B", "C", "D")),
  V3 = factor(c("B", "A", "D", "C", "D"), levels = c("A", "B", "C", "D"))
)
library("tidyverse")

# Parse the sample rows from inline text; read.table assigns the default
# column names V1, V2 and V3, and strings are kept as character vectors.
x <- read.table(
  stringsAsFactors = FALSE,
  text = "0 A B
1 B A
2 C D
3 D C
4 C D"
)

x %>%
  mutate(
    # The columns to combine are listed explicitly (V2 and V3). For each
    # row their values are sorted and pasted into one space-separated key,
    # so "A B" and "B A" produce the same key.
    pair = pmap_chr(
      list(V2, V3),
      function(...) paste(sort(c(...)), collapse = " ")
    )
  ) %>%
  # Count how many rows share each key.
  count(pair)
#> # A tibble: 2 x 2
#> pair n
#> <chr> <int>
#> 1 A B 2
#> 2 C D 3
由 reprex package (v0.2.1) 创建于 2019-03-29
我们可以使用 pmin/pmax 来创建分组变量，这样应该更高效。
library(dplyr)

# Build an order-independent key for each row: the smaller of the two values
# goes into V2N and the larger into V3N, then count the rows per key.
# Factor columns are converted to character first, because comparison
# operators (and therefore pmin()/pmax()) are not meaningful for unordered
# factors. Character input passes through the conversion unchanged.
df %>%
  mutate_if(is.factor, as.character) %>%
  count(V2N = pmin(V2, V3), V3N = pmax(V2, V3))
# A tibble: 2 x 3
# V2N V3N n
# <chr> <chr> <int>
#1 A B 2
#2 C D 3
基准测试
# Build a large benchmark input by stacking 1e6 copies of the rows of df.
df1 <- df[rep(seq_len(nrow(df)), 1e6),]
# Time the pmin/pmax-based count on the enlarged data.
system.time({
df1 %>%
count(V2N = pmin(V2, V3), V3N = pmax(V2, V3))
})
#user system elapsed
# 1.164 0.043 1.203
# Time the apply/sort approach on the same enlarged data (df1) for comparison.
system.time({
# Sort the values within each row (all columns except the first) and rebuild
# a data frame from the transposed result.
df2 <- data.frame(t(apply(df1[-1], 1, sort)))
# Group by every column and count the rows in each group.
df2 %>%
group_by_all() %>%
summarise(Freq = n())
})
# user system elapsed
# 160.357 1.227 161.544
数据
# Example data: integer column V1 and character columns V2 and V3.
df <- data.frame(
  V1 = 0:4,
  V2 = c("A", "B", "C", "D", "C"),
  V3 = c("B", "A", "D", "C", "D"),
  stringsAsFactors = FALSE
)
我想为数据框的行建立频率表。
我已经找到了一些方法，但它们都会考虑列的顺序。我希望得到忽略列顺序的行频率。
例如:
0 A B
1 B A
2 C D
3 D C
4 C D
我希望获得:
A B 2
C D 3
提前致谢。
首先用 sort 对每一行内的值进行排序，然后按所有列分组并计算行数。
library(dplyr)

# Sort the values within each row (every column except the first) so that
# rows differing only in column order become identical.
# `na.last = TRUE` keeps missing values; plain `sort()` drops them, which
# would yield rows of unequal length and stop `apply()` returning a matrix.
df1 <- data.frame(t(apply(df[-1], 1, sort, na.last = TRUE)))

# Group by every column and count the rows in each group. `ungroup()` makes
# sure the summary is not left grouped for later operations.
df1 %>%
  group_by_all() %>%
  summarise(Freq = n()) %>%
  ungroup()
# X1 X2 Freq
# <fct> <fct> <int>
#1 A B 2
#2 C D 3
数据
# Example data: integer column V1 and two factor columns V2 and V3 that
# share the levels "A", "B", "C", "D".
df <- data.frame(
  V1 = 0:4,
  V2 = factor(c("A", "B", "C", "D", "C"), levels = c("A", "B", "C", "D")),
  V3 = factor(c("B", "A", "D", "C", "D"), levels = c("A", "B", "C", "D"))
)
library("tidyverse")

# Parse the sample rows from inline text; read.table assigns the default
# column names V1, V2 and V3, and strings are kept as character vectors.
x <- read.table(
  stringsAsFactors = FALSE,
  text = "0 A B
1 B A
2 C D
3 D C
4 C D"
)

x %>%
  mutate(
    # The columns to combine are listed explicitly (V2 and V3). For each
    # row their values are sorted and pasted into one space-separated key,
    # so "A B" and "B A" produce the same key.
    pair = pmap_chr(
      list(V2, V3),
      function(...) paste(sort(c(...)), collapse = " ")
    )
  ) %>%
  # Count how many rows share each key.
  count(pair)
#> # A tibble: 2 x 2
#> pair n
#> <chr> <int>
#> 1 A B 2
#> 2 C D 3
由 reprex package (v0.2.1) 创建于 2019-03-29
我们可以使用 pmin/pmax 来创建分组变量，这样应该更高效。
library(dplyr)

# Build an order-independent key for each row: the smaller of the two values
# goes into V2N and the larger into V3N, then count the rows per key.
# Factor columns are converted to character first, because comparison
# operators (and therefore pmin()/pmax()) are not meaningful for unordered
# factors. Character input passes through the conversion unchanged.
df %>%
  mutate_if(is.factor, as.character) %>%
  count(V2N = pmin(V2, V3), V3N = pmax(V2, V3))
# A tibble: 2 x 3
# V2N V3N n
# <chr> <chr> <int>
#1 A B 2
#2 C D 3
基准测试
# Build a large benchmark input by stacking 1e6 copies of the rows of df.
df1 <- df[rep(seq_len(nrow(df)), 1e6),]
# Time the pmin/pmax-based count on the enlarged data.
system.time({
df1 %>%
count(V2N = pmin(V2, V3), V3N = pmax(V2, V3))
})
#user system elapsed
# 1.164 0.043 1.203
# Time the apply/sort approach on the same enlarged data (df1) for comparison.
system.time({
# Sort the values within each row (all columns except the first) and rebuild
# a data frame from the transposed result.
df2 <- data.frame(t(apply(df1[-1], 1, sort)))
# Group by every column and count the rows in each group.
df2 %>%
group_by_all() %>%
summarise(Freq = n())
})
# user system elapsed
# 160.357 1.227 161.544
数据
# Example data: integer column V1 and character columns V2 and V3.
df <- data.frame(
  V1 = 0:4,
  V2 = c("A", "B", "C", "D", "C"),
  V3 = c("B", "A", "D", "C", "D"),
  stringsAsFactors = FALSE
)