计算动态列数中的行匹配数
Counting the number of row matches across a dynamic number of columns
目标:我有 table 条记录(例如人),其中包含动态数量的变量(例如,电子邮件、phone、生日、...、其他)。我想将每一行与其他每一行进行比较,并对匹配的变量数求和。
# Input
my_data <- tibble(person = c("A","B","C","E","F"),
email = c("A@me.com", "A@me.com", NA, NA, NA),
phone = c(NA, NA, NA, 801, 801),
birthday = c("Jan1", "Jan1", NA, NA, NA))
# A tibble: 5 x 4
person email phone birthday
<chr> <chr> <dbl> <chr>
1 A A@me.com NA Jan1
2 B A@me.com NA Jan1
3 C NA NA NA
4 E NA 801 NA
5 F NA 801 NA
使用tidyr::expand_grid
,我可以获得所有记录组合。
# Munging
my_data_a <- my_data %>%
rename_with(~str_c(., "_a"), everything())
my_data_b <- my_data %>%
rename_with(~str_c(., "_b"), everything())
grid <- expand_grid(my_data_a, my_data_b)
# A tibble: 25 x 9
person_a email_a phone_a birthday_a person_b email_b phone_b birthday_b total
<chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <int>
1 A A@me.com NA Jan1 A A@me.com NA Jan1 2
2 A A@me.com NA Jan1 B A@me.com NA Jan1 2
3 A A@me.com NA Jan1 C NA NA NA 0
4 A A@me.com NA Jan1 E NA 801 NA 0
5 A A@me.com NA Jan1 F NA 801 NA 0
6 B A@me.com NA Jan1 A A@me.com NA Jan1 2
7 B A@me.com NA Jan1 B A@me.com NA Jan1 2
8 B A@me.com NA Jan1 C NA NA NA 0
9 B A@me.com NA Jan1 E NA 801 NA 0
10 B A@me.com NA Jan1 F NA 801 NA 0
# … with 15 more rows
现在我可以手动比较每个变量,但问题是我将拥有的不仅仅是电子邮件、phone、生日。
grid %>%
mutate(email_match = email_a == email_b,
phone_match = phone_a == phone_b,
birthday_match = birthday_a == birthday_b) %>%
mutate(across(everything(), ~replace_na(., 0)),
total = email_match + phone_match + birthday_match) %>%
select(person_a, person_b, total)
# Output
person_a person_b total
<chr> <chr> <dbl>
1 A A 2
2 A B 2
3 A C 0
4 A E 0
5 A F 0
6 B A 2
7 B B 2
8 B C 0
9 B E 0
10 B F 0
# … with 15 more rows
这个可以用for循环暴力破解,但是数据集比较大:
# Brute force
a_col_start <- 2
a_col_end <- ncol(grid)/2
b_col_start <- a_col_end + 2
b_col_end <- ncol(grid)
for (i in 1:nrow(grid)) {
grid[i,"total"] <- sum(grid[i,a_col_start:a_col_end] == grid[i,b_col_start:b_col_end], na.rm = TRUE)
}
grid %>%
select(person_a, person_b, total)
如果您只想要独特的组合,您可以使用 combn()
获取所有成对组合,并将其用作 Map()
的输入以获得每对行的匹配总和。
people <- combn(my_data$person, 2)
match_finder <- function(x, y) {
personx <- my_data[my_data$person == x, ]
persony <- my_data[my_data$person == y, ]
match_sum <- sum(personx == persony, na.rm = TRUE)
list(person1 = as.character(x), person2 = as.character(y), match_sum = match_sum)
}
output <- Map(match_finder, people[1, ], people[2, ], USE.NAMES = FALSE)
as.data.frame(do.call(rbind, output))
person1 person2 match_sum
1 A B 2
2 A C 0
3 A E 0
4 A F 0
5 B C 0
6 B E 0
7 B F 0
8 C E 0
9 C F 0
10 E F 1
您可以根据需要使用包 purrr
中的 pmap
函数。这将使按元素比较两个向量(在同一行中)变得容易:
library(dplyr)
library(purrr)
library(stringr)
grid %>%
mutate(total = pmap_dbl(grid, ~ sum(c(...)[str_detect(names(grid), "_a")][-1] ==
c(...)[str_detect(names(grid), "_b")][-1], na.rm = TRUE))) %>%
select(contains("person"), total)
# A tibble: 25 x 3
person_a person_b total
<chr> <chr> <dbl>
1 A A 2
2 A B 2
3 A C 0
4 A E 0
5 A F 0
6 B A 2
7 B B 2
8 B C 0
9 B E 0
10 B F 0
# ... with 15 more rows
目标:我有 table 条记录(例如人),其中包含动态数量的变量(例如,电子邮件、phone、生日、...、其他)。我想将每一行与其他每一行进行比较,并对匹配的变量数求和。
# Input
my_data <- tibble(person = c("A","B","C","E","F"),
email = c("A@me.com", "A@me.com", NA, NA, NA),
phone = c(NA, NA, NA, 801, 801),
birthday = c("Jan1", "Jan1", NA, NA, NA))
# A tibble: 5 x 4
person email phone birthday
<chr> <chr> <dbl> <chr>
1 A A@me.com NA Jan1
2 B A@me.com NA Jan1
3 C NA NA NA
4 E NA 801 NA
5 F NA 801 NA
使用tidyr::expand_grid
,我可以获得所有记录组合。
# Munging
my_data_a <- my_data %>%
rename_with(~str_c(., "_a"), everything())
my_data_b <- my_data %>%
rename_with(~str_c(., "_b"), everything())
grid <- expand_grid(my_data_a, my_data_b)
# A tibble: 25 x 9
person_a email_a phone_a birthday_a person_b email_b phone_b birthday_b total
<chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <int>
1 A A@me.com NA Jan1 A A@me.com NA Jan1 2
2 A A@me.com NA Jan1 B A@me.com NA Jan1 2
3 A A@me.com NA Jan1 C NA NA NA 0
4 A A@me.com NA Jan1 E NA 801 NA 0
5 A A@me.com NA Jan1 F NA 801 NA 0
6 B A@me.com NA Jan1 A A@me.com NA Jan1 2
7 B A@me.com NA Jan1 B A@me.com NA Jan1 2
8 B A@me.com NA Jan1 C NA NA NA 0
9 B A@me.com NA Jan1 E NA 801 NA 0
10 B A@me.com NA Jan1 F NA 801 NA 0
# … with 15 more rows
现在我可以手动比较每个变量,但问题是我将拥有的不仅仅是电子邮件、phone、生日。
grid %>%
mutate(email_match = email_a == email_b,
phone_match = phone_a == phone_b,
birthday_match = birthday_a == birthday_b) %>%
mutate(across(everything(), ~replace_na(., 0)),
total = email_match + phone_match + birthday_match) %>%
select(person_a, person_b, total)
# Output
person_a person_b total
<chr> <chr> <dbl>
1 A A 2
2 A B 2
3 A C 0
4 A E 0
5 A F 0
6 B A 2
7 B B 2
8 B C 0
9 B E 0
10 B F 0
# … with 15 more rows
这个可以用for循环暴力破解,但是数据集比较大:
# Brute force
a_col_start <- 2
a_col_end <- ncol(grid)/2
b_col_start <- a_col_end + 2
b_col_end <- ncol(grid)
for (i in 1:nrow(grid)) {
grid[i,"total"] <- sum(grid[i,a_col_start:a_col_end] == grid[i,b_col_start:b_col_end], na.rm = TRUE)
}
grid %>%
select(person_a, person_b, total)
如果您只想要独特的组合,您可以使用 combn()
获取所有成对组合,并将其用作 Map()
的输入以获得每对行的匹配总和。
people <- combn(my_data$person, 2)
match_finder <- function(x, y) {
personx <- my_data[my_data$person == x, ]
persony <- my_data[my_data$person == y, ]
match_sum <- sum(personx == persony, na.rm = TRUE)
list(person1 = as.character(x), person2 = as.character(y), match_sum = match_sum)
}
output <- Map(match_finder, people[1, ], people[2, ], USE.NAMES = FALSE)
as.data.frame(do.call(rbind, output))
person1 person2 match_sum
1 A B 2
2 A C 0
3 A E 0
4 A F 0
5 B C 0
6 B E 0
7 B F 0
8 C E 0
9 C F 0
10 E F 1
您可以根据需要使用包 purrr
中的 pmap
函数。这将使按元素比较两个向量(在同一行中)变得容易:
library(dplyr)
library(purrr)
library(stringr)
grid %>%
mutate(total = pmap_dbl(grid, ~ sum(c(...)[str_detect(names(grid), "_a")][-1] ==
c(...)[str_detect(names(grid), "_b")][-1], na.rm = TRUE))) %>%
select(contains("person"), total)
# A tibble: 25 x 3
person_a person_b total
<chr> <chr> <dbl>
1 A A 2
2 A B 2
3 A C 0
4 A E 0
5 A F 0
6 B A 2
7 B B 2
8 B C 0
9 B E 0
10 B F 0
# ... with 15 more rows