计算动态列数中的行匹配数

Counting the number of row matches across a dynamic number of columns

目标:我有 table 条记录(例如人),其中包含动态数量的变量(例如,电子邮件、phone、生日、...、其他)。我想将每一行与其他每一行进行比较,并对匹配的变量数求和。

# Input
my_data <- tibble(person = c("A","B","C","E","F"),
                 email = c("A@me.com", "A@me.com", NA, NA, NA),
                 phone = c(NA, NA, NA, 801, 801),
               birthday = c("Jan1", "Jan1", NA, NA, NA))

# A tibble: 5 x 4
  person email    phone birthday
  <chr>  <chr>    <dbl> <chr>   
1 A      A@me.com    NA Jan1    
2 B      A@me.com    NA Jan1    
3 C      NA          NA NA      
4 E      NA         801 NA      
5 F      NA         801 NA  

使用tidyr::expand_grid,我可以获得所有记录组合。

# Munging
my_data_a <- my_data %>%
  rename_with(~str_c(., "_a"), everything())
my_data_b <- my_data %>%
  rename_with(~str_c(., "_b"), everything())
grid <- expand_grid(my_data_a, my_data_b)

# A tibble: 25 x 9
   person_a email_a  phone_a birthday_a person_b email_b  phone_b birthday_b total
   <chr>    <chr>      <dbl> <chr>      <chr>    <chr>      <dbl> <chr>      <int>
 1 A        A@me.com      NA Jan1       A        A@me.com      NA Jan1           2
 2 A        A@me.com      NA Jan1       B        A@me.com      NA Jan1           2
 3 A        A@me.com      NA Jan1       C        NA            NA NA             0
 4 A        A@me.com      NA Jan1       E        NA           801 NA             0
 5 A        A@me.com      NA Jan1       F        NA           801 NA             0
 6 B        A@me.com      NA Jan1       A        A@me.com      NA Jan1           2
 7 B        A@me.com      NA Jan1       B        A@me.com      NA Jan1           2
 8 B        A@me.com      NA Jan1       C        NA            NA NA             0
 9 B        A@me.com      NA Jan1       E        NA           801 NA             0
10 B        A@me.com      NA Jan1       F        NA           801 NA             0
# … with 15 more rows

现在我可以手动比较每个变量,但问题是我将拥有的不仅仅是电子邮件、phone、生日。

grid %>%
  mutate(email_match = email_a == email_b,
         phone_match = phone_a == phone_b,
         birthday_match = birthday_a == birthday_b) %>%
  mutate(across(everything(), ~replace_na(., 0)),
         total = email_match + phone_match + birthday_match) %>%
  select(person_a, person_b, total)

# Output
   person_a person_b total
   <chr>    <chr>    <dbl>
 1 A        A            2
 2 A        B            2
 3 A        C            0
 4 A        E            0
 5 A        F            0
 6 B        A            2
 7 B        B            2
 8 B        C            0
 9 B        E            0
10 B        F            0
# … with 15 more rows

这个可以用for循环暴力破解,但是数据集比较大:

# Brute force
a_col_start <- 2
a_col_end <- ncol(grid)/2
b_col_start <- a_col_end + 2
b_col_end <- ncol(grid)
for (i in 1:nrow(grid)) {
  grid[i,"total"] <- sum(grid[i,a_col_start:a_col_end] == grid[i,b_col_start:b_col_end], na.rm = TRUE)
}
grid %>%
  select(person_a, person_b, total)

如果您只想要独特的组合,您可以使用 combn() 获取所有成对组合,并将其用作 Map() 的输入以获得每对行的匹配总和。

people <- combn(my_data$person, 2)

match_finder <- function(x, y) {
  personx <- my_data[my_data$person == x, ]
  persony <- my_data[my_data$person == y, ]
  match_sum <- sum(personx == persony, na.rm = TRUE)
  list(person1 = as.character(x), person2 = as.character(y), match_sum = match_sum)
  }

output <- Map(match_finder, people[1, ], people[2, ], USE.NAMES = FALSE)

as.data.frame(do.call(rbind, output))

   person1 person2 match_sum
1        A       B         2
2        A       C         0
3        A       E         0
4        A       F         0
5        B       C         0
6        B       E         0
7        B       F         0
8        C       E         0
9        C       F         0
10       E       F         1

您可以根据需要使用包 purrr 中的 pmap 函数。这将使按元素比较两个向量(在同一行中)变得容易:

library(dplyr)
library(purrr)
library(stringr)


grid %>%
  mutate(total = pmap_dbl(grid, ~ sum(c(...)[str_detect(names(grid), "_a")][-1] == 
                        c(...)[str_detect(names(grid), "_b")][-1], na.rm = TRUE))) %>%
  select(contains("person"), total)


# A tibble: 25 x 3
   person_a person_b total
   <chr>    <chr>    <dbl>
 1 A        A            2
 2 A        B            2
 3 A        C            0
 4 A        E            0
 5 A        F            0
 6 B        A            2
 7 B        B            2
 8 B        C            0
 9 B        E            0
10 B        F            0
# ... with 15 more rows