将匹配 NA 的 df 的字符行与所有内容进行比较,并根据比较创建新列或 df

Compare character rows of a df matching NA to everything and create new column or df based on comparison

我有一个包含字符值的非常大的数据框。我想将行相互比较并根据比较创建 ID。问题是 df 中有 NA,我希望将它们评估为匹配任何值。 另一个问题是 ID 也需要在同一步骤中创建(或者我以过于复杂的方式考虑问题)。

这是我创建的玩具 df:

library(tidyverse)
library(purrr)

# make toy df
Set1 <- c("A", "B", "C","A")
Set2 <- c("A", "D", "B", "A")
Set3 <- c(NA, "B", "C", "A")
Set4 <- c("A", "G", "B", "A")
Set5 <- c("F", "G", NA, "F")
Set6 <- c("A", "B", "C", "C")
sets <- rbind(Set1, Set2, Set3, Set4, Set5, Set6)
colnames(sets) <- c("Var1", "Var2", "Var3", "Var4")
sets

     Var1 Var2 Var3 Var4
Set1 "A"  "B"  "C"  "A" 
Set2 "A"  "D"  "B"  "A" 
Set3 NA   "B"  "C"  "A" 
Set4 "A"  "D"  "B"  "A" 
Set5 "F"  "G"  NA   "F" 
Set6 "A"  "B"  "C"  "C" 

这是所需的输出,作为单独的 df 或作为新列,两者都一样好:

# as new column
    Var1 Var2 Var3 Var4 COMP
Set1 "A"  "B"  "C"  "A" "Group1"
Set2 "A"  "D"  "B"  "A" "Group2
Set3 NA   "B"  "C"  "A" "Group1"
Set4 "A"  "D"  "B"  "A" "Group3"
Set5 "F"  "G"  NA   "F" "Group4"
Set6 "A"  "B"  "C"  "C" "Group5"

# as new df
      COMP
Set1 "Group1"
Set2 "Group2
Set3 "Group1"
Set4 "Group3"
Set5 "Group4"
Set6 "Group5"

我认为这可以通过 rowwise()map 来实现,但即使阅读了类似的 我也无法弄清楚如何实现这一点,尤其是如何命名新的团体连续不断地。任何想法将不胜感激。

一个非常丑陋的 while 循环解决方案,但我认为它有效。

#Change sets to dataframe
sets <- data.frame(sets)
result <- integer(nrow(sets))
group_count <- 1
x <- 1

while(any(result == 0)) {
  a <- sets[-x, !is.na(sets[x, ])]
  b <- na.omit(unlist(sets[x, ]))
  inds <- which(rowSums(sweep(a, 2, as.matrix(b), `==`), na.rm = TRUE) == length(b))
  #If a complete match is found
  if(length(inds)) {
    #Need to adjust the match since we are dropping 1 row from original df
    if(all(inds > x)) {
      result[c(x, inds + 1)] <- group_count  
    } else {
      result[c(x, inds)] <- group_count  
    }
  } else {
    result[x] <- group_count
  }
  group_count <- group_count + 1
  #Get the next row number to check. 
  x <- which(result == 0)[1]
}

#Reset so you get counts in order 1, 2, 3...
result <- match(result, unique(result))
result
[1] 1 2 1 2 3 4

这里的逻辑是将每一行值与数据框中的每一行进行比较,删除它们的 NA 值,如果匹配,我们用 group_count 值更新这些行。

您可以在创建群组 ID 后进行一些模糊连接:

library(tidyverse)
library(fuzzyjoin)
library(stringdist)
#> 
#> Attaching package: 'stringdist'
#> The following object is masked from 'package:tidyr':
#> 
#>     extract

# make toy df
Set1 <- c("A", "B", "C","A")
Set2 <- c("A", "D", "B", "A")
Set3 <- c(NA, "B", "C", "A")
Set4 <- c("A", "D", "B", "A")
Set5 <- c("F", "G", NA, "F")
Set6 <- c("A", "B", "C", "C")
sets <- rbind(Set1, Set2, Set3, Set4, Set5, Set6)
colnames(sets) <- c("Var1", "Var2", "Var3", "Var4")
sets
#>      Var1 Var2 Var3 Var4
#> Set1 "A"  "B"  "C"  "A" 
#> Set2 "A"  "D"  "B"  "A" 
#> Set3 NA   "B"  "C"  "A" 
#> Set4 "A"  "D"  "B"  "A" 
#> Set5 "F"  "G"  NA   "F" 
#> Set6 "A"  "B"  "C"  "C"

elements <-
  sets %>%
  as_tibble() %>%
  pivot_longer(everything()) %>%
  pull(value) %>%
  unique() %>%
  discard(is.na)
elements
#> [1] "A" "B" "C" "D" "F" "G"

groups <-
  expand_grid(
    Var1 = elements,
    Var2 = elements,
    Var3 = elements,
    Var4 = elements
  ) %>%
    mutate(group = row_number() %>% paste0("Group", .)) %>%
    unite(group_str, starts_with("Var"))
groups
#> # A tibble: 1,296 × 2
#>    group_str group  
#>    <chr>     <chr>  
#>  1 A_A_A_A   Group1 
#>  2 A_A_A_B   Group2 
#>  3 A_A_A_C   Group3 
#>  4 A_A_A_D   Group4 
#>  5 A_A_A_F   Group5 
#>  6 A_A_A_G   Group6 
#>  7 A_A_B_A   Group7 
#>  8 A_A_B_B   Group8 
#>  9 A_A_B_C   Group9 
#> 10 A_A_B_D   Group10
#> # … with 1,286 more rows

匹配 iff 字符串 x 和 y 是准确的 但如果有一个 #

也允许少一个字符
compare <- function(x, y) {
    (
      stringdist(x, y) <= 1 & paste0(x, y) %>% str_count("#") == 1
    ) |
    (
      x == y
    )
}

sets %>%
  as_tibble(rownames = "set") %>%
  mutate_all(~ .x %>% replace_na("#")) %>%
  unite(group_str, starts_with("Var")) %>%
  fuzzy_left_join(groups, match_fun = compare)
#> Joining by: "group_str"
#> # A tibble: 16 × 4
#>    set   group_str.x group_str.y group    
#>    <chr> <chr>       <chr>       <chr>    
#>  1 Set1  A_B_C_A     A_B_C_A     Group49  
#>  2 Set2  A_D_B_A     A_D_B_A     Group115 
#>  3 Set3  #_B_C_A     A_B_C_A     Group49  
#>  4 Set3  #_B_C_A     B_B_C_A     Group265 
#>  5 Set3  #_B_C_A     C_B_C_A     Group481 
#>  6 Set3  #_B_C_A     D_B_C_A     Group697 
#>  7 Set3  #_B_C_A     F_B_C_A     Group913 
#>  8 Set3  #_B_C_A     G_B_C_A     Group1129
#>  9 Set4  A_D_B_A     A_D_B_A     Group115 
#> 10 Set5  F_G_#_F     F_G_A_F     Group1049
#> 11 Set5  F_G_#_F     F_G_B_F     Group1055
#> 12 Set5  F_G_#_F     F_G_C_F     Group1061
#> 13 Set5  F_G_#_F     F_G_D_F     Group1067
#> 14 Set5  F_G_#_F     F_G_F_F     Group1073
#> 15 Set5  F_G_#_F     F_G_G_F     Group1079
#> 16 Set6  A_B_C_C     A_B_C_C     Group51

reprex package (v2.0.1)

于 2021-09-25 创建

您可以将 NA 替换为 .,粘贴到字符串中并使用 grepl() 进行模式匹配。

library(magrittr)

sets <- as.data.frame(sets)

sets %>%
  replace(is.na(sets), ".") %>%
  do.call(paste0, .) %>%
  outer(., ., function(x, y) mapply(grepl, x, y)) %>%
  t() %>%
  max.col(ties.method = "last") %>%
  match(unique(.))

[1] 1 2 1 2 3 4

但是有可能将 NAs 视为 wild 将匹配多行,因此这样做可能更安全:

# Change Row 6 so both Row 6 and Row 1 match Row 3
Set6 <- c("B", "B", "C", "A")

sets %>%
  replace(is.na(sets), ".") %>%
  do.call(paste0, .) %>%
  outer(., ., function(x, y) mapply(grepl, x, y)) %>%
  apply(2, which)

[[1]]
[1] 1 3

[[2]]
[1] 2 4

[[3]]
[1] 3

[[4]]
[1] 2 4

[[5]]
[1] 5

[[6]]
[1] 3 6

这表明哪一行与另一行(包括它本身)匹配。