基于R中的多列匹配数据框
match data frames based on multiple columns in R
我有两个看起来像这样的巨大数据集。
df2 中有一个水果,PEACH,由于某种原因从 df1 中丢失了。
我想在df1中添加缺少的水果。
library(tidyverse)
df1 <- tibble(central_fruit=c("ananas","apple"),
fruits=c("ananas,anan,anannas",("apple,appl,appless")),
counts=c("100,10,1","50,20,2"))
df1
#> # A tibble: 2 × 3
#> central_fruit fruits counts
#> <chr> <chr> <chr>
#> 1 ananas ananas,anan,anannas 100,10,1
#> 2 apple apple,appl,appless 50,20,2
df2 <- tibble(fruit=c("ananas","anan","anannas","apple","appl","appless","PEACH"),
counts=c(100,10,1,50,20,2,1000))
df2
#> # A tibble: 7 × 2
#> fruit counts
#> <chr> <dbl>
#> 1 ananas 100
#> 2 anan 10
#> 3 anannas 1
#> 4 apple 50
#> 5 appl 20
#> 6 appless 2
#> 7 PEACH 1000
由 reprex package (v2.0.1)
于 2022-03-20 创建
我希望我的数据看起来像这样
df1
central_fruit fruits counts
<chr> <chr> <chr>
1 ananas ananas,anan,anannas 100,10,1
2 apple apple,appl,appless 50,20,2
3 PEACH NA 1000
非常感谢任何帮助或建议
您可以直接使用 df1
中的一组水果来过滤 df2
,然后将它们绑定在一起。
library(tidyverse)
present <- df1$fruits |>
str_split(",") |>
unlist()
df2 |>
rename(central_fruit = fruit) |>
filter(! central_fruit %in% present) |>
mutate(counts = as.character(counts)) |>
bind_rows(df1)
#> # A tibble: 3 × 3
#> central_fruit counts fruits
#> <chr> <chr> <chr>
#> 1 PEACH 1000 <NA>
#> 2 ananas 100,10,1 ananas,anan,anannas
#> 3 apple 50,20,2 apple,appl,appless
您可以通过拆分逗号 fruits
和 counts
变量来获得长格式的数据集,用 df2
执行 full_join
,调整 NA
值并为每个 central_fruit
折叠值。
library(dplyr)
library(tidyr)
df1 %>%
separate_rows(fruits, counts, convert = TRUE) %>%
full_join(df2, by = c('fruits' = 'fruit')) %>%
transmute(central_fruit = ifelse(is.na(central_fruit), fruits, central_fruit),
fruits = ifelse(is.na(counts.x), NA, fruits),
counts = coalesce(counts.x, counts.y)) %>%
group_by(central_fruit) %>%
summarise(across(.fns = toString))
# central_fruit fruits counts
# <chr> <chr> <chr>
#1 ananas ananas, anan, anannas 100, 10, 1
#2 apple apple, appl, appless 50, 20, 2
#3 PEACH NA 1000
请在下面找到一种可能的 data.table
方法。
Reprex
- 代码
library(tidyverse) # to read your tibbles
library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = .(central_fruit = fruit)
][, `:=` (counts = fcoalesce(counts, as.character(i.counts)), i.counts = NULL)
][central_fruit %chin% c(df1$central_fruit, setdiff(df2$fruit, unlist(strsplit(df1$fruit, ","))))][]
- 输出
#> central_fruit fruits counts
#> 1: ananas ananas,anan,anannas 100,10,1
#> 2: apple apple,appl,appless 50,20,2
#> 3: PEACH <NA> 1000
由 reprex package (v2.0.1)
于 2022 年 3 月 20 日创建
我有两个看起来像这样的巨大数据集。
df2 中有一个水果,PEACH,由于某种原因从 df1 中丢失了。 我想在df1中添加缺少的水果。
library(tidyverse)
df1 <- tibble(central_fruit=c("ananas","apple"),
fruits=c("ananas,anan,anannas",("apple,appl,appless")),
counts=c("100,10,1","50,20,2"))
df1
#> # A tibble: 2 × 3
#> central_fruit fruits counts
#> <chr> <chr> <chr>
#> 1 ananas ananas,anan,anannas 100,10,1
#> 2 apple apple,appl,appless 50,20,2
df2 <- tibble(fruit=c("ananas","anan","anannas","apple","appl","appless","PEACH"),
counts=c(100,10,1,50,20,2,1000))
df2
#> # A tibble: 7 × 2
#> fruit counts
#> <chr> <dbl>
#> 1 ananas 100
#> 2 anan 10
#> 3 anannas 1
#> 4 apple 50
#> 5 appl 20
#> 6 appless 2
#> 7 PEACH 1000
由 reprex package (v2.0.1)
于 2022-03-20 创建我希望我的数据看起来像这样
df1
central_fruit fruits counts
<chr> <chr> <chr>
1 ananas ananas,anan,anannas 100,10,1
2 apple apple,appl,appless 50,20,2
3 PEACH NA 1000
非常感谢任何帮助或建议
您可以直接使用 df1
中的一组水果来过滤 df2
,然后将它们绑定在一起。
library(tidyverse)
present <- df1$fruits |>
str_split(",") |>
unlist()
df2 |>
rename(central_fruit = fruit) |>
filter(! central_fruit %in% present) |>
mutate(counts = as.character(counts)) |>
bind_rows(df1)
#> # A tibble: 3 × 3
#> central_fruit counts fruits
#> <chr> <chr> <chr>
#> 1 PEACH 1000 <NA>
#> 2 ananas 100,10,1 ananas,anan,anannas
#> 3 apple 50,20,2 apple,appl,appless
您可以通过拆分逗号 fruits
和 counts
变量来获得长格式的数据集,用 df2
执行 full_join
,调整 NA
值并为每个 central_fruit
折叠值。
library(dplyr)
library(tidyr)
df1 %>%
separate_rows(fruits, counts, convert = TRUE) %>%
full_join(df2, by = c('fruits' = 'fruit')) %>%
transmute(central_fruit = ifelse(is.na(central_fruit), fruits, central_fruit),
fruits = ifelse(is.na(counts.x), NA, fruits),
counts = coalesce(counts.x, counts.y)) %>%
group_by(central_fruit) %>%
summarise(across(.fns = toString))
# central_fruit fruits counts
# <chr> <chr> <chr>
#1 ananas ananas, anan, anannas 100, 10, 1
#2 apple apple, appl, appless 50, 20, 2
#3 PEACH NA 1000
请在下面找到一种可能的 data.table
方法。
Reprex
- 代码
library(tidyverse) # to read your tibbles
library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = .(central_fruit = fruit)
][, `:=` (counts = fcoalesce(counts, as.character(i.counts)), i.counts = NULL)
][central_fruit %chin% c(df1$central_fruit, setdiff(df2$fruit, unlist(strsplit(df1$fruit, ","))))][]
- 输出
#> central_fruit fruits counts
#> 1: ananas ananas,anan,anannas 100,10,1
#> 2: apple apple,appl,appless 50,20,2
#> 3: PEACH <NA> 1000
由 reprex package (v2.0.1)
于 2022 年 3 月 20 日创建