部分连接(join)两个 dataframes/tibbles
Partially join two dataframes/tibbles
我有两个data-frames/tibbles。第一个是国家列表,其中包含描述这些国家的多个变量。该数据框包含多个缺失值。缺少哪些变量取决于国家/地区。
library(tidyverse)
# Example data: ten rows across countries A, B and C. Which of var1..var4
# are NA differs by country.
df1 <- as_tibble(
  data.frame(
    id = 1:10,
    country = rep(c("A", "B", "C"), times = c(4, 2, 4)),
    var1 = c(rep(NA, 4), 1, 1, 2, 1, 2, 1),
    var2 = c(1, 1, 2, 2, NA, NA, 1, 2, 2, 2),
    var3 = c("NO", "YES", "NO", "YES", "NO", "NO", rep(NA, 4)),
    var4 = c(rep(NA, 4), "NO", "NO", rep(NA, 4))
  )
)
然后我有第二个数据框 (df2),我想把它与第一个表连接起来:
# Second table: eight rows with no missing values in var1..var4.
df2 <- as_tibble(
  data.frame(
    id = c(2, 3, 5, 6, 7, 8, 9, 10),
    country = rep(c("A", "B", "C"), times = c(2, 2, 4)),
    var1 = c(1, 2, 2, 2, 2, 1, 2, 1),
    var2 = c(2, 1, 1, 1, 1, 2, 1, 1),
    var3 = c("NO", "NO", "YES", "NO", "NO", "NO", "YES", "NO"),
    var4 = c("YES", "NO", "NO", "YES", "YES", "NO", "NO", "YES")
  )
)
最后,我想得到的是第一个数据框,其中的缺失值用第二个数据框来补全。所以我想使用 id 变量连接这两个表。但是,这种连接应当只是 "部分"(partly)的,因为缺少哪些变量取决于国家/地区:例如对于国家 "A",只应填充变量 var1 和 var4;对于国家 "C",应使用 df2 填充变量 var3 和 var4。df1 包含的记录比 df2 多。
谁能告诉我哪个是解决该问题的最佳解决方案?
非常感谢!
这是针对所提供数据的一种可能解决方案。我在两个数据框中都添加了 stringsAsFactors = FALSE。看到数据后,我认为你需要的是按行绑定(bind)它们,而不是连接(join)它们。绑定数据后,我按 id、country 和 index 排序;index 表示该行来自哪个数据框。然后,我按 id 和 country 分组。对于有两行的组,第一行包含需要填充的 NA,这些 NA 位于四列(即 var1 至 var4)中。我使用 zoo 包中的 na.locf() 来完成填充。每个组的第一行来自 df1,也就是需要保留的行;这里我选择使用 distinct(),slice(1) 也是一种选择。
# Rebuild both example tables as plain data frames.
# stringsAsFactors = FALSE keeps the text columns as character vectors;
# it is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
df1 <- data.frame(
  id = 1:10,
  country = c("A", "A", "A", "A", "B", "B", "C", "C", "C", "C"),
  var1 = c(NA, NA, NA, NA, 1, 1, 2, 1, 2, 1),
  var2 = c(1, 1, 2, 2, NA, NA, 1, 2, 2, 2),
  var3 = c("NO", "YES", "NO", "YES", "NO", "NO", NA, NA, NA, NA),
  var4 = c(NA, NA, NA, NA, "NO", "NO", NA, NA, NA, NA),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  id = c(2, 3, 5, 6, 7, 8, 9, 10),
  country = c("A", "A", "B", "B", "C", "C", "C", "C"),
  var1 = c(1, 2, 2, 2, 2, 1, 2, 1),
  var2 = c(2, 1, 1, 1, 1, 2, 1, 1),
  var3 = c("NO", "NO", "YES", "NO", "NO", "NO", "YES", "NO"),
  var4 = c("YES", "NO", "NO", "YES", "YES", "NO", "NO", "YES"),
  stringsAsFactors = FALSE
)
library(dplyr)
library(zoo)
# Stack df1 on top of df2; `index` records which table each row came from.
# Within each id/country pair, sorted so the df1 row comes first, pull the
# later row's values up into the NA cells, then keep the first row of each
# group.
#
# A formula lambda replaces the deprecated funs(). na.rm = FALSE makes
# na.locf() return a vector of unchanged length even when a value is
# missing in every row of the group. ungroup() returns a plain tibble
# instead of leaving the result grouped by id and country.
bind_rows(df1, df2, .id = "index") %>%
  arrange(id, country, index) %>%
  group_by(id, country) %>%
  mutate_at(
    vars(var1:var4),
    ~ if (n() > 1) na.locf(., fromLast = TRUE, na.rm = FALSE) else .
  ) %>%
  distinct(id, .keep_all = TRUE) %>%
  select(-index) %>%
  ungroup()
id country var1 var2 var3 var4
<dbl> <chr> <dbl> <dbl> <chr> <chr>
1 1.00 A NA 1.00 NO <NA>
2 2.00 A 1.00 1.00 YES YES
3 3.00 A 2.00 2.00 NO NO
4 4.00 A NA 2.00 YES <NA>
5 5.00 B 1.00 1.00 NO NO
6 6.00 B 1.00 1.00 NO NO
7 7.00 C 2.00 1.00 NO YES
8 8.00 C 1.00 2.00 NO NO
9 9.00 C 2.00 2.00 YES NO
10 10.0 C 1.00 2.00 NO YES
更新后的建议:可以保留列的类型,但需要逐列手写一些代码。
# Type-preserving fill: attach df2's values to df1 under "new"-prefixed
# names, then take df1's value where present and df2's otherwise.
# Starting from df1 with left_join() keeps every df1 row in df1's own
# order. coalesce(x, y) is the direct form of if_else(is.na(x), y, x).
df1 %>%
  left_join(
    df2 %>%
      select(-country) %>%
      rename_at(vars(starts_with("var")), ~ paste0("new", .)),
    by = "id"
  ) %>%
  mutate(
    var1 = coalesce(var1, newvar1),
    var2 = coalesce(var2, newvar2),
    var3 = coalesce(var3, newvar3),
    var4 = coalesce(var4, newvar4)
  ) %>%
  select(-starts_with("newvar"))
另一种方法是(在管道之外)对相关列名进行循环:
# Same fill as a loop over the var* columns, so the column names do not
# have to be written out one by one.
# df3 holds df1's rows (in df1's order) plus df2's values under
# "new"-prefixed names.
df3 <- df1 %>%
  left_join(
    df2 %>%
      select(-country) %>%
      rename_at(vars(starts_with("var")), ~ paste0("new", .)),
    by = "id"
  )
for (v in grep("^var", colnames(df1), value = TRUE)) {
  # Keep df1's value where present, otherwise fall back to df2's.
  df3[[v]] <- coalesce(df3[[v]], df3[[paste0("new", v)]])
}
# Drop the helper columns for display; df3 itself still contains them.
select(df3, -starts_with("newvar"))
编辑:糟糕,刚刚意识到 "var" 列的类型并不一致。只有当所有列的类型都相同时,下面的答案才有效,因此它不适用于此处的数据。请使用前面的代码来保留类型。
如果重命名 df2
中的 "var" 变量,则可以对 df1
的 "var" 变量进行并排比较和重新分配。一种方法可能是使用 dplyr::mutate_if
和 starts_with("var")
,但这对您的数据提出了可能过于严格的要求。
我建议使用中间 "tall"(相对于 "wide")格式来处理 var1
到 var4
的问题;这样,如果您实际上有更多变量,则无需遍历每个变量。
假设:仅用 df2$id 就足以匹配,df2$country 是不必要的。
library(dplyr)
library(tidyr)
# Example tables built directly as tibbles. tibble() replaces the
# deprecated data_frame() constructor.
df1 <- tibble(
  id = 1:10,
  country = c("A", "A", "A", "A", "B", "B", "C", "C", "C", "C"),
  var1 = c(NA, NA, NA, NA, 1, 1, 2, 1, 2, 1),
  var2 = c(1, 1, 2, 2, NA, NA, 1, 2, 2, 2),
  var3 = c("NO", "YES", "NO", "YES", "NO", "NO", NA, NA, NA, NA),
  var4 = c(NA, NA, NA, NA, "NO", "NO", NA, NA, NA, NA)
)
df2 <- tibble(
  id = c(2, 3, 5, 6, 7, 8, 9, 10),
  country = c("A", "A", "B", "B", "C", "C", "C", "C"),
  var1 = c(1, 2, 2, 2, 2, 1, 2, 1),
  var2 = c(2, 1, 1, 1, 1, 2, 1, 1),
  var3 = c("NO", "NO", "YES", "NO", "NO", "NO", "YES", "NO"),
  var4 = c("YES", "NO", "NO", "YES", "YES", "NO", "NO", "YES")
)
# Long-format fill: reshape both tables to one row per (id, variable),
# match them on id and variable name, take df1's value unless it is NA,
# then reshape back to wide.
# NOTE: gathering var1..var4 into a single value column mixes numeric and
# text columns, so the filled columns come back as character.
df2 %>%
  select(-country) %>%
  gather(key = k, value = newv, -id) %>%
  right_join(
    df1 %>% gather(key = k, value = v, -id, -country),
    by = c("id", "k")
  ) %>%
  mutate(v = ifelse(is.na(v), newv, v)) %>%
  select(-newv) %>%
  spread(key = k, value = v)
# # A tibble: 10 × 6
# id country var1 var2 var3 var4
# * <dbl> <chr> <chr> <chr> <chr> <chr>
# 1 1 A <NA> 1 NO <NA>
# 2 2 A 1 1 YES YES
# 3 3 A 2 2 NO NO
# 4 4 A <NA> 2 YES <NA>
# 5 5 B 1 1 NO NO
# 6 6 B 1 1 NO NO
# 7 7 C 2 1 NO YES
# 8 8 C 1 2 NO NO
# 9 9 C 2 2 YES NO
# 10 10 C 1 2 NO YES
我有两个data-frames/tibbles。第一个是国家列表,其中包含描述这些国家的多个变量。该数据框包含多个缺失值。缺少哪些变量取决于国家/地区。
library(tidyverse)
# Example data: ten rows across countries A, B and C. Which of var1..var4
# are NA differs by country.
df1 <- as_tibble(
  data.frame(
    id = 1:10,
    country = rep(c("A", "B", "C"), times = c(4, 2, 4)),
    var1 = c(rep(NA, 4), 1, 1, 2, 1, 2, 1),
    var2 = c(1, 1, 2, 2, NA, NA, 1, 2, 2, 2),
    var3 = c("NO", "YES", "NO", "YES", "NO", "NO", rep(NA, 4)),
    var4 = c(rep(NA, 4), "NO", "NO", rep(NA, 4))
  )
)
然后我有第二个数据框 (df2),我想把它与第一个表连接起来:
# Second table: eight rows with no missing values in var1..var4.
df2 <- as_tibble(
  data.frame(
    id = c(2, 3, 5, 6, 7, 8, 9, 10),
    country = rep(c("A", "B", "C"), times = c(2, 2, 4)),
    var1 = c(1, 2, 2, 2, 2, 1, 2, 1),
    var2 = c(2, 1, 1, 1, 1, 2, 1, 1),
    var3 = c("NO", "NO", "YES", "NO", "NO", "NO", "YES", "NO"),
    var4 = c("YES", "NO", "NO", "YES", "YES", "NO", "NO", "YES")
  )
)
最后,我想得到的是第一个数据框,其中的缺失值用第二个数据框来补全。所以我想使用 id 变量连接这两个表。但是,这种连接应当只是 "部分"(partly)的,因为缺少哪些变量取决于国家/地区:例如对于国家 "A",只应填充变量 var1 和 var4;对于国家 "C",应使用 df2 填充变量 var3 和 var4。df1 包含的记录比 df2 多。
谁能告诉我哪个是解决该问题的最佳解决方案?
非常感谢!
这是针对所提供数据的一种可能解决方案。我在两个数据框中都添加了 stringsAsFactors = FALSE。看到数据后,我认为你需要的是按行绑定(bind)它们,而不是连接(join)它们。绑定数据后,我按 id、country 和 index 排序;index 表示该行来自哪个数据框。然后,我按 id 和 country 分组。对于有两行的组,第一行包含需要填充的 NA,这些 NA 位于四列(即 var1 至 var4)中。我使用 zoo 包中的 na.locf() 来完成填充。每个组的第一行来自 df1,也就是需要保留的行;这里我选择使用 distinct(),slice(1) 也是一种选择。
# Rebuild both example tables as plain data frames.
# stringsAsFactors = FALSE keeps the text columns as character vectors;
# it is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
df1 <- data.frame(
  id = 1:10,
  country = c("A", "A", "A", "A", "B", "B", "C", "C", "C", "C"),
  var1 = c(NA, NA, NA, NA, 1, 1, 2, 1, 2, 1),
  var2 = c(1, 1, 2, 2, NA, NA, 1, 2, 2, 2),
  var3 = c("NO", "YES", "NO", "YES", "NO", "NO", NA, NA, NA, NA),
  var4 = c(NA, NA, NA, NA, "NO", "NO", NA, NA, NA, NA),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  id = c(2, 3, 5, 6, 7, 8, 9, 10),
  country = c("A", "A", "B", "B", "C", "C", "C", "C"),
  var1 = c(1, 2, 2, 2, 2, 1, 2, 1),
  var2 = c(2, 1, 1, 1, 1, 2, 1, 1),
  var3 = c("NO", "NO", "YES", "NO", "NO", "NO", "YES", "NO"),
  var4 = c("YES", "NO", "NO", "YES", "YES", "NO", "NO", "YES"),
  stringsAsFactors = FALSE
)
library(dplyr)
library(zoo)
# Stack df1 on top of df2; `index` records which table each row came from.
# Within each id/country pair, sorted so the df1 row comes first, pull the
# later row's values up into the NA cells, then keep the first row of each
# group.
#
# A formula lambda replaces the deprecated funs(). na.rm = FALSE makes
# na.locf() return a vector of unchanged length even when a value is
# missing in every row of the group. ungroup() returns a plain tibble
# instead of leaving the result grouped by id and country.
bind_rows(df1, df2, .id = "index") %>%
  arrange(id, country, index) %>%
  group_by(id, country) %>%
  mutate_at(
    vars(var1:var4),
    ~ if (n() > 1) na.locf(., fromLast = TRUE, na.rm = FALSE) else .
  ) %>%
  distinct(id, .keep_all = TRUE) %>%
  select(-index) %>%
  ungroup()
id country var1 var2 var3 var4
<dbl> <chr> <dbl> <dbl> <chr> <chr>
1 1.00 A NA 1.00 NO <NA>
2 2.00 A 1.00 1.00 YES YES
3 3.00 A 2.00 2.00 NO NO
4 4.00 A NA 2.00 YES <NA>
5 5.00 B 1.00 1.00 NO NO
6 6.00 B 1.00 1.00 NO NO
7 7.00 C 2.00 1.00 NO YES
8 8.00 C 1.00 2.00 NO NO
9 9.00 C 2.00 2.00 YES NO
10 10.0 C 1.00 2.00 NO YES
更新后的建议:可以保留列的类型,但需要逐列手写一些代码。
# Type-preserving fill: attach df2's values to df1 under "new"-prefixed
# names, then take df1's value where present and df2's otherwise.
# Starting from df1 with left_join() keeps every df1 row in df1's own
# order. coalesce(x, y) is the direct form of if_else(is.na(x), y, x).
df1 %>%
  left_join(
    df2 %>%
      select(-country) %>%
      rename_at(vars(starts_with("var")), ~ paste0("new", .)),
    by = "id"
  ) %>%
  mutate(
    var1 = coalesce(var1, newvar1),
    var2 = coalesce(var2, newvar2),
    var3 = coalesce(var3, newvar3),
    var4 = coalesce(var4, newvar4)
  ) %>%
  select(-starts_with("newvar"))
另一种方法是(在管道之外)对相关列名进行循环:
# Same fill as a loop over the var* columns, so the column names do not
# have to be written out one by one.
# df3 holds df1's rows (in df1's order) plus df2's values under
# "new"-prefixed names.
df3 <- df1 %>%
  left_join(
    df2 %>%
      select(-country) %>%
      rename_at(vars(starts_with("var")), ~ paste0("new", .)),
    by = "id"
  )
for (v in grep("^var", colnames(df1), value = TRUE)) {
  # Keep df1's value where present, otherwise fall back to df2's.
  df3[[v]] <- coalesce(df3[[v]], df3[[paste0("new", v)]])
}
# Drop the helper columns for display; df3 itself still contains them.
select(df3, -starts_with("newvar"))
编辑:糟糕,刚刚意识到 "var" 列的类型并不一致。只有当所有列的类型都相同时,下面的答案才有效,因此它不适用于此处的数据。请使用前面的代码来保留类型。
如果重命名 df2
中的 "var" 变量,则可以对 df1
的 "var" 变量进行并排比较和重新分配。一种方法可能是使用 dplyr::mutate_if
和 starts_with("var")
,但这对您的数据提出了可能过于严格的要求。
我建议使用中间 "tall"(相对于 "wide")格式来处理 var1
到 var4
的问题;这样,如果您实际上有更多变量,则无需遍历每个变量。
假设:仅用 df2$id 就足以匹配,df2$country 是不必要的。
library(dplyr)
library(tidyr)
# Example tables built directly as tibbles. tibble() replaces the
# deprecated data_frame() constructor.
df1 <- tibble(
  id = 1:10,
  country = c("A", "A", "A", "A", "B", "B", "C", "C", "C", "C"),
  var1 = c(NA, NA, NA, NA, 1, 1, 2, 1, 2, 1),
  var2 = c(1, 1, 2, 2, NA, NA, 1, 2, 2, 2),
  var3 = c("NO", "YES", "NO", "YES", "NO", "NO", NA, NA, NA, NA),
  var4 = c(NA, NA, NA, NA, "NO", "NO", NA, NA, NA, NA)
)
df2 <- tibble(
  id = c(2, 3, 5, 6, 7, 8, 9, 10),
  country = c("A", "A", "B", "B", "C", "C", "C", "C"),
  var1 = c(1, 2, 2, 2, 2, 1, 2, 1),
  var2 = c(2, 1, 1, 1, 1, 2, 1, 1),
  var3 = c("NO", "NO", "YES", "NO", "NO", "NO", "YES", "NO"),
  var4 = c("YES", "NO", "NO", "YES", "YES", "NO", "NO", "YES")
)
# Long-format fill: reshape both tables to one row per (id, variable),
# match them on id and variable name, take df1's value unless it is NA,
# then reshape back to wide.
# NOTE: gathering var1..var4 into a single value column mixes numeric and
# text columns, so the filled columns come back as character.
df2 %>%
  select(-country) %>%
  gather(key = k, value = newv, -id) %>%
  right_join(
    df1 %>% gather(key = k, value = v, -id, -country),
    by = c("id", "k")
  ) %>%
  mutate(v = ifelse(is.na(v), newv, v)) %>%
  select(-newv) %>%
  spread(key = k, value = v)
# # A tibble: 10 × 6
# id country var1 var2 var3 var4
# * <dbl> <chr> <chr> <chr> <chr> <chr>
# 1 1 A <NA> 1 NO <NA>
# 2 2 A 1 1 YES YES
# 3 3 A 2 2 NO NO
# 4 4 A <NA> 2 YES <NA>
# 5 5 B 1 1 NO NO
# 6 6 B 1 1 NO NO
# 7 7 C 2 1 NO YES
# 8 8 C 1 2 NO NO
# 9 9 C 2 2 YES NO
# 10 10 C 1 2 NO YES