使用混合数据连接多个表
Joining multiple tables with mixed data
我的数据:
# Example links table: `from` and `to` hold numeric place ids, `values` is
# the payload carried by each row.
data1 <- data.frame(
  from   = c(1, 2, 13, 4),
  to     = c(4, 3, 9, 1),
  values = c(12, 56, 67, 78)
)
# Lookup table pairing each place name with its numeric id.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id    = c(1, 2, 3, 4)
)
我的结果:
from to values
1 4 12
2 3 56
13 9 67
4 1 78
place id
NY 1
London 2
Brest 3
Nantes 4
我期望使用 dplyr 包中的 join 函数(在新 table 中)
from to values
NY Nantes 12
London Brest 56
Nantes NY 78
我尝试了什么:
# Attempt from the question: both `from` and `to` are keyed to the same `id`
# column in a single join, so the two columns cannot be looked up
# independently of each other.
data3<- inner_join (data1, data2, by =c("from" = "id", "to" = "id"))
data3
一些参考资料:
https://stat545-ubc.github.io/bit001_dplyr-cheatsheet.html
https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html
一个更大的例子混合数据
假设我有 50 列包含地理数据 ("places") 和非地理数据(级别、值)
我不想更改 d.f. 的列顺序
我想保留我的列名
# Wider example: the id columns (*place) are interleaved with columns that
# must not be translated (levels1, value1, value2, value3).
data1 <- data.frame(
  levels1     = c("name1", "name2", "name3", "name4"),
  value1      = c(4, 3, 9, 1),
  firstplace  = c(1, 2, 13, 4),
  secondplace = c(1, 2, 2, 4),
  value2      = c(78, 3000, 90, 101),
  thirdplace  = c(1, 1, 2, 4),
  fourthplace = c(4, 4, 4, 4),
  fifthplace  = c(1, 2, 3, 4),
  value3      = c(12, 56, 67, 78)
)
# Lookup table pairing each place name with its numeric id.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id    = c(1, 2, 3, 4)
)
一个不同名称的例子(更复杂?)
我不想更改我的 d.f 的列顺序。
我想保留我的列名
# Same layout as the wider example, but the id columns no longer share a
# common name pattern (shops, after_sales_service, provider, seller, maker).
data1 <- data.frame(
  levels1             = c("name1", "name2", "name3", "name4"),
  value1              = c(4, 3, 9, 1),
  shops               = c(1, 2, 13, 4),
  after_sales_service = c(1, 2, 2, 4),
  value2              = c(78, 3000, 90, 101),
  provider            = c(1, 1, 2, 4),
  seller              = c(4, 4, 4, 4),
  maker               = c(1, 2, 3, 4),
  value3              = c(12, 56, 67, 78)
)
# Lookup table pairing each place name with its numeric id.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id    = c(1, 2, 3, 4)
)
您可以把 data2 当作查找表使用,而不必对两个表做连接(join):
library(dplyr)
# match() gives the row of data2 whose id equals each value; indexing the
# place column with it translates ids to names. Ids without a match become
# NA, and complete.cases() then drops those rows.
data1 <- data1 %>%
  mutate(
    from = data2[["place"]][match(from, data2[["id"]])],
    to   = data2[["place"]][match(to, data2[["id"]])]
  ) %>%
  filter(complete.cases(.))
给出:
> data1
from to values
1 NY Nantes 12
2 London Brest 56
3 Nantes NY 78
data.table 包的替代解决方案:
library(data.table)
# setDT() converts data1 to a data.table by reference; `:=` then overwrites
# both id columns with the matching place names (NA when the id is unknown).
setDT(data1)
data1[, c("from", "to") := list(
  data2$place[match(from, data2$id)],
  data2$place[match(to, data2$id)]
)]
# Show only the rows in which every lookup succeeded.
na.omit(data1)
你也可以连续做两次 left_join:
# Join the lookup table twice, once per id column. Both joins bring in a
# `place` column, which dplyr disambiguates as place.x (for `from`) and
# place.y (for `to`).
data1 %>%
  left_join(data2, by = c("from" = "id")) %>%
  left_join(data2, by = c("to" = "id")) %>%
  # Return the names and column order asked for in the question
  # (from, to, values) rather than values, place.x, place.y.
  select(from = place.x, to = place.y, values) %>%
  # Rows whose id has no match in data2 carry NA and are dropped.
  filter(complete.cases(.))
更新 1: 如果您有多个名称必须匹配的列,最好先将数据框转换为长格式。一个更大数据集的例子:
library(dplyr)
library(tidyr)
# Reshape to long form so that one join translates every id column, drop the
# numeric id and the unmatched rows, then spread back to one column per
# original variable.
data1 %>%
  gather(key = var, value = val, -values) %>%
  left_join(data2, by = c("val" = "id")) %>%
  select(-val) %>%
  filter(!is.na(place)) %>%
  spread(key = var, value = place)
给出:
values fifthplace firstplace fourthplace from secondplace thirdplace to
1 12 NY NY Nantes NY NY NY Nantes
2 56 London London Nantes London London NY Brest
3 67 Brest <NA> Nantes <NA> London London <NA>
4 78 Nantes Nantes Nantes Nantes Nantes Nantes NY
使用 data.table 你可以:
library(data.table)
# Melt to long form keyed by `values`, join the lookup table on the melted
# id, then cast back to one column of place names per original variable.
dcast(
  melt(setDT(data1), id.vars = "values")[data2, on = c(value = "id")],
  formula = values ~ variable,
  value.var = "place"
)
给你同样的结果。
更新 2: 针对问题的第二次更新,您可以使用 dplyr / tidyr:
# Only the id columns are gathered; every other column stays as a row
# identifier and is carried through unchanged.
data1 %>%
  gather(
    key = var, value = val,
    firstplace, secondplace, thirdplace, fourthplace, fifthplace
  ) %>%
  left_join(data2, by = c("val" = "id")) %>%
  select(-val) %>%
  spread(key = var, value = place)
给出:
levels1 value1 value2 value3 fifthplace firstplace fourthplace secondplace thirdplace
1 name1 4 78 12 NY NY Nantes NY NY
2 name2 3 3000 56 London London Nantes London NY
3 name3 9 90 67 Brest <NA> Nantes London London
4 name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
或者用data.table:
# Names of the columns that hold place ids and must be translated.
mvars <- paste0(c("first", "second", "third", "fourth", "fifth"), "place")
# Melt only those columns, join the lookup table on the melted id, then cast
# back with the remaining columns as row identifiers.
dcast(
  melt(setDT(data1), measure.vars = mvars)[data2, on = c(value = "id")],
  formula = levels1 + value1 + value2 + value3 ~ variable,
  value.var = "place"
)
结果相同:
levels1 value1 value2 value3 firstplace secondplace thirdplace fourthplace fifthplace
1: name1 4 78 12 NY NY NY Nantes NY
2: name2 3 3000 56 London London NY Nantes London
3: name3 9 90 67 NA London London Nantes Brest
4: name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
更新3:如果你想使用索引号,你可以这样做:
# dplyr / tidyr: select the id columns by position (3, 4, 6, 7, 8)
data1 %>%
  gather(key = var, value = val, c(3, 4, 6:8)) %>%
  left_join(data2, by = c("val" = "id")) %>%
  select(-val) %>%
  spread(key = var, value = place)
# data.table: the same positions passed as measure.vars
dcast(
  melt(setDT(data1), measure.vars = c(3, 4, 6:8))[data2, on = c(value = "id")],
  formula = levels1 + value1 + value2 + value3 ~ variable,
  value.var = "place"
)
这给出了(data.table 输出):
levels1 value1 value2 value3 shops after_sales_service provider seller maker
1: name1 4 78 12 NY NY NY Nantes NY
2: name2 3 3000 56 London London NY Nantes London
3: name3 9 90 67 NA London London Nantes Brest
4: name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
我们可以用 base R
# Inner-merge the lookup table twice, once per id column; rows with an id
# that is missing from data2 are dropped by the merge itself. The two
# `place` columns come back as place.x (for `from`) and place.y (for `to`),
# so rename and reorder them into the from/to/values table asked for in the
# question instead of returning values, place.x, place.y.
setNames(
  merge(
    merge(data1, data2, by.x = "from", by.y = "id"),
    data2,
    by.x = "to", by.y = "id"
  )[c("place.x", "place.y", "values")],
  c("from", "to", "values")
)
更新
对于您的新数据集,我们可以先去掉 'values' 列,把其余部分转换为 matrix,然后使用 match:
# Translate every column except the last one in a single vectorised match():
# the id columns are flattened through as.matrix(), looked up in data2, and
# reshaped to the original rows x columns layout.
temp <- local({
  id_cols <- data1[-ncol(data1)]
  hits <- match(as.matrix(id_cols), data2$id)
  as.data.frame(
    matrix(
      as.character(data2$place[hits]),
      nrow = nrow(id_cols),
      ncol = ncol(id_cols)
    )
  )
})
# Restore the original column names, then put the last column back in front.
names(temp) <- names(data1)[-ncol(data1)]
cbind(data1[ncol(data1)], temp)
# values from to firstplace secondplace thirdplace fourthplace fifthplace
#1 12 NY Nantes NY NY NY Nantes NY
#2 56 London Brest London London NY Nantes London
#3 67 <NA> <NA> <NA> London London Nantes Brest
#4 78 Nantes NY Nantes Nantes Nantes Nantes Nantes
更新2
基于 OP 帖子中的新更新:
# Positions of the id columns, found by name.
i1 <- grep("place", names(data1))
# Translate all id columns in one vectorised match(), then restore the
# rows x columns shape of the selected block.
d1 <- as.data.frame(
  matrix(
    as.character(data2$place[match(as.matrix(data1[i1]), data2$id)]),
    nrow = nrow(data1),
    ncol = length(i1)
  ),
  stringsAsFactors = FALSE
)
# setdiff() instead of data1[-i1]: a negative zero-length index selects no
# columns at all, so every untouched column would vanish when nothing
# matches. seq_len() likewise avoids 1:0 yielding c(1, 0).
d2 <- cbind(
  data1[setdiff(seq_along(data1), i1)],
  setNames(d1, paste0("place", seq_len(ncol(d1))))
)
d2
# levels1 value1 value2 value3 place1 place2 place3 place4 place5
#1 name1 4 78 12 NY NY NY Nantes NY
#2 name2 3 3000 56 London London NY Nantes London
#3 name3 9 90 67 <NA> London London Nantes Brest
#4 name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
更新3
如果列名不同,只需更改第2步
# Give the translated columns their original names. setdiff() keeps the
# untouched columns even when i1 is empty, where data1[-i1] would drop
# every column.
d2 <- cbind(
  data1[setdiff(seq_along(data1), i1)],
  setNames(d1, names(data1)[i1])
)
我的数据:
# Example links table: `from` and `to` hold numeric place ids, `values` is
# the payload carried by each row.
data1 <- data.frame(
  from   = c(1, 2, 13, 4),
  to     = c(4, 3, 9, 1),
  values = c(12, 56, 67, 78)
)
# Lookup table pairing each place name with its numeric id.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id    = c(1, 2, 3, 4)
)
我的结果:
from to values
1 4 12
2 3 56
13 9 67
4 1 78
place id
NY 1
London 2
Brest 3
Nantes 4
我期望使用 dplyr 包中的 join 函数(在新 table 中)
from to values
NY Nantes 12
London Brest 56
Nantes NY 78
我尝试了什么:
# Attempt from the question: both `from` and `to` are keyed to the same `id`
# column in a single join, so the two columns cannot be looked up
# independently of each other.
data3<- inner_join (data1, data2, by =c("from" = "id", "to" = "id"))
data3
一些参考资料:
https://stat545-ubc.github.io/bit001_dplyr-cheatsheet.html
https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html
一个更大的例子混合数据
假设我有 50 列包含地理数据 ("places") 和非地理数据(级别、值)
我不想更改 d.f. 的列顺序
我想保留我的列名
# Wider example: the id columns (*place) are interleaved with columns that
# must not be translated (levels1, value1, value2, value3).
data1 <- data.frame(
  levels1     = c("name1", "name2", "name3", "name4"),
  value1      = c(4, 3, 9, 1),
  firstplace  = c(1, 2, 13, 4),
  secondplace = c(1, 2, 2, 4),
  value2      = c(78, 3000, 90, 101),
  thirdplace  = c(1, 1, 2, 4),
  fourthplace = c(4, 4, 4, 4),
  fifthplace  = c(1, 2, 3, 4),
  value3      = c(12, 56, 67, 78)
)
# Lookup table pairing each place name with its numeric id.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id    = c(1, 2, 3, 4)
)
一个不同名称的例子(更复杂?)
我不想更改我的 d.f 的列顺序。
我想保留我的列名
# Same layout as the wider example, but the id columns no longer share a
# common name pattern (shops, after_sales_service, provider, seller, maker).
data1 <- data.frame(
  levels1             = c("name1", "name2", "name3", "name4"),
  value1              = c(4, 3, 9, 1),
  shops               = c(1, 2, 13, 4),
  after_sales_service = c(1, 2, 2, 4),
  value2              = c(78, 3000, 90, 101),
  provider            = c(1, 1, 2, 4),
  seller              = c(4, 4, 4, 4),
  maker               = c(1, 2, 3, 4),
  value3              = c(12, 56, 67, 78)
)
# Lookup table pairing each place name with its numeric id.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id    = c(1, 2, 3, 4)
)
您可以把 data2 当作查找表使用,而不必对两个表做连接(join):
library(dplyr)
# match() gives the row of data2 whose id equals each value; indexing the
# place column with it translates ids to names. Ids without a match become
# NA, and complete.cases() then drops those rows.
data1 <- data1 %>%
  mutate(
    from = data2[["place"]][match(from, data2[["id"]])],
    to   = data2[["place"]][match(to, data2[["id"]])]
  ) %>%
  filter(complete.cases(.))
给出:
> data1
from to values
1 NY Nantes 12
2 London Brest 56
3 Nantes NY 78
data.table 包的替代解决方案:
library(data.table)
# setDT() converts data1 to a data.table by reference; `:=` then overwrites
# both id columns with the matching place names (NA when the id is unknown).
setDT(data1)
data1[, c("from", "to") := list(
  data2$place[match(from, data2$id)],
  data2$place[match(to, data2$id)]
)]
# Show only the rows in which every lookup succeeded.
na.omit(data1)
你也可以连续做两次 left_join:
# Join the lookup table twice, once per id column. Both joins bring in a
# `place` column, which dplyr disambiguates as place.x (for `from`) and
# place.y (for `to`).
data1 %>%
  left_join(data2, by = c("from" = "id")) %>%
  left_join(data2, by = c("to" = "id")) %>%
  # Return the names and column order asked for in the question
  # (from, to, values) rather than values, place.x, place.y.
  select(from = place.x, to = place.y, values) %>%
  # Rows whose id has no match in data2 carry NA and are dropped.
  filter(complete.cases(.))
更新 1: 如果您有多个名称必须匹配的列,最好先将数据框转换为长格式。一个更大数据集的例子:
library(dplyr)
library(tidyr)
# Reshape to long form so that one join translates every id column, drop the
# numeric id and the unmatched rows, then spread back to one column per
# original variable.
data1 %>%
  gather(key = var, value = val, -values) %>%
  left_join(data2, by = c("val" = "id")) %>%
  select(-val) %>%
  filter(!is.na(place)) %>%
  spread(key = var, value = place)
给出:
values fifthplace firstplace fourthplace from secondplace thirdplace to
1 12 NY NY Nantes NY NY NY Nantes
2 56 London London Nantes London London NY Brest
3 67 Brest <NA> Nantes <NA> London London <NA>
4 78 Nantes Nantes Nantes Nantes Nantes Nantes NY
使用 data.table 你可以:
library(data.table)
# Melt to long form keyed by `values`, join the lookup table on the melted
# id, then cast back to one column of place names per original variable.
dcast(
  melt(setDT(data1), id.vars = "values")[data2, on = c(value = "id")],
  formula = values ~ variable,
  value.var = "place"
)
给你同样的结果。
更新 2: 针对问题的第二次更新,您可以使用 dplyr / tidyr:
# Only the id columns are gathered; every other column stays as a row
# identifier and is carried through unchanged.
data1 %>%
  gather(
    key = var, value = val,
    firstplace, secondplace, thirdplace, fourthplace, fifthplace
  ) %>%
  left_join(data2, by = c("val" = "id")) %>%
  select(-val) %>%
  spread(key = var, value = place)
给出:
levels1 value1 value2 value3 fifthplace firstplace fourthplace secondplace thirdplace
1 name1 4 78 12 NY NY Nantes NY NY
2 name2 3 3000 56 London London Nantes London NY
3 name3 9 90 67 Brest <NA> Nantes London London
4 name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
或者用data.table:
# Names of the columns that hold place ids and must be translated.
mvars <- paste0(c("first", "second", "third", "fourth", "fifth"), "place")
# Melt only those columns, join the lookup table on the melted id, then cast
# back with the remaining columns as row identifiers.
dcast(
  melt(setDT(data1), measure.vars = mvars)[data2, on = c(value = "id")],
  formula = levels1 + value1 + value2 + value3 ~ variable,
  value.var = "place"
)
结果相同:
levels1 value1 value2 value3 firstplace secondplace thirdplace fourthplace fifthplace
1: name1 4 78 12 NY NY NY Nantes NY
2: name2 3 3000 56 London London NY Nantes London
3: name3 9 90 67 NA London London Nantes Brest
4: name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
更新3:如果你想使用索引号,你可以这样做:
# dplyr / tidyr: select the id columns by position (3, 4, 6, 7, 8)
data1 %>%
  gather(key = var, value = val, c(3, 4, 6:8)) %>%
  left_join(data2, by = c("val" = "id")) %>%
  select(-val) %>%
  spread(key = var, value = place)
# data.table: the same positions passed as measure.vars
dcast(
  melt(setDT(data1), measure.vars = c(3, 4, 6:8))[data2, on = c(value = "id")],
  formula = levels1 + value1 + value2 + value3 ~ variable,
  value.var = "place"
)
这给出了(data.table 输出):
levels1 value1 value2 value3 shops after_sales_service provider seller maker
1: name1 4 78 12 NY NY NY Nantes NY
2: name2 3 3000 56 London London NY Nantes London
3: name3 9 90 67 NA London London Nantes Brest
4: name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
我们可以用 base R
# Inner-merge the lookup table twice, once per id column; rows with an id
# that is missing from data2 are dropped by the merge itself. The two
# `place` columns come back as place.x (for `from`) and place.y (for `to`),
# so rename and reorder them into the from/to/values table asked for in the
# question instead of returning values, place.x, place.y.
setNames(
  merge(
    merge(data1, data2, by.x = "from", by.y = "id"),
    data2,
    by.x = "to", by.y = "id"
  )[c("place.x", "place.y", "values")],
  c("from", "to", "values")
)
更新
对于您的新数据集,我们可以先去掉 'values' 列,把其余部分转换为 matrix,然后使用 match:
# Translate every column except the last one in a single vectorised match():
# the id columns are flattened through as.matrix(), looked up in data2, and
# reshaped to the original rows x columns layout.
temp <- local({
  id_cols <- data1[-ncol(data1)]
  hits <- match(as.matrix(id_cols), data2$id)
  as.data.frame(
    matrix(
      as.character(data2$place[hits]),
      nrow = nrow(id_cols),
      ncol = ncol(id_cols)
    )
  )
})
# Restore the original column names, then put the last column back in front.
names(temp) <- names(data1)[-ncol(data1)]
cbind(data1[ncol(data1)], temp)
# values from to firstplace secondplace thirdplace fourthplace fifthplace
#1 12 NY Nantes NY NY NY Nantes NY
#2 56 London Brest London London NY Nantes London
#3 67 <NA> <NA> <NA> London London Nantes Brest
#4 78 Nantes NY Nantes Nantes Nantes Nantes Nantes
更新2
基于 OP 帖子中的新更新:
i1 <- grep('place', names(data1))
# Translate all id columns (positions in i1) in one vectorised match(), then
# restore the rows x columns shape of the selected block.
d1 <- as.data.frame(
  matrix(
    as.character(data2$place[match(as.matrix(data1[i1]), data2$id)]),
    nrow = nrow(data1),
    ncol = length(i1)
  ),
  stringsAsFactors = FALSE
)
# setdiff() instead of data1[-i1]: a negative zero-length index selects no
# columns at all, so every untouched column would vanish when i1 is empty.
# seq_len() likewise avoids 1:0 yielding c(1, 0).
d2 <- cbind(
  data1[setdiff(seq_along(data1), i1)],
  setNames(d1, paste0("place", seq_len(ncol(d1))))
)
d2
# levels1 value1 value2 value3 place1 place2 place3 place4 place5
#1 name1 4 78 12 NY NY NY Nantes NY
#2 name2 3 3000 56 London London NY Nantes London
#3 name3 9 90 67 <NA> London London Nantes Brest
#4 name4 1 101 78 Nantes Nantes Nantes Nantes Nantes
更新3
如果列名不同,只需更改第2步
# Give the translated columns their original names. setdiff() keeps the
# untouched columns even when i1 is empty, where data1[-i1] would drop
# every column.
d2 <- cbind(
  data1[setdiff(seq_along(data1), i1)],
  setNames(d1, names(data1)[i1])
)