使用混合数据连接多个表

Question

我的数据：

# Main table: `from` and `to` hold numeric ids, `values` is the measure.
data1 <- data.frame(
  from = c(1, 2, 13, 4),
  to = c(4, 3, 9, 1),
  values = c(12, 56, 67, 78)
)

# Lookup table pairing each place name with its numeric id.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id = c(1, 2, 3, 4)
)

我的结果：

from to values
   1  4     12
   2  3     56
  13  9     67
   4  1     78

  place id
     NY  1
 London  2
  Brest  3
 Nantes  4

我希望使用 dplyr 包中的 join 函数得到如下结果（放在一个新的 table 中）：

from      to      values
NY     Nantes     12
London  Brest     56
Nantes     NY     78

我尝试了什么：

 # Attempt: a single inner_join of data1 against data2 that maps both
 # `from` and `to` onto the same `id` column of data2.
 # NOTE(review): this presumably requires one data2 row to satisfy both
 # keys at once, so `from` and `to` cannot be translated independently.
 data3<- inner_join (data1, data2, by =c("from" = "id", "to" = "id"))
 data3

一些参考资料：
https://stat545-ubc.github.io/bit001_dplyr-cheatsheet.html
https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html

一个更大的例子混合数据

假设我有 50 列包含地理数据 ("places") 和非地理数据（级别、值）
我不想更改 d.f. 的列顺序
我想保留我的列名

   # Wider example: five id columns (the *place columns) interleaved with
   # non-geographic columns (levels1, value1, value2, value3).
   data1 <- data.frame(
     levels1 = c("name1", "name2", "name3", "name4"),
     value1 = c(4, 3, 9, 1),
     firstplace = c(1, 2, 13, 4),
     secondplace = c(1, 2, 2, 4),
     value2 = c(78, 3000, 90, 101),
     thirdplace = c(1, 1, 2, 4),
     fourthplace = c(4, 4, 4, 4),
     fifthplace = c(1, 2, 3, 4),
     value3 = c(12, 56, 67, 78)
   )

   # Lookup table pairing each place name with its numeric id.
   data2 <- data.frame(
     place = c("NY", "London", "Brest", "Nantes"),
     id = c(1, 2, 3, 4)
   )

一个不同名称的例子（更复杂？）

我不想更改我的 d.f 的列顺序。
我想保留我的列名

   # Same layout as the wider example, but the id columns no longer share
   # a common naming pattern (shops, after_sales_service, provider, ...).
   data1 <- data.frame(
     levels1 = c("name1", "name2", "name3", "name4"),
     value1 = c(4, 3, 9, 1),
     shops = c(1, 2, 13, 4),
     after_sales_service = c(1, 2, 2, 4),
     value2 = c(78, 3000, 90, 101),
     provider = c(1, 1, 2, 4),
     seller = c(4, 4, 4, 4),
     maker = c(1, 2, 3, 4),
     value3 = c(12, 56, 67, 78)
   )

   # Lookup table pairing each place name with its numeric id.
   data2 <- data.frame(
     place = c("NY", "London", "Brest", "Nantes"),
     id = c(1, 2, 3, 4)
   )

Answer 1

您可以把 data2 当作查找表（lookup table）来使用，而不必连接（join）两个 table:

library(dplyr)
# Use data2 as a lookup table: match() gives the position of each id in
# data2$id, and that position picks the corresponding place name.
# Ids with no match become NA; rows containing any NA are then dropped.
data1 <- data1 %>%
  mutate(
    from = data2[["place"]][match(from, data2[["id"]])],
    to = data2[["place"]][match(to, data2[["id"]])]
  ) %>%
  filter(., complete.cases(.))

给出：

> data1
    from     to values
1     NY Nantes     12
2 London  Brest     56
3 Nantes     NY     78

data.table 包的替代解决方案：

library(data.table)
# Same lookup idea with data.table: `:=` overwrites `from` and `to` with the
# place names found via match(), and na.omit() drops rows containing NA.
# NOTE(review): setDT() and `:=` operate on data1 by reference — confirm
# that modifying the original object is intended.
na.omit(setDT(data1)[, `:=` (from = data2$place[match(from, data2$id)],
                             to = data2$place[match(to, data2$id)])])

你也可以做一个双 left_join:

# Join data2 twice: once to translate `from`, once to translate `to`.
# The second join suffixes the duplicated `place` column, so the name for
# `from` ends up in `place.x` and the name for `to` in `place.y`.
data1 %>%
  left_join(data2, by = c("from" = "id")) %>%
  left_join(data2, by = c("to" = "id")) %>%
  # Select by name and rename so the result has the requested
  # from / to / values layout instead of values / place.x / place.y.
  select(from = place.x, to = place.y, values) %>%
  # Drop rows whose id was not found in data2.
  filter(complete.cases(.))

更新 1： 如果您有多个列都需要匹配名称，最好先将数据框转换为长格式。一个更大数据集的例子：

library(dplyr)
library(tidyr)
# Reshape to long form so every id sits in a single column, translate it
# with one join, drop unmatched ids, then reshape back to one column per
# original variable.
data1 %>%
  gather(key = var, value = val, -values) %>%
  left_join(data2, by = c("val" = "id")) %>%
  select(-val) %>%
  filter(!is.na(place)) %>%
  spread(key = var, value = place)

给出：

  values fifthplace firstplace fourthplace   from secondplace thirdplace     to
1     12         NY         NY      Nantes     NY          NY         NY Nantes
2     56     London     London      Nantes London      London         NY  Brest
3     67      Brest       <NA>      Nantes   <NA>      London     London   <NA>
4     78     Nantes     Nantes      Nantes Nantes      Nantes     Nantes     NY

使用 data.table 你可以：

library(data.table)
# melt() stacks every column except `values` into long form (columns
# `variable` and `value`), the `[data2, on = ...]` join attaches the place
# name for each id, and dcast() spreads the result back to wide form.
# NOTE(review): the join is driven by data2, so ids absent from data2
# presumably drop out of the long table and appear as NA after dcast().
dcast(melt(setDT(data1),
           id.vars = "values")[data2, on = c(value="id")],
      values ~ variable, value.var = "place")

给你同样的结果。

更新 2: 针对问题的第二次更新，您可以使用 dplyr / tidyr:

# Gather only the five *place columns; every other column is carried along
# unchanged. Each id is translated with one join and the place names are
# spread back into their original columns.
data1 %>%
  gather(key = var, value = val,
         firstplace, secondplace, thirdplace, fourthplace, fifthplace) %>%
  left_join(data2, by = c("val" = "id")) %>%
  select(-val) %>%
  spread(key = var, value = place)

给出：

  levels1 value1 value2 value3 fifthplace firstplace fourthplace secondplace thirdplace
1   name1      4     78     12         NY         NY      Nantes          NY         NY
2   name2      3   3000     56     London     London      Nantes      London         NY
3   name3      9     90     67      Brest       <NA>      Nantes      London     London
4   name4      1    101     78     Nantes     Nantes      Nantes      Nantes     Nantes

或者用data.table:

# Same reshape-join-reshape approach with data.table: `mvars` names the id
# columns to melt, and the remaining columns form the left-hand side of the
# dcast() formula so they are carried through unchanged.
mvars <- c("firstplace","secondplace","thirdplace","fourthplace","fifthplace")
dcast(melt(setDT(data1),
           measure.vars = mvars)[data2, on = c(value="id")],
      levels1 + value1 + value2 + value3 ~ variable, value.var = "place")

结果相同：

   levels1 value1 value2 value3 firstplace secondplace thirdplace fourthplace fifthplace
1:   name1      4     78     12         NY          NY         NY      Nantes         NY
2:   name2      3   3000     56     London      London         NY      Nantes     London
3:   name3      9     90     67         NA      London     London      Nantes      Brest
4:   name4      1    101     78     Nantes      Nantes     Nantes      Nantes     Nantes

更新3：如果你想使用索引号，你可以这样做：

# dplyr / tidyr
# The id columns are selected by position (3, 4, 6, 7, 8) instead of by
# name, so this depends on the exact column order of data1.
data1 %>%
  gather(var, val, c(3,4,6:8)) %>%
  left_join(., data2, by = c("val"="id")) %>%
  select(-val) %>%
  spread(var, place)

# data.table
# Same positional selection, passed to melt() as measure.vars.
dcast(melt(setDT(data1),
           measure.vars = c(3,4,6:8))[data2, on = c(value="id")],
      levels1 + value1 + value2 + value3 ~ variable, value.var = "place")

这给出了（data.table 输出）：

   levels1 value1 value2 value3  shops after_sales_service provider seller  maker
1:   name1      4     78     12     NY                  NY       NY Nantes     NY
2:   name2      3   3000     56 London              London       NY Nantes London
3:   name3      9     90     67     NA              London   London Nantes  Brest
4:   name4      1    101     78 Nantes              Nantes   Nantes Nantes Nantes

Answer 2

我们可以用 base R

# Base R version: merge data2 in twice, first on `from`, then on `to`.
# The second merge suffixes the duplicated `place` column, leaving the name
# for `from` in `place.x` and the name for `to` in `place.y`.
# Columns are picked by name and relabelled so the output has the requested
# from / to / values layout rather than values / place.x / place.y.
setNames(
  merge(merge(data1, data2, by.x = "from", by.y = "id"),
        data2, by.x = "to", by.y = "id")[c("place.x", "place.y", "values")],
  c("from", "to", "values")
)

更新

对于您的新数据集，我们可以先去掉 'values' 列，将其余各列转换为 matrix，然后使用 match

# Translate every column except the last one in a single pass:
# as.matrix() flattens those columns, match() looks each id up in data2$id,
# and `dim<-` restores the original row/column shape before the result is
# converted back to a data frame.
# NOTE(review): assumes the last column of data1 is the one to keep
# untranslated (here `values`) — confirm for other data sets.
temp <- as.data.frame(`dim<-`(as.character(data2$place[
          match(as.matrix(data1[-ncol(data1)]),  data2$id)]),
                        dim(data1[-ncol(data1)])))
# Restore the original column names (all names except the last one).
names(temp) <- head(names(data1),-1)
# Put the untranslated last column first, followed by the translated ones.
cbind(data1[ncol(data1)], temp)
#  values   from     to firstplace secondplace thirdplace fourthplace fifthplace
#1     12     NY Nantes         NY          NY         NY      Nantes         NY
#2     56 London  Brest     London      London         NY      Nantes     London
#3     67   <NA>   <NA>       <NA>      London     London      Nantes      Brest
#4     78 Nantes     NY     Nantes      Nantes     Nantes      Nantes     Nantes

更新2

基于 OP 帖子中的新更新

# Positions of the id columns, found by the substring 'place' in their names.
i1 <- grep('place', names(data1))
# Translate all id columns at once: flatten them to a matrix, look every id
# up in data2$id with match(), then restore the original dimensions via
# `dim<-` before converting back to a data frame.
d1 <- as.data.frame(`dim<-`(as.character(data2$place[
        match(as.matrix(data1[i1]), data2$id)]), 
          dim(data1[i1])), stringsAsFactors=FALSE)
# Untouched columns first, then the translated columns renamed to
# place1, place2, ...
d2 <- cbind(data1[-i1], setNames(d1, paste0('place', 1:ncol(d1))))
d2
#   levels1 value1 value2 value3 place1 place2 place3 place4 place5
#1   name1      4     78     12     NY     NY     NY Nantes     NY
#2   name2      3   3000     56 London London     NY Nantes London
#3   name3      9     90     67   <NA> London London Nantes  Brest
#4   name4      1    101     78 Nantes Nantes Nantes Nantes Nantes

更新3

如果列名不同，只需更改第2步

 # Keep the original names of the translated columns instead of renaming
 # them to place1, place2, ...
 d2 <- cbind(data1[-i1], setNames(d1, names(data1)[i1]))

使用混合数据连接多个表

Joining multiple tables with mix data

r

sqldf

dplyr

更新

更新2

更新3