使用 Tidyverse，根据一个数据框某一列的元素筛选另一个数据框中的值，从而创建第三个数据框

Question

library(tidyverse)

下面提供的代码创建了三个数据框 - Main、LookUp 和 Final。我正在尝试使用 Main 和 LookUp 数据框来创建最终数据框。

例如，Final 表只保留 LookUp$Section_Lookup 中给出的编号所对应的 "Section"，同时也保留相应的 "Title" 变量。

我想尽可能多地使用 tidyverse。我的大部分尝试都类似于下面的代码。我的想法是，使用两层循环或 purrr 可以让我同时遍历 Main 和 LookUp 两张表。这比我平时做的要复杂一些，所以希望能得到一些关于如何处理这种情况的帮助。

# Pseudocode sketch of the attempted nested-map approach (not runnable R):
# for each element x of Main and each element y of LookUp, test whether the
# titles are equal and whether x's section contains y's section number.
New<-map(Main, function(x) {
map(LookUp, function(y) if_else(x$Title1==y$Title_Lookup & ...x$Section1 CONTAINS Y SECTION_LOOKUP... ) )}),

示例代码如下：

主数据框：

    # Example input: one row per record, holding its title and section code.
    Title1 <- c(
      "101A", "101A", "101A", "101A", "101A", "101A",
      "203S", "203S", "203S", "203S", "203S", "203S", "203S",
      "203S", "203S", "203S", "203S", "203S", "203S",
      "400B", "400B", "400B", "400B",
      "200A", "200A",
      "250D", "250D", "250D", "250D"
    )
    Section1 <- c(
      "2A", "2A", "2B", "2B", "2B", "2C",
      "2A", "2A", "4A", "4A", "4A", "4B", "4B",
      "4C", "4C", "4C", "4C", "4D", "4D",
      "2A", "2A", "2B", "2B",
      "2A", "6A",
      "1A", "1B", "2A", "2A"
    )
    # tibble() replaces the deprecated data_frame() constructor.
    Main <- tibble(Title1, Section1)

查找Table:

# Lookup table: each row pairs a title with one section number to keep.
Title_Lookup <- c("101A", "203S", "203S", "400B", "200A", "200A", "250D")
Section_Lookup <- c(2, 2, 4, 2, 2, 6, 2)
# tibble() replaces the deprecated data_frame() constructor.
LookUp <- tibble(Title_Lookup, Section_Lookup)

最终数据帧：

# Desired result: the distinct (title, section) pairs of Main whose section
# number is listed for that title in LookUp.
Section_Final <- c(
  "2A", "2B", "2C",
  "2A", "4A", "4B", "4C", "4D",
  "2A", "2B",
  "2A", "6A",
  "2A"
)
Title_Final <- c(
  "101A", "101A", "101A",
  "203S", "203S", "203S", "203S", "203S",
  "400B", "400B",
  "200A", "200A",
  "250D"
)
# tibble() replaces the deprecated data_frame() constructor.
Final <- tibble(Title_Final, Section_Final)

Answer 1

使用 tidyverse 的解决方案。str_replace 函数来自 stringr 包，它是 tidyverse 的一部分。如果只想加载 dplyr 包，可以使用 sub("\\D+$", "", Section1) 代替 str_replace。

library(tidyverse)
# Keep the rows of Main whose section number is listed for the same title in
# LookUp, rename the columns to match Final, and drop duplicate rows.
Main2 <- Main %>%
  # Strip the trailing non-digit suffix ("2A" -> 2) so the value can be matched
  # against the numeric Section_Lookup column. The backslash must be doubled
  # inside an R string literal; "\D" alone is a parse error.
  mutate(Number = as.numeric(str_replace(Section1, "\\D+$", ""))) %>%
  # semi_join() only filters Main; it adds no columns from LookUp.
  semi_join(
    LookUp,
    by = c("Title1" = "Title_Lookup", "Number" = "Section_Lookup")
  ) %>%
  select(Title_Final = Title1, Section_Final = Section1) %>%
  distinct()
Main2
# # A tibble: 13 x 2
#    Title_Final Section_Final
#    <chr>       <chr>        
#  1 101A        2A           
#  2 101A        2B           
#  3 101A        2C           
#  4 203S        2A           
#  5 203S        4A           
#  6 203S        4B           
#  7 203S        4C           
#  8 203S        4D           
#  9 400B        2A           
# 10 400B        2B           
# 11 200A        2A           
# 12 200A        6A           
# 13 250D        2A

Answer 2

这是一个基于sqldf包的解决方案，利用charindex()查看Section_Lookup中的字符串是否出现在Section1中。

library(tidyverse)
# sqldf() is called below, so the package has to be attached explicitly.
library(sqldf)

# Example input: one row per record, holding its title and section code.
Title1 <- c(
  "101A", "101A", "101A", "101A", "101A", "101A",
  "203S", "203S", "203S", "203S", "203S", "203S", "203S",
  "203S", "203S", "203S", "203S", "203S", "203S",
  "400B", "400B", "400B", "400B",
  "200A", "200A",
  "250D", "250D", "250D", "250D"
)
Section1 <- c(
  "2A", "2A", "2B", "2B", "2B", "2C",
  "2A", "2A", "4A", "4A", "4A", "4B", "4B",
  "4C", "4C", "4C", "4C", "4D", "4D",
  "2A", "2A", "2B", "2B",
  "2A", "6A",
  "1A", "1B", "2A", "2A"
)
# tibble() replaces the deprecated data_frame() constructor.
Main <- tibble(Title1, Section1)

# Lookup table. The section numbers are stored as character so that
# charindex() can search for them as substrings of Section1.
Title_Lookup <- c("101A", "203S", "203S", "400B", "200A", "200A", "250D")
Section_Lookup <- as.character(c(2, 2, 4, 2, 2, 6, 2))
LookUp <- tibble(Title_Lookup, Section_Lookup)

# Select the distinct (title, section) pairs of Main for which some LookUp row
# has the same title and a section number that occurs inside Section1.
sqlQuery <- "select distinct a.Title1 as Title, a.Section1 as Section 
                    from Main as a
             left join LookUp as b 
             where 
             a.Title1 = b.Title_Lookup and
             charindex(b.Section_Lookup,a.Section1) > 0"
sqldf(sqlQuery)

...和输出。

> sqldf(sqlQuery)
   Title Section
1   101A      2A
2   101A      2B
3   101A      2C
4   203S      2A
5   203S      4A
6   203S      4B
7   203S      4C
8   203S      4D
9   400B      2A
10  400B      2B
11  200A      2A
12  200A      6A
13  250D      2A
>

Answer 3

另一种方法是仅根据 Section 列进行匹配（注意：这种方法不考虑 Title，只要某个编号出现在 LookUp 中，所有 Title 下该编号的记录都会被保留）。

library(dplyr)

# Example input: 29 records with a name, a code, a title and a section code.
Name1 <- paste0("Name", 1:29)
Code <- c(
  10123, 13432, 34554, 45563, 43666, 54444, 55322, 52111, 33443, 88998, 54554,
  33455, 65889, 88888, 22344, 54455, 66655, 22222, 65564, 77677, 65545, 67765,
  34334, 88789, 76776, 67765, 55555, 65445, 65665
)
Title1 <- c(
  "101A", "101A", "101A", "101A", "101A", "101A",
  "203S", "203S", "203S", "203S", "203S", "203S", "203S",
  "203S", "203S", "203S", "203S", "203S", "203S",
  "400B", "400B", "400B", "400B",
  "200A", "200A",
  "250D", "250D", "250D", "250D"
)
Section1 <- c(
  "2A", "2A", "2B", "2B", "2B", "2C",
  "2A", "2A", "4A", "4A", "4A", "4B", "4B",
  "4C", "4C", "4C", "4C", "4D", "4D",
  "2A", "2A", "2B", "2B",
  "2A", "6A",
  "1A", "1B", "2A", "2A"
)
# tibble() replaces the deprecated data_frame() constructor.
Main <- tibble(Name1, Code, Title1, Section1)

# Lookup table: each row pairs a title with one section number to keep.
Title_Lookup <- c("101A", "203S", "203S", "400B", "200A", "200A", "250D")
Section_Lookup <- c(2, 2, 4, 2, 2, 6, 2)
LookUp <- tibble(Title_Lookup, Section_Lookup)

# Create a data.frame of the distinct section numbers.
df_sections <- distinct(LookUp, Section_Lookup) %>% as.data.frame()

# Keep the records whose leading section number appears in the lookup.
# Only the section number is compared here; Title1 is not part of the filter,
# so a number listed for any title keeps matching rows of every title.
# "\\1" (doubled backslash) is the regex backreference to the captured digits;
# a single-backslash "\1" would be the control character \001 instead.
filter(
  Main,
  as.numeric(sub("^([0-9]+).*", "\\1", Section1)) %in%
    df_sections$Section_Lookup
) %>%
  select(Title1, Section1) %>%
  distinct()

#The result:
# A tibble: 13 x 2
   Title1 Section1
    <chr>    <chr>
 1   101A       2A
 2   101A       2B
 3   101A       2C
 4   203S       2A
 5   203S       4A
 6   203S       4B
 7   203S       4C
 8   203S       4D
 9   400B       2A
10   400B       2B
11   200A       2A
12   200A       6A
13   250D       2A

使用 Tidyverse，根据一个数据框某一列的元素筛选另一个数据框中的值，从而创建第三个数据框

Using Elements of One Dataframe Column to Select Values in Another Dataframe to Create a Third Dataframe Using Tidyverse

r

tidyverse