使用 Tidyverse,根据一个数据框某一列中的元素选取另一个数据框中的值,从而创建第三个数据框

Using Elements of One Dataframe Column to Select Values in Another Dataframe to Create a Third Dataframe Using Tidyverse

library(tidyverse)

下面提供的代码创建了三个数据框 - Main、LookUp 和 Final。我正在尝试使用 Main 和 LookUp 数据框来创建最终数据框。

例如,Final 表只保留 LookUp$Section_Lookup 中给出编号的 "Section",同时也保留相应的 "Title" 变量。

我希望尽可能多地使用 tidyverse。我的大部分尝试都类似于下面的代码。我原本设想用两层循环或 purrr 同时遍历 Main 和 LookUp 两个表。这比我平时处理的问题更复杂,所以希望得到一些关于如何着手处理这种情况的帮助。

# Pseudocode sketch of the attempted nested-map approach; this is not runnable R:
# the `... CONTAINS ...` condition is a placeholder and if_else() is given no
# true/false values.
New<-map(Main, function(x) {
map(LookUp, function(y) if_else(x$Title1==y$Title_Lookup & ...x$Section1 CONTAINS Y SECTION_LOOKUP... ) )}),

示例代码如下:

主数据框:

    # Main table: one row per (Title1, Section1) observation; pairs may repeat.
    Title1 <- c(
      "101A", "101A", "101A", "101A", "101A", "101A",
      "203S", "203S", "203S", "203S", "203S", "203S", "203S", "203S", "203S",
      "203S", "203S", "203S", "203S",
      "400B", "400B", "400B", "400B",
      "200A", "200A",
      "250D", "250D", "250D", "250D"
    )
    Section1 <- c(
      "2A", "2A", "2B", "2B", "2B", "2C",
      "2A", "2A", "4A", "4A", "4A", "4B", "4B", "4C", "4C", "4C", "4C",
      "4D", "4D",
      "2A", "2A", "2B", "2B",
      "2A", "6A",
      "1A", "1B", "2A", "2A"
    )
    # tibble() replaces the deprecated data_frame(); column names are taken from
    # the variable names, exactly as before.
    Main <- tibble(Title1, Section1)

查找表(LookUp):

# Lookup table: each row pairs a title with one section number to keep.
Title_Lookup <- c("101A", "203S", "203S", "400B", "200A", "200A", "250D")
Section_Lookup <- c(2, 2, 4, 2, 2, 6, 2)
# tibble() replaces the deprecated data_frame().
LookUp <- tibble(Title_Lookup, Section_Lookup)

最终数据框(Final):

# Expected result: the unique (title, section) pairs of Main whose section
# number is listed for that title in LookUp.
Section_Final <- c(
  "2A", "2B", "2C",
  "2A", "4A", "4B", "4C", "4D",
  "2A", "2B",
  "2A", "6A",
  "2A"
)
Title_Final <- c(
  "101A", "101A", "101A",
  "203S", "203S", "203S", "203S", "203S",
  "400B", "400B",
  "200A", "200A",
  "250D"
)
# tibble() replaces the deprecated data_frame().
Final <- tibble(Title_Final, Section_Final)

下面是一个使用 dplyr 的解决方案。str_replace 函数来自 stringr,它是 tidyverse 的一部分。如果只想加载 dplyr 包,可以使用 sub("\\D+$", "", Section1) 代替 str_replace。

library(tidyverse)

# Keep the rows of Main whose title and leading section number appear together
# in LookUp, then reduce them to the unique (title, section) pairs.
Main2 <- Main %>%
  # Strip the trailing non-digit suffix ("2A" -> "2") so the section number can
  # be compared with the numeric Section_Lookup column. The backslash must be
  # doubled inside an R string literal; a bare "\D" is a parse error.
  mutate(Number = as.numeric(str_replace(Section1, "\\D+$", ""))) %>%
  # semi_join() filters Main by matches in LookUp without adding its columns.
  semi_join(LookUp, by = c("Title1" = "Title_Lookup",
                           "Number" = "Section_Lookup")) %>%
  select(Title_Final = Title1, Section_Final = Section1) %>%
  distinct()
Main2
# # A tibble: 13 x 2
#    Title_Final Section_Final
#    <chr>       <chr>        
#  1 101A        2A           
#  2 101A        2B           
#  3 101A        2C           
#  4 203S        2A           
#  5 203S        4A           
#  6 203S        4B           
#  7 203S        4C           
#  8 203S        4D           
#  9 400B        2A           
# 10 400B        2B           
# 11 200A        2A           
# 12 200A        6A           
# 13 250D        2A  

这是一个基于sqldf包的解决方案,利用charindex()查看Section_Lookup中的字符串是否出现在Section1中。

# sqldf() lives in the sqldf package, which tidyverse does not attach.
library(sqldf)
library(tidyverse)

# Rebuild the example data so this script runs on its own.
Title1 <- c(
  "101A", "101A", "101A", "101A", "101A", "101A",
  "203S", "203S", "203S", "203S", "203S", "203S", "203S", "203S", "203S",
  "203S", "203S", "203S", "203S",
  "400B", "400B", "400B", "400B",
  "200A", "200A",
  "250D", "250D", "250D", "250D"
)
Section1 <- c(
  "2A", "2A", "2B", "2B", "2B", "2C",
  "2A", "2A", "4A", "4A", "4A", "4B", "4B", "4C", "4C", "4C", "4C", "4D", "4D",
  "2A", "2A", "2B", "2B",
  "2A", "6A",
  "1A", "1B", "2A", "2A"
)
Main <- tibble(Title1, Section1)

Title_Lookup <- c("101A", "203S", "203S", "400B", "200A", "200A", "250D")
# Stored as text so that charindex() can search for the number inside Section1.
Section_Lookup <- as.character(c(2, 2, 4, 2, 2, 6, 2))
LookUp <- tibble(Title_Lookup, Section_Lookup)

# Pair every Main row with the LookUp rows of the same title and keep it when
# the lookup section number occurs somewhere in Section1; `distinct` collapses
# the repeated (Title, Section) pairs.
sqlQuery <- "select distinct a.Title1 as Title, a.Section1 as Section 
                    from Main as a
             left join LookUp as b 
             where 
             a.Title1 = b.Title_Lookup and
             charindex(b.Section_Lookup,a.Section1) > 0"
sqldf(sqlQuery)

...和输出。

> sqldf(sqlQuery)
   Title Section
1   101A      2A
2   101A      2B
3   101A      2C
4   203S      2A
5   203S      4A
6   203S      4B
7   203S      4C
8   203S      4D
9   400B      2A
10  400B      2B
11  200A      2A
12  200A      6A
13  250D      2A
>

另一种方法是仅根据 Section 列进行匹配。

library(dplyr)

# Example data: Main carries two extra columns (Name1, Code) besides the
# title/section pair.
Name1 <- paste0("Name", seq_len(29))
Code <- c(
  10123, 13432, 34554, 45563, 43666, 54444, 55322, 52111, 33443, 88998, 54554,
  33455, 65889, 88888, 22344, 54455, 66655, 22222, 65564, 77677, 65545, 67765,
  34334, 88789, 76776, 67765, 55555, 65445, 65665
)
Title1 <- c(
  "101A", "101A", "101A", "101A", "101A", "101A",
  "203S", "203S", "203S", "203S", "203S", "203S", "203S", "203S", "203S",
  "203S", "203S", "203S", "203S",
  "400B", "400B", "400B", "400B",
  "200A", "200A",
  "250D", "250D", "250D", "250D"
)
Section1 <- c(
  "2A", "2A", "2B", "2B", "2B", "2C",
  "2A", "2A", "4A", "4A", "4A", "4B", "4B", "4C", "4C", "4C", "4C", "4D", "4D",
  "2A", "2A", "2B", "2B",
  "2A", "6A",
  "1A", "1B", "2A", "2A"
)
Main <- tibble(Name1, Code, Title1, Section1)

Title_Lookup <- c("101A", "203S", "203S", "400B", "200A", "200A", "250D")
Section_Lookup <- c(2, 2, 4, 2, 2, 6, 2)
LookUp <- tibble(Title_Lookup, Section_Lookup)

# Distinct section numbers that occur anywhere in the lookup table.
df_sections <- distinct(LookUp, Section_Lookup) %>% as.data.frame()

# Keep the rows whose leading section number is one of the lookup numbers.
# Note that this matches on the section number only: a number listed for any
# title is accepted for every title.
# "\\1" is the regex backreference; an undoubled "\1" would be an octal escape
# for a control character and as.numeric() would then yield NA.
Selected <- Main %>%
  filter(
    as.numeric(sub("^([0-9]+).*", "\\1", Section1)) %in%
      df_sections$Section_Lookup
  ) %>%
  select(Title1, Section1) %>%
  distinct()
Selected

# The result:
# # A tibble: 13 x 2
#    Title1 Section1
#     <chr>    <chr>
#  1   101A       2A
#  2   101A       2B
#  3   101A       2C
#  4   203S       2A
#  5   203S       4A
#  6   203S       4B
#  7   203S       4C
#  8   203S       4D
#  9   400B       2A
# 10   400B       2B
# 11   200A       2A
# 12   200A       6A
# 13   250D       2A