如何根据 TOOL 列中的不同值在 ITEM 上使用多个分隔符?
How to use multiple delimiters on ITEM based on different values in TOOL column?
我有一个这样的数据框
# Sample data: a single ID with nine TOOL/ITEM pairs. The character that
# separates the parts of ITEM differs by TOOL prefix:
#   ABC_* -> "."   DEF_* -> "\"   IJK_* -> " "
ID <- c("A", "A", "A", "A", "A", "A", "A", "A", "A")
TOOL <- c(
  "ABC_01", "ABC_04", "ABC_02",
  "DEF_01", "DEF_05", "DEF_02",
  "IJK_04", "IJK_01", "IJK_02"
)
# A literal backslash has to be written as "\\" inside an R string. A bare
# "\A" / "\L" / "\S" is an unrecognised escape (parse error) and "\1" / "\2" /
# "\5" would be read as octal character escapes.
ITEM <- c(
  "RFALT.2SHEET.S13",
  "RFTACU_789.L_PM.2_N13",
  "CS20_789VIS.200_L_PM.STD",
  "ACRF55_16_T37\\AT_PM3\\1 N1",
  "RFALT\\2SHEET\\S13",
  "RFNF_1_2U\\L_PM3\\5 N13",
  "RFALT 2SHEET S13",
  "CS20_STD 2DUB_L SP C9",
  "RFNC_DBS_T2000 L_EDGE 1 N13"
)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df <- data.frame(ID, TOOL, ITEM, stringsAsFactors = FALSE)
我正在尝试根据 TOOL 列的值,对 ITEM 使用不同的分隔符:
- 如果 TOOL LIKE 'ABC',则提取 ITEM 中第一个 . 之前的字符串(. 是分隔符)
- 如果 TOOL LIKE 'DEF',则提取 ITEM 中第一个 \ 之前的字符串(\ 是分隔符)
- 如果 TOOL LIKE 'IJK',则提取 ITEM 中第一个空格之前的字符串(空格是分隔符)
期望输出
 ID TOOL ITEM
A ABC_01 RFALT
A ABC_04 RFTACU_789
A ABC_02 CS20_789VIS
A DEF_01 ACRF55_16_T37
A DEF_05 RFALT
A DEF_02 RFNF_1_2U
A IJK_04 RFALT
A IJK_01 CS20_STD
A IJK_02 RFNC_DBS_T2000
我正在尝试使用 dplyr 和 stringr 包:
library(dplyr)
library(stringr)
# Asker's attempt, kept verbatim because the question is about why it fails.
# NOTE(review): the two filter() calls are chained, so the second one only sees
# rows that already matched "ABC"; none of the sample TOOL values contains both
# "ABC" and "DEF", so no rows would survive the pipeline.
# NOTE(review): "\)" and "\." are not valid escape sequences in an R string
# literal; a literal backslash in a pattern must be written as "\\".
df2 <- df %>%
filter(grepl("ABC",TOOL)) %>%
mutate(ITEM = str_extract(ITEM, "^.*(?=.\)")) %>%
filter(grepl("DEF",TOOL)) %>%
mutate(ITEM = str_extract(ITEM, "^.*(?=.\.)"))
这行不通。有人能给我指出正确的方向吗?
我确信用正则表达式可以更高效地完成这件事,但最直接的方法是使用 strsplit 拆分,然后选取拆分结果中的第一个元素:
library(dplyr)
# Split every ITEM on any of the three delimiters (".", " " or "\") and keep
# the first piece in a new column ITEM2.
# The regex alternatives are `\.` (literal dot), a space, and `\\` (literal
# backslash); each regex backslash is doubled again inside the R string.
# vapply() guarantees a character vector even for zero-row input.
df %>%
  mutate(
    ITEM2 = vapply(strsplit(ITEM, "\\.| |\\\\"), `[[`, character(1), 1L)
  )
# ID TOOL ITEM ITEM2
# 1 A ABC_01 RFALT.2SHEET.S13 RFALT
# 2 A ABC_04 RFTACU_789.L_PM.2_N13 RFTACU_789
# 3 A ABC_02 CS20_789VIS.200_L_PM.STD CS20_789VIS
# 4 A DEF_01 ACRF55_16_T37\AT_PM3\1 N1 ACRF55_16_T37
# 5 A DEF_05 RFALT\2SHEET\S13 RFALT
# 6 A DEF_02 RFNF_1_2U\L_PM3\5 N13 RFNF_1_2U
# 7 A IJK_04 RFALT 2SHEET S13 RFALT
# 8 A IJK_01 CS20_STD 2DUB_L SP C9 CS20_STD
# 9 A IJK_02 RFNC_DBS_T2000 L_EDGE 1 N13 RFNC_DBS_T2000
一种选择是使用 str_remove:
library(tidyverse)
# Drop everything from the first delimiter (".", "\" or " ") to the end of
# ITEM, regardless of TOOL.
# Inside the character class the backslash must reach the regex engine as
# `\\`, i.e. "\\\\" in the R string; with only "\\" the class would be
# `[.\ ]`, which matches a dot or a space but never a backslash.
df %>%
  mutate(ITEM = str_remove(ITEM, "[.\\\\ ].*"))
# ID TOOL ITEM
#1 A ABC_01 RFALT
#2 A ABC_04 RFTACU_789
#3 A ABC_02 CS20_789VIS
#4 A DEF_01 ACRF55_16_T37
#5 A DEF_05 RFALT
#6 A DEF_02 RFNF_1_2U
#7 A IJK_04 RFALT
#8 A IJK_01 CS20_STD
#9 A IJK_02 RFNC_DBS_T2000
如果模式特定于特定 'TOOL',一种选择是单独应用 str_remove
# Apply a TOOL-specific delimiter: for each (TOOL prefix, delimiter) pair keep
# the rows whose TOOL contains the prefix and strip ITEM from the first
# delimiter onwards; the per-prefix results are row-bound in the given order.
# The delimiter is pasted into a character class, so the backslash entry must
# be the two-character regex escape `\\` ("\\\\" as an R string); a single
# backslash would produce the invalid pattern `[\].*`.
map2_df(
  c("ABC", "DEF", "IJK"),
  c(".", "\\\\", " "),
  ~ df %>%
    filter(str_detect(TOOL, .x)) %>%
    mutate(ITEM = str_remove(ITEM, paste0("[", .y, "].*")))
)
# ID TOOL ITEM
#1 A ABC_01 RFALT
#2 A ABC_04 RFTACU_789
#3 A ABC_02 CS20_789VIS
#4 A DEF_01 ACRF55_16_T37
#5 A DEF_05 RFALT
#6 A DEF_02 RFNF_1_2U
#7 A IJK_04 RFALT
#8 A IJK_01 CS20_STD
#9 A IJK_02 RFNC_DBS_T2000
我们可以使用 case_when 和 str_replace。
library(tidyverse)
# Choose the delimiter per row from the TOOL prefix and remove everything from
# the first occurrence of that delimiter to the end of ITEM.
df2 <- df %>%
  mutate(ITEM = case_when(
    # "\\." in the R string is the regex `\.`, a literal dot.
    str_detect(TOOL, "^ABC") ~ str_replace(ITEM, "[\\.].*", ""),
    # "\\\\" in the R string is the regex `\\`, a literal backslash.
    str_detect(TOOL, "^DEF") ~ str_replace(ITEM, "[\\\\].*", ""),
    str_detect(TOOL, "^IJK") ~ str_replace(ITEM, "[:space:].*", ""),
    # Leave ITEM untouched for any other TOOL instead of turning it into NA.
    TRUE ~ ITEM
  ))
df2
# ID TOOL ITEM
# 1 A ABC_01 RFALT
# 2 A ABC_04 RFTACU_789
# 3 A ABC_02 CS20_789VIS
# 4 A DEF_01 ACRF55_16_T37
# 5 A DEF_05 RFALT
# 6 A DEF_02 RFNF_1_2U
# 7 A IJK_04 RFALT
# 8 A IJK_01 CS20_STD
# 9 A IJK_02 RFNC_DBS_T2000
我有一个这样的数据框
# Sample data: a single ID with nine TOOL/ITEM pairs. The character that
# separates the parts of ITEM differs by TOOL prefix:
#   ABC_* -> "."   DEF_* -> "\"   IJK_* -> " "
ID <- c("A", "A", "A", "A", "A", "A", "A", "A", "A")
TOOL <- c(
  "ABC_01", "ABC_04", "ABC_02",
  "DEF_01", "DEF_05", "DEF_02",
  "IJK_04", "IJK_01", "IJK_02"
)
# A literal backslash has to be written as "\\" inside an R string. A bare
# "\A" / "\L" / "\S" is an unrecognised escape (parse error) and "\1" / "\2" /
# "\5" would be read as octal character escapes.
ITEM <- c(
  "RFALT.2SHEET.S13",
  "RFTACU_789.L_PM.2_N13",
  "CS20_789VIS.200_L_PM.STD",
  "ACRF55_16_T37\\AT_PM3\\1 N1",
  "RFALT\\2SHEET\\S13",
  "RFNF_1_2U\\L_PM3\\5 N13",
  "RFALT 2SHEET S13",
  "CS20_STD 2DUB_L SP C9",
  "RFNC_DBS_T2000 L_EDGE 1 N13"
)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df <- data.frame(ID, TOOL, ITEM, stringsAsFactors = FALSE)
我正在尝试根据 TOOL 列的值,对 ITEM 使用不同的分隔符:
- 如果 TOOL LIKE 'ABC',则提取 ITEM 中第一个 . 之前的字符串(. 是分隔符)
- 如果 TOOL LIKE 'DEF',则提取 ITEM 中第一个 \ 之前的字符串(\ 是分隔符)
- 如果 TOOL LIKE 'IJK',则提取 ITEM 中第一个空格之前的字符串(空格是分隔符)
期望输出
 ID TOOL ITEM
 A ABC_01 RFALT
 A ABC_04 RFTACU_789
 A ABC_02 CS20_789VIS
 A DEF_01 ACRF55_16_T37
 A DEF_05 RFALT
 A DEF_02 RFNF_1_2U
 A IJK_04 RFALT
 A IJK_01 CS20_STD
 A IJK_02 RFNC_DBS_T2000
我正在尝试使用 dplyr 和 stringr 包:
library(dplyr)
library(stringr)
df2 <- df %>%
filter(grepl("ABC",TOOL)) %>%
mutate(ITEM = str_extract(ITEM, "^.*(?=.\)")) %>%
filter(grepl("DEF",TOOL)) %>%
mutate(ITEM = str_extract(ITEM, "^.*(?=.\.)"))
这行不通。有人能给我指出正确的方向吗?
我确信用正则表达式可以更高效地完成这件事,但最直接的方法是使用 strsplit 拆分,然后选取拆分结果中的第一个元素:
library(dplyr)
# Split every ITEM on any of the three delimiters (".", " " or "\") and keep
# the first piece in a new column ITEM2.
# The regex alternatives are `\.` (literal dot), a space, and `\\` (literal
# backslash); each regex backslash is doubled again inside the R string.
# vapply() guarantees a character vector even for zero-row input.
df %>%
  mutate(
    ITEM2 = vapply(strsplit(ITEM, "\\.| |\\\\"), `[[`, character(1), 1L)
  )
# ID TOOL ITEM ITEM2
# 1 A ABC_01 RFALT.2SHEET.S13 RFALT
# 2 A ABC_04 RFTACU_789.L_PM.2_N13 RFTACU_789
# 3 A ABC_02 CS20_789VIS.200_L_PM.STD CS20_789VIS
# 4 A DEF_01 ACRF55_16_T37\AT_PM3\1 N1 ACRF55_16_T37
# 5 A DEF_05 RFALT\2SHEET\S13 RFALT
# 6 A DEF_02 RFNF_1_2U\L_PM3\5 N13 RFNF_1_2U
# 7 A IJK_04 RFALT 2SHEET S13 RFALT
# 8 A IJK_01 CS20_STD 2DUB_L SP C9 CS20_STD
# 9 A IJK_02 RFNC_DBS_T2000 L_EDGE 1 N13 RFNC_DBS_T2000
一种选择是使用 str_remove:
library(tidyverse)
# Drop everything from the first delimiter (".", "\" or " ") to the end of
# ITEM, regardless of TOOL.
# Inside the character class the backslash must reach the regex engine as
# `\\`, i.e. "\\\\" in the R string; with only "\\" the class would be
# `[.\ ]`, which matches a dot or a space but never a backslash.
df %>%
  mutate(ITEM = str_remove(ITEM, "[.\\\\ ].*"))
# ID TOOL ITEM
#1 A ABC_01 RFALT
#2 A ABC_04 RFTACU_789
#3 A ABC_02 CS20_789VIS
#4 A DEF_01 ACRF55_16_T37
#5 A DEF_05 RFALT
#6 A DEF_02 RFNF_1_2U
#7 A IJK_04 RFALT
#8 A IJK_01 CS20_STD
#9 A IJK_02 RFNC_DBS_T2000
如果模式特定于特定 'TOOL',一种选择是单独应用 str_remove
# Apply a TOOL-specific delimiter: for each (TOOL prefix, delimiter) pair keep
# the rows whose TOOL contains the prefix and strip ITEM from the first
# delimiter onwards; the per-prefix results are row-bound in the given order.
# The delimiter is pasted into a character class, so the backslash entry must
# be the two-character regex escape `\\` ("\\\\" as an R string); a single
# backslash would produce the invalid pattern `[\].*`.
map2_df(
  c("ABC", "DEF", "IJK"),
  c(".", "\\\\", " "),
  ~ df %>%
    filter(str_detect(TOOL, .x)) %>%
    mutate(ITEM = str_remove(ITEM, paste0("[", .y, "].*")))
)
# ID TOOL ITEM
#1 A ABC_01 RFALT
#2 A ABC_04 RFTACU_789
#3 A ABC_02 CS20_789VIS
#4 A DEF_01 ACRF55_16_T37
#5 A DEF_05 RFALT
#6 A DEF_02 RFNF_1_2U
#7 A IJK_04 RFALT
#8 A IJK_01 CS20_STD
#9 A IJK_02 RFNC_DBS_T2000
我们可以使用 case_when 和 str_replace。
library(tidyverse)
# Choose the delimiter per row from the TOOL prefix and remove everything from
# the first occurrence of that delimiter to the end of ITEM.
df2 <- df %>%
  mutate(ITEM = case_when(
    # "\\." in the R string is the regex `\.`, a literal dot.
    str_detect(TOOL, "^ABC") ~ str_replace(ITEM, "[\\.].*", ""),
    # "\\\\" in the R string is the regex `\\`, a literal backslash.
    str_detect(TOOL, "^DEF") ~ str_replace(ITEM, "[\\\\].*", ""),
    str_detect(TOOL, "^IJK") ~ str_replace(ITEM, "[:space:].*", ""),
    # Leave ITEM untouched for any other TOOL instead of turning it into NA.
    TRUE ~ ITEM
  ))
df2
# ID TOOL ITEM
# 1 A ABC_01 RFALT
# 2 A ABC_04 RFTACU_789
# 3 A ABC_02 CS20_789VIS
# 4 A DEF_01 ACRF55_16_T37
# 5 A DEF_05 RFALT
# 6 A DEF_02 RFNF_1_2U
# 7 A IJK_04 RFALT
# 8 A IJK_01 CS20_STD
# 9 A IJK_02 RFNC_DBS_T2000