如何使用 R 在另一个特定模式之前提取两个特定模式?
How to extract two specific patterns before another specific pattern using R?
我正在尝试提取一个字母(应该是 K 或 Y)以及该字母和模式 (XO44_TMT6) 之间的所有数字,并将提取的值分别放入两个单独的列(Mod.residue 和 Mod.position.in.pep)中,但没能得到我想要的结果。
下面是我的代码和数据框。谁能解释为什么我的代码失败以及如何解决?
非常感谢!
我的数据框:
structure(list(Modifications = c("Y9(XO44_TMT6)", "Y9(XO44_TMT6)",
"Y9(XO44_TMT6)", "Y9(XO44_TMT6)", "Y9(XO44_TMT6)", "Y9(XO44_TMT6)",
"Y9(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y7(XO44_TMT6); M9(Oxidation)", "Y7(XO44_TMT6); M8(Oxidation)",
"Y7(XO44_TMT6); M8(Oxidation)", "Y7(XO44_TMT6); C9(Carbamidomethyl); C18(Carbamidomethyl)",
"Y7(XO44_TMT6); C15(Carbamidomethyl)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y6(XO44_TMT6); C23(Carbamidomethyl)", "Y6(XO44_TMT6); C12(Carbamidomethyl)",
"Y6(XO44_TMT6); C12(Carbamidomethyl)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)",
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)",
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)",
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y4(XO44_TMT6); C29(Carbamidomethyl)",
"Y4(XO44_TMT6); C13(Carbamidomethyl)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)",
"Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)",
"Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)",
"Y4(XO44_TMT6)", "Y3(XO44_TMT6); M5(Oxidation)", "Y3(XO44_TMT6); C11(Carbamidomethyl)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y29(XO44_TMT6)", "Y25(XO44_TMT6)", "Y25(XO44_TMT6)", "Y25(XO44_TMT6)",
"Y23(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)",
"Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)",
"Y21(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)",
"Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)",
"Y20(XO44_TMT6)", "Y2(XO44_TMT6); C8(Carbamidomethyl)", "Y2(XO44_TMT6); C19(Carbamidomethyl)",
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6); C15(Carbamidomethyl)",
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6); C15(Carbamidomethyl)",
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y19(XO44_TMT6)", "Y19(XO44_TMT6)",
"Y19(XO44_TMT6)", "Y19(XO44_TMT6)", "Y19(XO44_TMT6)", "Y18(XO44_TMT6)",
"Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y18(XO44_TMT6)",
"Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y17(XO44_TMT6)", "Y17(XO44_TMT6)",
"Y17(XO44_TMT6)", "Y17(XO44_TMT6)", "Y17(XO44_TMT6)", "Y16(XO44_TMT6)",
"Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)",
"Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)",
"Y16(XO44_TMT6)", "Y15(XO44_TMT6); C16(Carbamidomethyl)", "Y15(XO44_TMT6)",
"Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)",
"Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)",
"Y14(XO44_TMT6); C15(Carbamidomethyl)", "Y14(XO44_TMT6); C15(Carbamidomethyl)",
"Y14(XO44_TMT6)", "Y14(XO44_TMT6)", "Y14(XO44_TMT6)", "Y13(XO44_TMT6)",
"Y13(XO44_TMT6)", "Y13(XO44_TMT6)", "Y12(XO44_TMT6); C14(Carbamidomethyl)",
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)",
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)",
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)",
"Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)",
"Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y10(XO44_TMT6)",
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)",
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)",
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y1(XO44_TMT6); C9(Carbamidomethyl)",
"Y1(XO44_TMT6); C15(Carbamidomethyl)", "Y1(XO44_TMT6); C15(Carbamidomethyl)",
"Y1(XO44_TMT6); C15(Carbamidomethyl)", "Y1(XO44_TMT6); C15(Carbamidomethyl)",
"Y1(XO44_TMT6); C11(Carbamidomethyl)", "Y1(XO44_TMT6); C11(Carbamidomethyl)",
"Y1(XO44_TMT6); C11(Carbamidomethyl)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y8(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); K4(XO44_TMT6)", "N-Term(Prot)(Met-loss); Y8(XO44_TMT6)",
"N-Term(Prot)(Met-loss); Y8(XO44_TMT6)", "N-Term(Prot)(Met-loss); Y8(XO44_TMT6)",
"N-Term(Prot)(Met-loss); Y8(XO44_TMT6)"), Mod.residue = c("9",
"9", "9", "9", "9", "9", "9", "8", "8", "8", "8", "8", "8", "8",
"8", "8", "8", "8", "8", "8", "8", "8", "8", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6",
"6", "6", "6", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5",
"5", "5", "5", "5", "5", "5", "5", "4", "4", "4", "4", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "3", "3", "3", "3", "3", "3",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "9", "5",
"5", "5", "3", "2", "2", "2", "2", "2", "2", "2", "1", "0", "0",
"0", "0", "0", "0", "0", "0", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "9", "9", "9", "9", "9", "8", "8", "8", "8", "8",
"8", "8", "7", "7", "7", "7", "7", "6", "6", "6", "6", "6", "6",
"6", "6", "6", "6", "5", "5", "5", "5", "5", "5", "5", "5", "5",
"5", "4", "4", "4", "4", "4", "3", "3", "3", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "1", "1", "1", "1",
"1", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "9",
"9", "9", "9", "8", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "4", "8", "8", "8", "8"
), Mod.position.in.pep = c("", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "")), row.names = c(NA, -300L), class = "data.frame")
我的代码:
# Asker's original attempt, kept as posted (it does NOT give the desired
# result). Backslashes are doubled because an R string literal needs "\\" to
# hand a single backslash to the regex engine.
#
# Why it fails: the leading ".*" is greedy and "\\d*" may match zero digits,
# so "\\w{1}" (which also matches digits) ends up on the LAST digit before
# "(XO44_TMT6)". Mod.residue therefore receives a digit and
# Mod.position.in.pep receives an empty string.
df <- df.test %>%
  mutate(
    Mod.residue = gsub(
      ".*(\\w{1})\\d*\\(XO44_TMT6)\\;*\\s*.*", "\\1", Modifications
    ),
    Mod.position.in.pep = gsub(
      ".*\\w{1}(\\d*)\\(XO44_TMT6\\)\\;*\\s*.*", "\\1", Modifications
    )
  )
我认为您要找的是 tidyr::extract,它只需一次函数调用就能满足您的需求。
library(tidyr)

# Capture the residue letter (group 1) and its position (group 2) in one call.
# The lookahead (?=\\(XO44_TMT6\\)) requires the digits to be followed
# directly by the literal "(XO44_TMT6)", so other modifications such as
# "C17(Carbamidomethyl)" are not picked up. Backslashes are doubled because
# they sit inside an R string literal.
output_extract <- df %>%
  extract(
    Modifications,
    into = c("Mod.residue", "Mod.position.in.pep"),
    regex = ".*([A-Z])(\\d+)(?=\\(XO44_TMT6\\)).*",
    remove = FALSE
  )
如果你想继续使用 gsub,可以这样做(使用相同的模式,但分别用两个不同的替换 \\1 和 \\2,对应第一个和第二个捕获组;在 R 字符串中反斜杠必须写成两个):
# Same pattern with base gsub(): replacing the whole string with "\\1" keeps
# the residue letter, and with "\\2" keeps the position digits. Backslashes are
# doubled because they sit inside R string literals. Note that gsub() returns
# the input unchanged for any row the pattern does not match.
output_gsub <- df %>%
  mutate(
    Mod.residue = gsub(
      ".*([A-Z])(\\d+)(\\(XO44_TMT6\\)).*", "\\1", Modifications
    ),
    Mod.position.in.pep = gsub(
      ".*([A-Z])(\\d+)(\\(XO44_TMT6\\)).*", "\\2", Modifications
    )
  )
这些方法给出相同的输出:
identical(output_extract, output_gsub)
[1] TRUE
之后您可能希望使用 as.numeric 将 “Mod.position.in.pep” 列转换为数值型。
输出列的唯一值:
$Mod.residue
[1] "Y" "K"
$Mod.position.in.pep
[1] "9" "8" "7" "6" "5" "4" "3" "29" "25" "23" "22" "21" "20" "2" "19" "18" "17" "16" "15" "14" "13" "12" "11" "10" "1"
编辑
以上方法仅在每行只有一个 “Mod.residue”/“Mod.position.in.pep” 匹配项时有效。
如果每个观测(每行)中有多个 “[KY]数字(XO44_TMT6)”,您可能需要采用更复杂的方法:mutate %>% unnest_wider %>% unite
# Attach the packages first: tibble() and %>% are used immediately below.
library(dplyr)
library(tidyr)
library(stringr)

# Example data: one row carrying two "(XO44_TMT6)" modifications.
df <- tibble(Modifications = "K4(XO44_TMT6); Y6(XO44_TMT6)")

# Solution: collect every match per row into list-columns, spread each list
# into numbered helper columns, then paste the helpers back together with ";".
# Backslashes are doubled because they sit inside R string literals.
df %>%
  mutate(
    Mod.residue = str_extract_all(
      Modifications, "[A-Z]+(?=\\d+\\(XO44_TMT6\\))"
    ),
    Mod.position.in.pep = str_extract_all(
      Modifications, "\\d+(?=\\(XO44_TMT6\\))"
    )
  ) %>%
  unnest_wider(col = "Mod.residue", names_sep = "_") %>%
  unnest_wider(col = "Mod.position.in.pep", names_sep = "_") %>%
  unite(
    starts_with("Mod.residue"),
    col = "Mod.residue", sep = ";", remove = TRUE, na.rm = TRUE
  ) %>%
  unite(
    starts_with("Mod.position"),
    col = "Mod.position.in.pep", sep = ";", remove = TRUE, na.rm = TRUE
  )
输出
# A tibble: 1 x 3
Modifications Mod.residue Mod.position.in.pep
<chr> <chr> <chr>
1 K4(XO44_TMT6); Y6(XO44_TMT6) K;Y 4;6
我会先创建一个中间列,保存提取出的完整残基及其位置(例如 Y9)。然后就可以基于这一列把残基字母和位置分开,最后删除这个中间列:
# Extract the K/Y residue that carries the "(XO44_TMT6)" modification into a
# temporary column, split it into letter and position, then drop the helper.
df.test %>%
  mutate(
    # The lookahead ties the match to "(XO44_TMT6)" so that a K/Y carrying a
    # different modification earlier in the string is not picked up instead.
    residue = str_extract(Modifications, "[KY][0-9]+(?=\\(XO44_TMT6\\))"),
    Mod.residue = str_extract(residue, "^[KY]"),
    # "[0-9]+" (not "[0-9]") keeps multi-digit positions intact: "Y29" -> 29.
    Mod.position.in.pep = as.numeric(str_extract(residue, "[0-9]+"))
  ) %>%
  select(-residue)
str_extract
旨在与其他 tidyverse 函数一起使用。
我正在尝试提取一个字母(应该是 K 或 Y)以及该字母和模式 (XO44_TMT6) 之间的所有数字,并将提取的值分别放入两个单独的列(Mod.residue 和 Mod.position.in.pep)中,但没能得到我想要的结果。
下面是我的代码和数据框。谁能解释为什么我的代码失败以及如何解决?
非常感谢!
我的数据框:
structure(list(Modifications = c("Y9(XO44_TMT6)", "Y9(XO44_TMT6)",
"Y9(XO44_TMT6)", "Y9(XO44_TMT6)", "Y9(XO44_TMT6)", "Y9(XO44_TMT6)",
"Y9(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y7(XO44_TMT6); M9(Oxidation)", "Y7(XO44_TMT6); M8(Oxidation)",
"Y7(XO44_TMT6); M8(Oxidation)", "Y7(XO44_TMT6); C9(Carbamidomethyl); C18(Carbamidomethyl)",
"Y7(XO44_TMT6); C15(Carbamidomethyl)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y6(XO44_TMT6); C23(Carbamidomethyl)", "Y6(XO44_TMT6); C12(Carbamidomethyl)",
"Y6(XO44_TMT6); C12(Carbamidomethyl)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)",
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)",
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)",
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y4(XO44_TMT6); C29(Carbamidomethyl)",
"Y4(XO44_TMT6); C13(Carbamidomethyl)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)",
"Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)",
"Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)",
"Y4(XO44_TMT6)", "Y3(XO44_TMT6); M5(Oxidation)", "Y3(XO44_TMT6); C11(Carbamidomethyl)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y29(XO44_TMT6)", "Y25(XO44_TMT6)", "Y25(XO44_TMT6)", "Y25(XO44_TMT6)",
"Y23(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)",
"Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)",
"Y21(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)",
"Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)",
"Y20(XO44_TMT6)", "Y2(XO44_TMT6); C8(Carbamidomethyl)", "Y2(XO44_TMT6); C19(Carbamidomethyl)",
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6); C15(Carbamidomethyl)",
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6); C15(Carbamidomethyl)",
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y19(XO44_TMT6)", "Y19(XO44_TMT6)",
"Y19(XO44_TMT6)", "Y19(XO44_TMT6)", "Y19(XO44_TMT6)", "Y18(XO44_TMT6)",
"Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y18(XO44_TMT6)",
"Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y17(XO44_TMT6)", "Y17(XO44_TMT6)",
"Y17(XO44_TMT6)", "Y17(XO44_TMT6)", "Y17(XO44_TMT6)", "Y16(XO44_TMT6)",
"Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)",
"Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)",
"Y16(XO44_TMT6)", "Y15(XO44_TMT6); C16(Carbamidomethyl)", "Y15(XO44_TMT6)",
"Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)",
"Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)",
"Y14(XO44_TMT6); C15(Carbamidomethyl)", "Y14(XO44_TMT6); C15(Carbamidomethyl)",
"Y14(XO44_TMT6)", "Y14(XO44_TMT6)", "Y14(XO44_TMT6)", "Y13(XO44_TMT6)",
"Y13(XO44_TMT6)", "Y13(XO44_TMT6)", "Y12(XO44_TMT6); C14(Carbamidomethyl)",
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)",
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)",
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)",
"Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)",
"Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y10(XO44_TMT6)",
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)",
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)",
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y1(XO44_TMT6); C9(Carbamidomethyl)",
"Y1(XO44_TMT6); C15(Carbamidomethyl)", "Y1(XO44_TMT6); C15(Carbamidomethyl)",
"Y1(XO44_TMT6); C15(Carbamidomethyl)", "Y1(XO44_TMT6); C15(Carbamidomethyl)",
"Y1(XO44_TMT6); C11(Carbamidomethyl)", "Y1(XO44_TMT6); C11(Carbamidomethyl)",
"Y1(XO44_TMT6); C11(Carbamidomethyl)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y8(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); K4(XO44_TMT6)", "N-Term(Prot)(Met-loss); Y8(XO44_TMT6)",
"N-Term(Prot)(Met-loss); Y8(XO44_TMT6)", "N-Term(Prot)(Met-loss); Y8(XO44_TMT6)",
"N-Term(Prot)(Met-loss); Y8(XO44_TMT6)"), Mod.residue = c("9",
"9", "9", "9", "9", "9", "9", "8", "8", "8", "8", "8", "8", "8",
"8", "8", "8", "8", "8", "8", "8", "8", "8", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6",
"6", "6", "6", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5",
"5", "5", "5", "5", "5", "5", "5", "4", "4", "4", "4", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "3", "3", "3", "3", "3", "3",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "9", "5",
"5", "5", "3", "2", "2", "2", "2", "2", "2", "2", "1", "0", "0",
"0", "0", "0", "0", "0", "0", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "9", "9", "9", "9", "9", "8", "8", "8", "8", "8",
"8", "8", "7", "7", "7", "7", "7", "6", "6", "6", "6", "6", "6",
"6", "6", "6", "6", "5", "5", "5", "5", "5", "5", "5", "5", "5",
"5", "4", "4", "4", "4", "4", "3", "3", "3", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "1", "1", "1", "1",
"1", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "9",
"9", "9", "9", "8", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "4", "8", "8", "8", "8"
), Mod.position.in.pep = c("", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "")), row.names = c(NA, -300L), class = "data.frame")
我的代码:
# Asker's original attempt, kept as posted (it does NOT give the desired
# result). Backslashes are doubled because an R string literal needs "\\" to
# hand a single backslash to the regex engine.
#
# Why it fails: the leading ".*" is greedy and "\\d*" may match zero digits,
# so "\\w{1}" (which also matches digits) ends up on the LAST digit before
# "(XO44_TMT6)". Mod.residue therefore receives a digit and
# Mod.position.in.pep receives an empty string.
df <- df.test %>%
  mutate(
    Mod.residue = gsub(
      ".*(\\w{1})\\d*\\(XO44_TMT6)\\;*\\s*.*", "\\1", Modifications
    ),
    Mod.position.in.pep = gsub(
      ".*\\w{1}(\\d*)\\(XO44_TMT6\\)\\;*\\s*.*", "\\1", Modifications
    )
  )
我认为您要找的是 tidyr::extract,它只需一次函数调用就能满足您的需求。
library(tidyr)

# Capture the residue letter (group 1) and its position (group 2) in one call.
# The lookahead (?=\\(XO44_TMT6\\)) requires the digits to be followed
# directly by the literal "(XO44_TMT6)", so other modifications such as
# "C17(Carbamidomethyl)" are not picked up. Backslashes are doubled because
# they sit inside an R string literal.
output_extract <- df %>%
  extract(
    Modifications,
    into = c("Mod.residue", "Mod.position.in.pep"),
    regex = ".*([A-Z])(\\d+)(?=\\(XO44_TMT6\\)).*",
    remove = FALSE
  )
如果你想继续使用 gsub,可以这样做(使用相同的模式,但分别用两个不同的替换 \\1 和 \\2,对应第一个和第二个捕获组;在 R 字符串中反斜杠必须写成两个):
# Same pattern with base gsub(): replacing the whole string with "\\1" keeps
# the residue letter, and with "\\2" keeps the position digits. Backslashes are
# doubled because they sit inside R string literals. Note that gsub() returns
# the input unchanged for any row the pattern does not match.
output_gsub <- df %>%
  mutate(
    Mod.residue = gsub(
      ".*([A-Z])(\\d+)(\\(XO44_TMT6\\)).*", "\\1", Modifications
    ),
    Mod.position.in.pep = gsub(
      ".*([A-Z])(\\d+)(\\(XO44_TMT6\\)).*", "\\2", Modifications
    )
  )
这些方法给出相同的输出:
identical(output_extract, output_gsub)
[1] TRUE
之后您可能希望使用 as.numeric 将 “Mod.position.in.pep” 列转换为数值型。
输出列的唯一值:
$Mod.residue
[1] "Y" "K"
$Mod.position.in.pep
[1] "9" "8" "7" "6" "5" "4" "3" "29" "25" "23" "22" "21" "20" "2" "19" "18" "17" "16" "15" "14" "13" "12" "11" "10" "1"
编辑
以上方法仅在每行只有一个 “Mod.residue”/“Mod.position.in.pep” 匹配项时有效。
如果每个观测(每行)中有多个 “[KY]数字(XO44_TMT6)”,您可能需要采用更复杂的方法:mutate %>% unnest_wider %>% unite
# Attach the packages first: tibble() and %>% are used immediately below.
library(dplyr)
library(tidyr)
library(stringr)

# Example data: one row carrying two "(XO44_TMT6)" modifications.
df <- tibble(Modifications = "K4(XO44_TMT6); Y6(XO44_TMT6)")

# Solution: collect every match per row into list-columns, spread each list
# into numbered helper columns, then paste the helpers back together with ";".
# Backslashes are doubled because they sit inside R string literals.
df %>%
  mutate(
    Mod.residue = str_extract_all(
      Modifications, "[A-Z]+(?=\\d+\\(XO44_TMT6\\))"
    ),
    Mod.position.in.pep = str_extract_all(
      Modifications, "\\d+(?=\\(XO44_TMT6\\))"
    )
  ) %>%
  unnest_wider(col = "Mod.residue", names_sep = "_") %>%
  unnest_wider(col = "Mod.position.in.pep", names_sep = "_") %>%
  unite(
    starts_with("Mod.residue"),
    col = "Mod.residue", sep = ";", remove = TRUE, na.rm = TRUE
  ) %>%
  unite(
    starts_with("Mod.position"),
    col = "Mod.position.in.pep", sep = ";", remove = TRUE, na.rm = TRUE
  )
输出
# A tibble: 1 x 3
Modifications Mod.residue Mod.position.in.pep
<chr> <chr> <chr>
1 K4(XO44_TMT6); Y6(XO44_TMT6) K;Y 4;6
我会先创建一个中间列,保存提取出的完整残基及其位置(例如 Y9)。然后就可以基于这一列把残基字母和位置分开,最后删除这个中间列:
# Extract the K/Y residue that carries the "(XO44_TMT6)" modification into a
# temporary column, split it into letter and position, then drop the helper.
df.test %>%
  mutate(
    # The lookahead ties the match to "(XO44_TMT6)" so that a K/Y carrying a
    # different modification earlier in the string is not picked up instead.
    residue = str_extract(Modifications, "[KY][0-9]+(?=\\(XO44_TMT6\\))"),
    Mod.residue = str_extract(residue, "^[KY]"),
    # "[0-9]+" (not "[0-9]") keeps multi-digit positions intact: "Y29" -> 29.
    Mod.position.in.pep = as.numeric(str_extract(residue, "[0-9]+"))
  ) %>%
  select(-residue)
str_extract
旨在与其他 tidyverse 函数一起使用。