r - 正则表达式的最后一次匹配
r - Last match of regular expression
我正在寻找一个 R 模式匹配表达式,用于提取列表中每个元素里最后一个完全填充的分类法。分类法始终具有相同的格式:一个字母、两个下划线和一个单词(单词有时在方括号内)。未完全填充的分类法在两个下划线后没有单词。
我能够构建一个在一个正则表达式构建器网站上工作的表达式
(.\_\_[A-Za-z\[\]]+)(?!.*__[A-Za-z\[\])
但是我没能成功使用它,也没能把它转换成可用于 R 的 grep {base} 或任何类似模式匹配方法的写法。下面是我尝试过的做法之一:
# Asker's attempt, reported in the question as not working; kept verbatim.
# It calls gsub() with perl = TRUE so that the negative lookahead (?!...)
# is available, and tries to replace each match with the first capture group.
# NOTE(review): the single backslashes ("\_" in the pattern, "\1" in the
# replacement) look like escapes lost in transcription; in an R string they
# would normally be written "\\_" and "\\1" - confirm against the original post.
clean=gsub("(.\_\_[A-Za-z[]]+)(?!.*__[A-Za-z[]])","\1",taxonomies,perl = TRUE)
有什么建议吗?
谢谢!
# Sample input: seven taxonomy strings. Each string is a chain of rank
# entries written as one letter, two underscores and a name (sometimes in
# square brackets); an entry with nothing after the underscores is unfilled.
taxonomies <- list(
  "k__Bacteria; p__Bacteroidetes; c__[Saprospirae]; o__[Saprospirales]; f__Chitinophagaceae; g__; s__",
  "k__Bacteria; p__Actinobacteria; c__MB-A2-108; o__0319-7L14; f__; g__; s__",
  "k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales;f__Corynebacteriaceae; g__Corynebacterium; s__",
  "k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Rhodocyclales; f__Rhodocyclaceae; g__Methyloversatilis; s__",
  "k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; o__Myxococcales; f__; g__; s__",
  "k__Bacteria; p__Proteobacteria; c__[Deltaproteobacteria]; o__[W123]; f__[W123]; g__[W123]; s__[W123.012.123]",
  "k__Bacteria; p__Bacteroidetes; c__[Saprospirae]; o__[Saprospirales]; f__Chitinophagaceae"
)
期望的输出
[1] "f__Chitinophagaceae" "o__0319-7L14" "g__Corynebacterium"
[4] "g__Methyloversatilis" "o__Myxococcales" "s__[W123.012.123]"
[7] "f__Chitinophagaceae"
编辑
包括所需的输出,示例代码 gsub 不起作用。
我们可以使用 stringi 包中的 stri_extract_last:
# Extract, for every string in `taxonomies`, the last rank entry that
# actually has a name after the double underscore.
library(stringi)

# Pattern, piece by piece:
#   [A-Za-z]__       one letter followed by two underscores
#   \\[*             optional opening square bracket(s)
#   [[:alnum:].-]+   at least one letter, digit, dot or hyphen, so an empty
#                    entry such as "g__" can never match
#   \\]*             optional closing square bracket(s)
# The backslashes are doubled because "\[" and "\]" are not valid escape
# sequences inside an R string literal; a single backslash is a parse error.
stri_extract_last(
  unlist(taxonomies),
  regex = "[A-Za-z]__\\[*[[:alnum:].-]+\\]*"
)
#[1] "f__Chitinophagaceae" "o__0319-7L14" "g__Corynebacterium"
#[4] "g__Methyloversatilis" "o__Myxococcales" "s__[W123.012.123]"
#[7] "f__Chitinophagaceae"
在这里,我假设 OP 的意思是提取 **...** 中的字符。这应该是格式问题,因为这些内容并没有以粗体显示。
数据
# Input data used by the answer: seven taxonomy strings. Each string is a
# chain of rank entries written as one letter, two underscores and a name
# (sometimes in square brackets); entries may be left empty after the "__".
taxonomies <- list(
  "k__Bacteria; p__Bacteroidetes; c__[Saprospirae]; o__[Saprospirales]; f__Chitinophagaceae; g__; s__",
  "k__Bacteria; p__Actinobacteria; c__MB-A2-108; o__0319-7L14; f__; g__; s__",
  "k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales;f__Corynebacteriaceae; g__Corynebacterium; s__",
  "k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Rhodocyclales; f__Rhodocyclaceae; g__Methyloversatilis; s__",
  "k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; o__Myxococcales; f__; g__; s__",
  "k__Bacteria; p__Proteobacteria; c__[Deltaproteobacteria]; o__[W123]; f__[W123]; g__[W123]; s__[W123.012.123]",
  "k__Bacteria; p__Bacteroidetes; c__[Saprospirae]; o__[Saprospirales]; f__Chitinophagaceae"
)
我正在寻找一个 R 模式匹配表达式,用于提取列表中每个元素里最后一个完全填充的分类法。分类法始终具有相同的格式:一个字母、两个下划线和一个单词(单词有时在方括号内)。未完全填充的分类法在两个下划线后没有单词。
我能够构建一个在一个正则表达式构建器网站上工作的表达式
(.\_\_[A-Za-z\[\]]+)(?!.*__[A-Za-z\[\])
但是我没能成功使用它,也没能把它转换成可用于 R 的 grep {base} 或任何类似模式匹配方法的写法。下面是我尝试过的做法之一:
# Asker's attempt, reported in the question as not working; kept verbatim.
# It calls gsub() with perl = TRUE so that the negative lookahead (?!...)
# is available, and tries to replace each match with the first capture group.
# NOTE(review): the single backslashes ("\_" in the pattern, "\1" in the
# replacement) look like escapes lost in transcription; in an R string they
# would normally be written "\\_" and "\\1" - confirm against the original post.
clean=gsub("(.\_\_[A-Za-z[]]+)(?!.*__[A-Za-z[]])","\1",taxonomies,perl = TRUE)
有什么建议吗? 谢谢!
# Sample input: seven taxonomy strings. Each string is a chain of rank
# entries written as one letter, two underscores and a name (sometimes in
# square brackets); an entry with nothing after the underscores is unfilled.
taxonomies <- list(
  "k__Bacteria; p__Bacteroidetes; c__[Saprospirae]; o__[Saprospirales]; f__Chitinophagaceae; g__; s__",
  "k__Bacteria; p__Actinobacteria; c__MB-A2-108; o__0319-7L14; f__; g__; s__",
  "k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales;f__Corynebacteriaceae; g__Corynebacterium; s__",
  "k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Rhodocyclales; f__Rhodocyclaceae; g__Methyloversatilis; s__",
  "k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; o__Myxococcales; f__; g__; s__",
  "k__Bacteria; p__Proteobacteria; c__[Deltaproteobacteria]; o__[W123]; f__[W123]; g__[W123]; s__[W123.012.123]",
  "k__Bacteria; p__Bacteroidetes; c__[Saprospirae]; o__[Saprospirales]; f__Chitinophagaceae"
)
期望的输出
[1] "f__Chitinophagaceae" "o__0319-7L14" "g__Corynebacterium"
[4] "g__Methyloversatilis" "o__Myxococcales" "s__[W123.012.123]"
[7] "f__Chitinophagaceae"
编辑 包括所需的输出,示例代码 gsub 不起作用。
我们可以使用 stringi 包中的 stri_extract_last:
# Extract, for every string in `taxonomies`, the last rank entry that
# actually has a name after the double underscore.
library(stringi)

# Pattern, piece by piece:
#   [A-Za-z]__       one letter followed by two underscores
#   \\[*             optional opening square bracket(s)
#   [[:alnum:].-]+   at least one letter, digit, dot or hyphen, so an empty
#                    entry such as "g__" can never match
#   \\]*             optional closing square bracket(s)
# The backslashes are doubled because "\[" and "\]" are not valid escape
# sequences inside an R string literal; a single backslash is a parse error.
stri_extract_last(
  unlist(taxonomies),
  regex = "[A-Za-z]__\\[*[[:alnum:].-]+\\]*"
)
#[1] "f__Chitinophagaceae" "o__0319-7L14" "g__Corynebacterium"
#[4] "g__Methyloversatilis" "o__Myxococcales" "s__[W123.012.123]"
#[7] "f__Chitinophagaceae"
在这里,我假设 OP 的意思是提取 **...** 中的字符。这应该是格式问题,因为这些内容并没有以粗体显示。
数据
# Input data used by the answer: seven taxonomy strings. Each string is a
# chain of rank entries written as one letter, two underscores and a name
# (sometimes in square brackets); entries may be left empty after the "__".
taxonomies <- list(
  "k__Bacteria; p__Bacteroidetes; c__[Saprospirae]; o__[Saprospirales]; f__Chitinophagaceae; g__; s__",
  "k__Bacteria; p__Actinobacteria; c__MB-A2-108; o__0319-7L14; f__; g__; s__",
  "k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales;f__Corynebacteriaceae; g__Corynebacterium; s__",
  "k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Rhodocyclales; f__Rhodocyclaceae; g__Methyloversatilis; s__",
  "k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; o__Myxococcales; f__; g__; s__",
  "k__Bacteria; p__Proteobacteria; c__[Deltaproteobacteria]; o__[W123]; f__[W123]; g__[W123]; s__[W123.012.123]",
  "k__Bacteria; p__Bacteroidetes; c__[Saprospirae]; o__[Saprospirales]; f__Chitinophagaceae"
)