只提取 _ 之后的字母和中间的数字
Extract only the letters after _, dropping the numbers in between
所以我有两列如下所示:
V1 V2
ENSP00000222573_N559D ENSG00000105855
ENSP00000222573_N559D ENSG00000105855
ENSP00000267853_E337* ENSG00000108239
ENSP00000299441_R1672P,R1672G ENSG00000127415
ENSP00000334642_K277N. ENSG00000134324
ENSP00000342952_N585R ENSG00000134324
首先,我需要从第一列中提取 _ 之后的所有字母/符号(去掉其中的数字)
因此结果需要如下所示:
V1 V2
ND ENSG00000105855
ND ENSG00000105855
E* ENSG00000108239
RP,RG ENSG00000127415
KN ENSG00000134324
NR ENSG00000134324
然后我想去除重复行:只有当 V1 和 V2 两列的值同时重复时,才删除该重复行。
所以最后的结果是:
V1 V2
ND ENSG00000105855
E* ENSG00000108239
RP,RG ENSG00000127415
KN ENSG00000134324
NR ENSG00000134324
使用 tidyverse 的解决方案,dat2 是最终输出。
library(tidyverse)

# Split V1 at "_" and keep only the part after it, remove every digit from
# that part, then keep the first row for each distinct (V1, V2) pair.
dat2 <- dat %>%
  separate(V1, into = c("Re", "V1"), sep = "_") %>%
  mutate(Re = NULL, V1 = str_remove_all(V1, "[0-9]+")) %>%
  distinct(V1, V2, .keep_all = TRUE)

dat2
# V1 V2
# 1 ND ENSG00000105855
# 2 E* ENSG00000108239
# 3 RP,RG ENSG00000127415
# 4 KN. ENSG00000134324
# 5 NR ENSG00000134324
数据
# Example input: two whitespace-separated columns with a header row.
# The single quotes around some V1 values are consumed by read.table()'s
# default quote handling, so the parsed values do not contain them.
dat <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "V1 V2
ENSP00000222573_N559D ENSG00000105855
ENSP00000222573_N559D ENSG00000105855
'ENSP00000267853_E337*' ENSG00000108239
'ENSP00000299441_R1672P,R1672G' ENSG00000127415
'ENSP00000334642_K277N.' ENSG00000134324
ENSP00000342952_N585R ENSG00000134324"
)
另一种做法是结合使用 sapply 和 strsplit:
# For every column of `df`: values containing "_" are reduced to the piece
# right after the first "_" with all digits removed; values without "_" are
# kept unchanged. Duplicated rows are then dropped.
#
# lapply()/vapply() are used instead of nested sapply(): sapply() simplifies
# its result depending on the input, so a one-row `df` came back as a plain
# named vector that as.data.frame() turned into a single column with two
# rows. A list of character vectors keeps one output column per input column,
# and vapply(..., character(1)) guarantees exactly one string per row.
lapply(df, function(x) {
  vapply(strsplit(x, split = "_", fixed = TRUE), function(y) {
    if (length(y) == 0) {
      # strsplit("") yields character(0); keep the empty string as-is.
      ""
    } else if (length(y) < 2) {
      # No "_" separator in this value: leave it untouched.
      y
    } else {
      # Keep the piece after the first "_" and strip its digits.
      gsub("[0-9]+", "", y[2])
    }
  }, character(1))
}) %>%
  as.data.frame(stringsAsFactors = FALSE, check.names = FALSE) %>%
  distinct()
# V1 V2
# 1 ND ENSG00000105855
# 2 E* ENSG00000108239
# 3 RP,RG ENSG00000127415
# 4 KN. ENSG00000134324
# 5 NR ENSG00000134324
数据:
# Example input: two whitespace-separated columns with a header row,
# read as character columns (no factor conversion).
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text =
"V1 V2
ENSP00000222573_N559D ENSG00000105855
ENSP00000222573_N559D ENSG00000105855
ENSP00000267853_E337* ENSG00000108239
ENSP00000299441_R1672P,R1672G ENSG00000127415
ENSP00000334642_K277N. ENSG00000134324
ENSP00000342952_N585R ENSG00000134324"
)
所以我有两列如下所示:
V1 V2
ENSP00000222573_N559D ENSG00000105855
ENSP00000222573_N559D ENSG00000105855
ENSP00000267853_E337* ENSG00000108239
ENSP00000299441_R1672P,R1672G ENSG00000127415
ENSP00000334642_K277N. ENSG00000134324
ENSP00000342952_N585R ENSG00000134324
首先,我需要从第一列中提取 _ 之后的所有字母/符号(去掉其中的数字),因此结果需要如下所示:
V1 V2
ND ENSG00000105855
ND ENSG00000105855
E* ENSG00000108239
RP,RG ENSG00000127415
KN ENSG00000134324
NR ENSG00000134324
然后我想去除重复行:只有当 V1 和 V2 两列的值同时重复时,才删除该重复行。所以最后的结果是:
V1 V2
ND ENSG00000105855
E* ENSG00000108239
RP,RG ENSG00000127415
KN ENSG00000134324
NR ENSG00000134324
使用 tidyverse 的解决方案,dat2 是最终输出。
library(tidyverse)

# Split V1 at "_" and keep only the part after it, remove every digit from
# that part, then keep the first row for each distinct (V1, V2) pair.
dat2 <- dat %>%
  separate(V1, into = c("Re", "V1"), sep = "_") %>%
  mutate(Re = NULL, V1 = str_remove_all(V1, "[0-9]+")) %>%
  distinct(V1, V2, .keep_all = TRUE)

dat2
# V1 V2
# 1 ND ENSG00000105855
# 2 E* ENSG00000108239
# 3 RP,RG ENSG00000127415
# 4 KN. ENSG00000134324
# 5 NR ENSG00000134324
数据
# Example input: two whitespace-separated columns with a header row.
# The single quotes around some V1 values are consumed by read.table()'s
# default quote handling, so the parsed values do not contain them.
dat <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "V1 V2
ENSP00000222573_N559D ENSG00000105855
ENSP00000222573_N559D ENSG00000105855
'ENSP00000267853_E337*' ENSG00000108239
'ENSP00000299441_R1672P,R1672G' ENSG00000127415
'ENSP00000334642_K277N.' ENSG00000134324
ENSP00000342952_N585R ENSG00000134324"
)
另一种做法是结合使用 sapply 和 strsplit:
# For every column of `df`: values containing "_" are reduced to the piece
# right after the first "_" with all digits removed; values without "_" are
# kept unchanged. Duplicated rows are then dropped.
#
# lapply()/vapply() are used instead of nested sapply(): sapply() simplifies
# its result depending on the input, so a one-row `df` came back as a plain
# named vector that as.data.frame() turned into a single column with two
# rows. A list of character vectors keeps one output column per input column,
# and vapply(..., character(1)) guarantees exactly one string per row.
lapply(df, function(x) {
  vapply(strsplit(x, split = "_", fixed = TRUE), function(y) {
    if (length(y) == 0) {
      # strsplit("") yields character(0); keep the empty string as-is.
      ""
    } else if (length(y) < 2) {
      # No "_" separator in this value: leave it untouched.
      y
    } else {
      # Keep the piece after the first "_" and strip its digits.
      gsub("[0-9]+", "", y[2])
    }
  }, character(1))
}) %>%
  as.data.frame(stringsAsFactors = FALSE, check.names = FALSE) %>%
  distinct()
# V1 V2
# 1 ND ENSG00000105855
# 2 E* ENSG00000108239
# 3 RP,RG ENSG00000127415
# 4 KN. ENSG00000134324
# 5 NR ENSG00000134324
数据:
# Example input: two whitespace-separated columns with a header row,
# read as character columns (no factor conversion).
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text =
"V1 V2
ENSP00000222573_N559D ENSG00000105855
ENSP00000222573_N559D ENSG00000105855
ENSP00000267853_E337* ENSG00000108239
ENSP00000299441_R1672P,R1672G ENSG00000127415
ENSP00000334642_K277N. ENSG00000134324
ENSP00000342952_N585R ENSG00000134324"
)