如何在一列中分隔 3 个不同的信息?
How can I separate 3 different information in a column?
例如,我的这一列中有一行的内容是 'Ser25Phe'。我想把 HGVS.Consequence 这一列拆分开,例如拆成 'Ser 25 Phe'。
...
HGVS.Consequence
Met1?
Met1?
Met1?
Ala2Glu
Ala2Ala
Asn3Asp
Asn3Asn
Gly4Trp
Gly4Arg
Ala6Glu
AsAsp
Arg9Arg
Lys10Arg
Lys10Lys
LeullLeu
Phe12Ser
Phe12Cys
lle13Leu
lle13Val
lle13Phe
Thr15Pro
使用 gsub,并假设例如 "AsAsp" 也应该拆分成 "As Asp"。
# Insert a space between the first residue name, the position and the
# second residue of every element of `x`, then trim the result.
# Backslashes are doubled ("\\d", "\\1") because a single "\d" or "\1"
# inside an R string literal is rejected as an unrecognized escape.
trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x)) |>
  gsub(pattern = " +", replacement = " ") ## optional, to remove inner double whitespace
# [1] "Met 1 ?" "Met 1 ?" "Met 1 ?" "Ala 2 Glu"
# [5] "Ala 2 Ala" "Asn 3 Asp" "Asn 3 Asn" "Gly 4 Trp"
# [9] "Gly 4 Arg" "Ala 6 Glu" "As Asp" "Arg 9 Arg"
# [13] "Lys 10 Arg" "Lys 10 Lys" "Leull Leu" "Phe 12 Ser"
# [17] "Phe 12 Cys" "lle 13 Leu" "lle 13 Val" "lle 13 Phe"
# [21] "Thr 15 Pro"
编辑
如果您的列位于这样的数据框中
# Example data frame: two random numeric columns plus the strings to split.
# The row count follows length(x) rather than a hard-coded 21, so the
# snippet keeps working when `x` has a different number of elements.
df <- data.frame(x1 = rnorm(length(x)), x2 = runif(length(x)), x3 = x)
只需将其包裹在 transform 中:
# Replace column x3 of `df` with its space-separated form.
# The regex reads the data-frame column `x3` (not a global `x`), so the
# call depends only on `df`. Backslashes are doubled because "\d" / "\1"
# are not valid escapes in an R string literal.
df |>
  transform(
    x3 = trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x3)) |>
      gsub(pattern = " +", replacement = " ") # collapse runs of spaces
  )
或者,可能更好,
# Split every element of `x` into three fields and bind them, as columns
# X1..X3, in front of the first two columns of `df`.
# Runs of spaces are deliberately NOT collapsed here: a missing position
# leaves an empty middle field, so strsplit() still yields three pieces
# per element and rbind() produces a regular 3-column matrix.
res <- trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x)) |>
  strsplit(" ") |>
  do.call(what = rbind) |>
  # empty fields and "?" placeholders become NA
  (\(m) replace(m, m %in% c("", "?"), NA))() |>
  data.frame(df[1:2]) |>
  # let R pick column types (the position column becomes integer)
  type.convert(as.is = TRUE)
res
# X1 X2 X3 x1 x2
# 1 Met 1 <NA> 1.33312448 0.83441710
# 2 Met 1 <NA> -0.55792615 0.48805921
# 3 Met 1 <NA> 1.38184166 0.73862824
# 4 Ala 2 Glu -0.87990439 0.42793122
# 5 Ala 2 Ala 0.59143575 0.23370509
# 6 Asn 3 Asp -0.15065801 0.92168932
# 7 Asn 3 Asn -1.59350802 0.58727950
# 8 Gly 4 Trp -0.21971055 0.69603185
# 9 Gly 4 Arg -0.14004599 0.36722717
# 10 Ala 6 Glu 0.31747188 0.54845522
# 11 As NA Asp -0.07593689 0.41273905
# 12 Arg 9 Arg -0.54154181 0.12890089
# 13 Lys 10 Arg 1.09159765 0.19433579
# 14 Lys 10 Lys -0.71238122 0.28212593
# 15 Leull NA Leu -0.68086189 0.89415476
# 16 Phe 12 Ser -0.05169070 0.48129061
# 17 Phe 12 Cys -0.21871795 0.06282263
# 18 lle 13 Leu -1.42723032 0.62185980
# 19 lle 13 Val 0.93924955 0.39333277
# 20 lle 13 Phe 0.71006152 0.22982191
# 21 Thr 15 Pro -0.66542079 0.66382062
其中:
# Show the class of the position column after type.convert().
class(res[["X2"]])
# [1] "integer"
数据:
# Example input: the 21 HGVS.Consequence strings from the question.
x <- c(
  "Met1?", "Met1?", "Met1?",
  "Ala2Glu", "Ala2Ala",
  "Asn3Asp", "Asn3Asn",
  "Gly4Trp", "Gly4Arg",
  "Ala6Glu", "AsAsp", "Arg9Arg",
  "Lys10Arg", "Lys10Lys", "LeullLeu",
  "Phe12Ser", "Phe12Cys",
  "lle13Leu", "lle13Val", "lle13Phe",
  "Thr15Pro"
)
另一个解决方案:
# Wrap the first run of digits in ";" markers, then split on ";" to get a
# 3-column character matrix (residue, position, residue).
# Backslashes are doubled because "\d" / "\1" are not valid escapes in an
# R string literal; TRUE is spelled out because `T` can be reassigned.
x <- c("Ala2Ala", "Asn3Asp", "Ser25Phe")
marked <- sub("(\\d+)", ";\\1;", x)
stringr::str_split(marked, ";", simplify = TRUE)
另一种形式的解决方案:
# Put a space on each side of the first run of digits in every element.
# Backslashes are doubled because "\d" / "\1" are not valid escapes in an
# R string literal.
sub("(\\d+)", " \\1 ", x)
我们可以使用 tidyr 中的 extract 将该列拆分为多个列:
library(dplyr)
library(tidyr)
# Split column x3 into v1 (letters), v2 (digits, possibly empty) and
# v3 (the non-digit remainder); convert = TRUE lets tidyr convert the
# new columns to suitable types.
# Backslashes are doubled because "\d" / "\D" are not valid escapes in an
# R string literal.
df %>%
  extract(
    x3,
    into = c("v1", "v2", "v3"),
    regex = "^([A-Za-z][a-z]+)(\\d*)(\\D+)",
    convert = TRUE
  ) %>%
  # na_if() works on vectors; calling it on the whole data frame with "?"
  # fails in dplyr >= 1.1, so apply it to the column that can hold "?".
  mutate(v3 = na_if(v3, "?"))
输出:
x1 x2 v1 v2 v3
1 -0.545880758 0.27253736 Met 1 <NA>
2 0.536585304 0.21981567 Met 1 <NA>
3 0.419623149 0.04366575 Met 1 <NA>
4 -0.583627199 0.07509480 Ala 2 Glu
5 0.847460017 0.39408293 Ala 2 Ala
6 0.266021979 0.36396781 Asn 3 Asp
7 0.444585270 0.25830122 Asn 3 Asn
8 -0.466495124 0.33670415 Gly 4 Trp
9 -0.848370044 0.46251084 Gly 4 Arg
10 0.002311942 0.85627913 Ala 6 Glu
11 -1.316908124 0.46591567 As NA Asp
12 0.598269113 0.70118573 Arg 9 Arg
13 -0.762214370 0.54757268 Lys 10 Arg
14 -1.429090303 0.99911177 Lys 10 Lys
15 0.332244449 0.45370882 Leull NA Leu
16 -0.469060688 0.29248872 Phe 12 Ser
17 -0.334986794 0.17262897 Phe 12 Cys
18 1.536252156 0.14751666 lle 13 Leu
19 0.609994533 0.48654307 lle 13 Val
20 0.516335698 0.24613129 lle 13 Phe
21 -0.074308561 0.27913013 Thr 15 Pro
数据
# dput() representation of the example data: a 21-row data.frame with
# numeric columns x1 and x2 and the character column x3 holding the
# strings to split.
# NOTE(review): the value is not assigned to a name here; presumably it is
# meant to be stored as `df` before running the code above — confirm.
structure(list(x1 = c(-0.545880758366027, 0.536585304107612,
0.419623148618683, -0.583627199210279, 0.847460017311944, 0.266021979364892,
0.444585270360416, -0.466495123565759, -0.848370043948898, 0.00231194241576697,
-1.31690812429962, 0.598269112694685, -0.7622143703459, -1.42909030324076,
0.332244449013422, -0.469060687608488, -0.334986793584065, 1.53625215550584,
0.609994533253692, 0.51633569843567, -0.0743085613231125), x2 = c(0.272537359734997,
0.219815669581294, 0.0436657541431487, 0.0750948027707636, 0.39408292947337,
0.36396780773066, 0.25830122246407, 0.336704148678109, 0.462510835379362,
0.856279134983197, 0.465915669221431, 0.701185731682926, 0.547572682844475,
0.999111766461283, 0.453708823537454, 0.292488717008382, 0.172628972679377,
0.147516664350405, 0.486543073318899, 0.246131290215999, 0.279130134964362
), x3 = c("Met1?", "Met1?", "Met1?", "Ala2Glu", "Ala2Ala", "Asn3Asp",
"Asn3Asn", "Gly4Trp", "Gly4Arg", "Ala6Glu", "AsAsp", "Arg9Arg",
"Lys10Arg", "Lys10Lys", "LeullLeu", "Phe12Ser", "Phe12Cys", "lle13Leu",
"lle13Val", "lle13Phe", "Thr15Pro")), class = "data.frame", row.names = c(NA,
-21L))
例如,我的这一列中有一行的内容是 'Ser25Phe'。我想把 HGVS.Consequence 这一列拆分开,例如拆成 'Ser 25 Phe'。
...
HGVS.Consequence
Met1?
Met1?
Met1?
Ala2Glu
Ala2Ala
Asn3Asp
Asn3Asn
Gly4Trp
Gly4Arg
Ala6Glu
AsAsp
Arg9Arg
Lys10Arg
Lys10Lys
LeullLeu
Phe12Ser
Phe12Cys
lle13Leu
lle13Val
lle13Phe
Thr15Pro
使用 gsub,并假设例如 "AsAsp" 也应该拆分成 "As Asp"。
# Insert a space between the first residue name, the position and the
# second residue of every element of `x`, then trim the result.
# Backslashes are doubled ("\\d", "\\1") because a single "\d" or "\1"
# inside an R string literal is rejected as an unrecognized escape.
trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x)) |>
  gsub(pattern = " +", replacement = " ") ## optional, to remove inner double whitespace
# [1] "Met 1 ?" "Met 1 ?" "Met 1 ?" "Ala 2 Glu"
# [5] "Ala 2 Ala" "Asn 3 Asp" "Asn 3 Asn" "Gly 4 Trp"
# [9] "Gly 4 Arg" "Ala 6 Glu" "As Asp" "Arg 9 Arg"
# [13] "Lys 10 Arg" "Lys 10 Lys" "Leull Leu" "Phe 12 Ser"
# [17] "Phe 12 Cys" "lle 13 Leu" "lle 13 Val" "lle 13 Phe"
# [21] "Thr 15 Pro"
编辑
如果您的列位于这样的数据框中
# Example data frame: two random numeric columns plus the strings to split.
# The row count follows length(x) rather than a hard-coded 21, so the
# snippet keeps working when `x` has a different number of elements.
df <- data.frame(x1 = rnorm(length(x)), x2 = runif(length(x)), x3 = x)
只需将其包裹在 transform 中:
# Replace column x3 of `df` with its space-separated form.
# The regex reads the data-frame column `x3` (not a global `x`), so the
# call depends only on `df`. Backslashes are doubled because "\d" / "\1"
# are not valid escapes in an R string literal.
df |>
  transform(
    x3 = trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x3)) |>
      gsub(pattern = " +", replacement = " ") # collapse runs of spaces
  )
或者,可能更好,
# Split every element of `x` into three fields and bind them, as columns
# X1..X3, in front of the first two columns of `df`.
# Runs of spaces are deliberately NOT collapsed here: a missing position
# leaves an empty middle field, so strsplit() still yields three pieces
# per element and rbind() produces a regular 3-column matrix.
res <- trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x)) |>
  strsplit(" ") |>
  do.call(what = rbind) |>
  # empty fields and "?" placeholders become NA
  (\(m) replace(m, m %in% c("", "?"), NA))() |>
  data.frame(df[1:2]) |>
  # let R pick column types (the position column becomes integer)
  type.convert(as.is = TRUE)
res
# X1 X2 X3 x1 x2
# 1 Met 1 <NA> 1.33312448 0.83441710
# 2 Met 1 <NA> -0.55792615 0.48805921
# 3 Met 1 <NA> 1.38184166 0.73862824
# 4 Ala 2 Glu -0.87990439 0.42793122
# 5 Ala 2 Ala 0.59143575 0.23370509
# 6 Asn 3 Asp -0.15065801 0.92168932
# 7 Asn 3 Asn -1.59350802 0.58727950
# 8 Gly 4 Trp -0.21971055 0.69603185
# 9 Gly 4 Arg -0.14004599 0.36722717
# 10 Ala 6 Glu 0.31747188 0.54845522
# 11 As NA Asp -0.07593689 0.41273905
# 12 Arg 9 Arg -0.54154181 0.12890089
# 13 Lys 10 Arg 1.09159765 0.19433579
# 14 Lys 10 Lys -0.71238122 0.28212593
# 15 Leull NA Leu -0.68086189 0.89415476
# 16 Phe 12 Ser -0.05169070 0.48129061
# 17 Phe 12 Cys -0.21871795 0.06282263
# 18 lle 13 Leu -1.42723032 0.62185980
# 19 lle 13 Val 0.93924955 0.39333277
# 20 lle 13 Phe 0.71006152 0.22982191
# 21 Thr 15 Pro -0.66542079 0.66382062
其中:
# Show the class of the position column after type.convert().
class(res[["X2"]])
# [1] "integer"
数据:
# Example input: the 21 HGVS.Consequence strings from the question.
x <- c(
  "Met1?", "Met1?", "Met1?",
  "Ala2Glu", "Ala2Ala",
  "Asn3Asp", "Asn3Asn",
  "Gly4Trp", "Gly4Arg",
  "Ala6Glu", "AsAsp", "Arg9Arg",
  "Lys10Arg", "Lys10Lys", "LeullLeu",
  "Phe12Ser", "Phe12Cys",
  "lle13Leu", "lle13Val", "lle13Phe",
  "Thr15Pro"
)
另一个解决方案:
# Wrap the first run of digits in ";" markers, then split on ";" to get a
# 3-column character matrix (residue, position, residue).
# Backslashes are doubled because "\d" / "\1" are not valid escapes in an
# R string literal; TRUE is spelled out because `T` can be reassigned.
x <- c("Ala2Ala", "Asn3Asp", "Ser25Phe")
marked <- sub("(\\d+)", ";\\1;", x)
stringr::str_split(marked, ";", simplify = TRUE)
另一种形式的解决方案:
# Put a space on each side of the first run of digits in every element.
# Backslashes are doubled because "\d" / "\1" are not valid escapes in an
# R string literal.
sub("(\\d+)", " \\1 ", x)
我们可以使用 tidyr 中的 extract 将该列拆分为多个列:
library(dplyr)
library(tidyr)
# Split column x3 into v1 (letters), v2 (digits, possibly empty) and
# v3 (the non-digit remainder); convert = TRUE lets tidyr convert the
# new columns to suitable types.
# Backslashes are doubled because "\d" / "\D" are not valid escapes in an
# R string literal.
df %>%
  extract(
    x3,
    into = c("v1", "v2", "v3"),
    regex = "^([A-Za-z][a-z]+)(\\d*)(\\D+)",
    convert = TRUE
  ) %>%
  # na_if() works on vectors; calling it on the whole data frame with "?"
  # fails in dplyr >= 1.1, so apply it to the column that can hold "?".
  mutate(v3 = na_if(v3, "?"))
输出:
x1 x2 v1 v2 v3
1 -0.545880758 0.27253736 Met 1 <NA>
2 0.536585304 0.21981567 Met 1 <NA>
3 0.419623149 0.04366575 Met 1 <NA>
4 -0.583627199 0.07509480 Ala 2 Glu
5 0.847460017 0.39408293 Ala 2 Ala
6 0.266021979 0.36396781 Asn 3 Asp
7 0.444585270 0.25830122 Asn 3 Asn
8 -0.466495124 0.33670415 Gly 4 Trp
9 -0.848370044 0.46251084 Gly 4 Arg
10 0.002311942 0.85627913 Ala 6 Glu
11 -1.316908124 0.46591567 As NA Asp
12 0.598269113 0.70118573 Arg 9 Arg
13 -0.762214370 0.54757268 Lys 10 Arg
14 -1.429090303 0.99911177 Lys 10 Lys
15 0.332244449 0.45370882 Leull NA Leu
16 -0.469060688 0.29248872 Phe 12 Ser
17 -0.334986794 0.17262897 Phe 12 Cys
18 1.536252156 0.14751666 lle 13 Leu
19 0.609994533 0.48654307 lle 13 Val
20 0.516335698 0.24613129 lle 13 Phe
21 -0.074308561 0.27913013 Thr 15 Pro
数据
# dput() representation of the example data: a 21-row data.frame with
# numeric columns x1 and x2 and the character column x3 holding the
# strings to split.
# NOTE(review): the value is not assigned to a name here; presumably it is
# meant to be stored as `df` before running the code above — confirm.
structure(list(x1 = c(-0.545880758366027, 0.536585304107612,
0.419623148618683, -0.583627199210279, 0.847460017311944, 0.266021979364892,
0.444585270360416, -0.466495123565759, -0.848370043948898, 0.00231194241576697,
-1.31690812429962, 0.598269112694685, -0.7622143703459, -1.42909030324076,
0.332244449013422, -0.469060687608488, -0.334986793584065, 1.53625215550584,
0.609994533253692, 0.51633569843567, -0.0743085613231125), x2 = c(0.272537359734997,
0.219815669581294, 0.0436657541431487, 0.0750948027707636, 0.39408292947337,
0.36396780773066, 0.25830122246407, 0.336704148678109, 0.462510835379362,
0.856279134983197, 0.465915669221431, 0.701185731682926, 0.547572682844475,
0.999111766461283, 0.453708823537454, 0.292488717008382, 0.172628972679377,
0.147516664350405, 0.486543073318899, 0.246131290215999, 0.279130134964362
), x3 = c("Met1?", "Met1?", "Met1?", "Ala2Glu", "Ala2Ala", "Asn3Asp",
"Asn3Asn", "Gly4Trp", "Gly4Arg", "Ala6Glu", "AsAsp", "Arg9Arg",
"Lys10Arg", "Lys10Lys", "LeullLeu", "Phe12Ser", "Phe12Cys", "lle13Leu",
"lle13Val", "lle13Phe", "Thr15Pro")), class = "data.frame", row.names = c(NA,
-21L))