如何在一列中分隔 3 个不同的信息?

How can I separate 3 different pieces of information in a column?

例如,我的这一列中有一行是 'Ser25Phe'。我想把 HGVS.Consequence 列拆分,例如拆成 'Ser 25 Phe'...

HGVS.Consequence
           Met1?
           Met1?
           Met1?
         Ala2Glu
         Ala2Ala
         Asn3Asp
         Asn3Asn
         Gly4Trp
         Gly4Arg
         Ala6Glu
           AsAsp
         Arg9Arg
        Lys10Arg
        Lys10Lys
        LeullLeu
        Phe12Ser
        Phe12Cys
        lle13Leu
        lle13Val
        lle13Phe
        Thr15Pro

使用 gsub,并假设例如 "AsAsp" 也应该拆分成 "As Asp"。

# Put spaces between the leading residue, the optional position number and the
# trailing residue (or "?"). The backslashes are doubled because an R string
# literal consumes one level of escaping before the regex engine sees it.
trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x)) |>
  # Optional: collapse the double space left where no number was captured.
  gsub(pattern = "  ", replacement = " ")
# [1] "Met 1 ?"    "Met 1 ?"    "Met 1 ?"    "Ala 2 Glu" 
# [5] "Ala 2 Ala"  "Asn 3 Asp"  "Asn 3 Asn"  "Gly 4 Trp" 
# [9] "Gly 4 Arg"  "Ala 6 Glu"  "As Asp"     "Arg 9 Arg" 
# [13] "Lys 10 Arg" "Lys 10 Lys" "Leull Leu"  "Phe 12 Ser"
# [17] "Phe 12 Cys" "lle 13 Leu" "lle 13 Val" "lle 13 Phe"
# [21] "Thr 15 Pro"

demo.

编辑

如果您的列位于这样的数据框中

# Example data frame: two random numeric columns plus the strings to split.
df <- data.frame(
  x1 = rnorm(21),
  x2 = runif(21),
  x3 = x
)

只需将其包裹在 transform 中:

# Overwrite x3 with its space-separated form. The regex is applied to the
# column x3 itself, so the result does not depend on a separate vector x
# existing in the calling environment. Backslashes are doubled because R
# string literals consume one level of escaping.
df |>
  transform(
    x3 = trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x3)) |>
      gsub(pattern = "  ", replacement = " ")
  )

或者,可能更好,

# Split every string into three pieces and bind them, as columns X1..X3, in
# front of the first two columns of df. Backslashes in the regex are doubled
# because R string literals consume one level of escaping.
res <- trimws(gsub("([A-Z]?[a-z]+)(\\d+)?([A-Z?]*)", "\\1 \\2 \\3", x)) |>
  # Splitting on single spaces keeps an empty middle field when no number
  # was captured, so every element yields exactly three pieces.
  strsplit(" ") |>
  do.call(what = rbind) |>
  # Empty fields and "?" placeholders become NA.
  (\(m) replace(m, m %in% c("", "?"), NA))() |>
  data.frame(df[1:2]) |>
  # Re-infer column types (the number column becomes integer); as.is = TRUE
  # keeps the text columns as character instead of factor.
  type.convert(as.is = TRUE)
res
#       X1 X2   X3          x1         x2
# 1    Met  1 <NA>  1.33312448 0.83441710
# 2    Met  1 <NA> -0.55792615 0.48805921
# 3    Met  1 <NA>  1.38184166 0.73862824
# 4    Ala  2  Glu -0.87990439 0.42793122
# 5    Ala  2  Ala  0.59143575 0.23370509
# 6    Asn  3  Asp -0.15065801 0.92168932
# 7    Asn  3  Asn -1.59350802 0.58727950
# 8    Gly  4  Trp -0.21971055 0.69603185
# 9    Gly  4  Arg -0.14004599 0.36722717
# 10   Ala  6  Glu  0.31747188 0.54845522
# 11    As NA  Asp -0.07593689 0.41273905
# 12   Arg  9  Arg -0.54154181 0.12890089
# 13   Lys 10  Arg  1.09159765 0.19433579
# 14   Lys 10  Lys -0.71238122 0.28212593
# 15 Leull NA  Leu -0.68086189 0.89415476
# 16   Phe 12  Ser -0.05169070 0.48129061
# 17   Phe 12  Cys -0.21871795 0.06282263
# 18   lle 13  Leu -1.42723032 0.62185980
# 19   lle 13  Val  0.93924955 0.39333277
# 20   lle 13  Phe  0.71006152 0.22982191
# 21   Thr 15  Pro -0.66542079 0.66382062

其中:

# Inspect the type of the position column after type.convert().
class(res[["X2"]])
# [1] "integer"

数据:

# Example input strings; "AsAsp" and "LeullLeu" contain no position number.
x <- c(
  "Met1?", "Met1?", "Met1?",
  "Ala2Glu", "Ala2Ala",
  "Asn3Asp", "Asn3Asn",
  "Gly4Trp", "Gly4Arg",
  "Ala6Glu",
  "AsAsp",
  "Arg9Arg",
  "Lys10Arg", "Lys10Lys",
  "LeullLeu",
  "Phe12Ser", "Phe12Cys",
  "lle13Leu", "lle13Val", "lle13Phe",
  "Thr15Pro"
)

另一个解决方案:

x <- c("Ala2Ala", "Asn3Asp", "Ser25Phe")
# Wrap the first run of digits in ";" and split on it; simplify = TRUE returns
# a character matrix instead of a list. Backslashes are doubled because R
# string literals consume one level of escaping.
stringr::str_split(sub("(\\d+)", ";\\1;", x), ";", simplify = TRUE)

另一种形式的解决方案:

# Surround the first run of digits with spaces ("\\1" is the captured number).
sub("(\\d+)", " \\1 ", x)

我们可以使用 tidyr 中的 extract 将列拆分为多个列

library(dplyr)
library(tidyr)
# Split x3 into v1 (leading letters), v2 (digits) and v3 (remaining
# non-digits). convert = TRUE re-infers the types of the new columns, so the
# digit column does not stay character. Backslashes are doubled because R
# string literals consume one level of escaping.
df %>%
  extract(
    x3,
    into = c("v1", "v2", "v3"),
    regex = "^([A-Za-z][a-z]+)(\\d*)(\\D+)",
    convert = TRUE
  ) %>%
  # na_if() operates on vectors, so apply it to each character column rather
  # than to the whole data frame.
  mutate(across(where(is.character), \(col) na_if(col, "?")))

-输出

          x1         x2    v1 v2   v3
1  -0.545880758 0.27253736   Met  1 <NA>
2   0.536585304 0.21981567   Met  1 <NA>
3   0.419623149 0.04366575   Met  1 <NA>
4  -0.583627199 0.07509480   Ala  2  Glu
5   0.847460017 0.39408293   Ala  2  Ala
6   0.266021979 0.36396781   Asn  3  Asp
7   0.444585270 0.25830122   Asn  3  Asn
8  -0.466495124 0.33670415   Gly  4  Trp
9  -0.848370044 0.46251084   Gly  4  Arg
10  0.002311942 0.85627913   Ala  6  Glu
11 -1.316908124 0.46591567    As NA  Asp
12  0.598269113 0.70118573   Arg  9  Arg
13 -0.762214370 0.54757268   Lys 10  Arg
14 -1.429090303 0.99911177   Lys 10  Lys
15  0.332244449 0.45370882 Leull NA  Leu
16 -0.469060688 0.29248872   Phe 12  Ser
17 -0.334986794 0.17262897   Phe 12  Cys
18  1.536252156 0.14751666   lle 13  Leu
19  0.609994533 0.48654307   lle 13  Val
20  0.516335698 0.24613129   lle 13  Phe
21 -0.074308561 0.27913013   Thr 15  Pro

数据

# Literal (dput-style) definition of the example data frame: numeric columns
# x1 and x2, character column x3, 21 rows.
# NOTE(review): the value is not assigned here; presumably it should be bound
# to `df` before running the extract() example -- confirm.
structure(list(x1 = c(-0.545880758366027, 0.536585304107612, 
0.419623148618683, -0.583627199210279, 0.847460017311944, 0.266021979364892, 
0.444585270360416, -0.466495123565759, -0.848370043948898, 0.00231194241576697, 
-1.31690812429962, 0.598269112694685, -0.7622143703459, -1.42909030324076, 
0.332244449013422, -0.469060687608488, -0.334986793584065, 1.53625215550584, 
0.609994533253692, 0.51633569843567, -0.0743085613231125), x2 = c(0.272537359734997, 
0.219815669581294, 0.0436657541431487, 0.0750948027707636, 0.39408292947337, 
0.36396780773066, 0.25830122246407, 0.336704148678109, 0.462510835379362, 
0.856279134983197, 0.465915669221431, 0.701185731682926, 0.547572682844475, 
0.999111766461283, 0.453708823537454, 0.292488717008382, 0.172628972679377, 
0.147516664350405, 0.486543073318899, 0.246131290215999, 0.279130134964362
), x3 = c("Met1?", "Met1?", "Met1?", "Ala2Glu", "Ala2Ala", "Asn3Asp", 
"Asn3Asn", "Gly4Trp", "Gly4Arg", "Ala6Glu", "AsAsp", "Arg9Arg", 
"Lys10Arg", "Lys10Lys", "LeullLeu", "Phe12Ser", "Phe12Cys", "lle13Leu", 
"lle13Val", "lle13Phe", "Thr15Pro")), class = "data.frame", row.names = c(NA, 
-21L))