将从文本列中提取的列表拆分为多个数字列
Separate list extracted from text column into multiple numeric columns
我有一个文本列,我想将其数字分成几列:
# Example data: a single character column `text`; each entry encodes six
# key=value pairs (DH, SZ, RM, FO, GP, BC) whose values are a digit or NA.
df <- data.frame(
  text = c(
    "((i: DH=0, SZ=0, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=0, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=1, SZ=0, RM=0, FO=NA, GP=0, BC=0))",
    "((i: DH=1, SZ=0, RM=NA, FO=0, GP=0, BC=0))",
    "((i: DH=1, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=0, RM=1, FO=0, GP=0, BC=1))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=1, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=1))",
    "((i: DH=0, SZ=NA, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=0, RM=0, FO=0, GP=0, BC=0))"
  ),
  stringsAsFactors = FALSE
)
我可以用 `str_extract` 提取这些数字,但无法使用 `tidyr` 的函数 `separate` 将这些数字拆分成多个数字列:
library(stringr)
library(tidyr)
library(dplyr)
# Attempt from the question: extract the value after each "=" and try to
# split the result into one column per key. `str_extract_all()` returns a
# list column here, and `separate()` does not split it as intended (see the
# printed result that follows).
df %>%
  # "\\d" is the regex digit class; the backslash must be doubled inside an
  # R string literal, otherwise the code does not parse.
  mutate(text = str_extract_all(text, "(?<==)\\d|NA(?=,)")) %>%
  separate(
    col = text,
    into = c("DH", "SZ", "RM", "FO", "GP", "BC"),
    sep = ", ",
    convert = TRUE
  )
DH SZ RM FO GP BC
1 c("0" "0" "0" "0" "0" "0")
2 c("0" "0" "0" "0" "0" "0")
3 c("1" "0" "0" "NA" "0" "0")
4 c("1" "0" "NA" "0" "0" "0")
5 c("1" "1" "0" "0" "0" "0")
6 c("0" "1" "0" "0" "0" "0")
7 c("0" "0" "1" "0" "0" "1")
8 c("0" "1" "0" "0" "0" "0")
9 c("0" "1" "0" "0" "0" "0")
10 c("0" "1" "0" "1" "0" "0")
11 c("0" "1" "0" "0" "0" "1")
12 c("0" "NA" "0" "0" "0" "0")
13 c("0" "0" "0" "0" "0" "0")
我的直觉是,这是因为 `str_extract_all` 返回的是列表,而 `separate` 无法处理列表。这里的解决方案是什么?
这正是 `simplify` 参数的用途:将其设为 TRUE 后,输出会被强制转换为矩阵。
# With `simplify = TRUE`, `str_extract_all()` returns a character matrix
# (one row per input string) instead of a list, so `text` becomes a matrix
# column.
# "\\d" is the regex digit class; the backslash must be doubled inside an
# R string literal. `TRUE` is spelled out because `T` can be reassigned.
df %>%
  mutate(text = str_extract_all(text, "(?<==)\\d|NA(?=,)", simplify = TRUE))
text.1 text.2 text.3 text.4 text.5 text.6
1 0 0 0 0 0 0
2 0 0 0 0 0 0
3 1 0 0 NA 0 0
4 1 0 NA 0 0 0
5 1 1 0 0 0 0
6 0 1 0 0 0 0
7 0 0 1 0 0 1
8 0 1 0 0 0 0
9 0 1 0 0 0 0
10 0 1 0 1 0 0
11 0 1 0 0 0 1
12 0 NA 0 0 0 0
13 0 0 0 0 0 0
您可以使用 `unnest_wider` 代替 `separate`,然后再配合一些额外的 tidyverse 技巧。注意:您会收到警告,因为强制类型转换引入了 NA,但在您的情况下这是预期的。
# Spread the extracted values into one column each, convert them to numeric,
# then give the columns their real names.
# NOTE(review): recent tidyr releases may require `names_sep` when unnesting
# unnamed vectors -- confirm against the installed version.
df %>%
  # "\\d" is the regex digit class; the backslash must be doubled inside an
  # R string literal.
  mutate(text = str_extract_all(text, "(?<==)\\d|NA(?=,)")) %>%
  unnest_wider(text) %>%
  # The unnested columns are selected by their auto-generated "..." prefix.
  # Converting the string "NA" to numeric yields NA with a coercion warning,
  # which is expected here.
  mutate(across(starts_with("..."), as.numeric)) %>%
  rename_with(
    .cols = starts_with("..."),
    .fn = ~ c("DH", "SZ", "RM", "FO", "GP", "BC")
  )
给出:
# A tibble: 13 x 6
DH SZ RM FO GP BC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0
2 0 0 0 0 0 0
3 1 0 0 NA 0 0
4 1 0 NA 0 0 0
5 1 1 0 0 0 0
6 0 1 0 0 0 0
7 0 0 1 0 0 1
8 0 1 0 0 0 0
9 0 1 0 0 0 0
10 0 1 0 1 0 0
11 0 1 0 0 0 1
12 0 NA 0 0 0 0
13 0 0 0 0 0 0
另一种方法(在重命名列方面)是直接在列表中设置名称属性,这解决了所有重命名问题:
library(tidyverse)
# Name each extracted vector before unnesting, so `unnest_wider()` creates
# the final column names directly and no renaming step is needed.
df %>%
  mutate(
    # "\\d" is the regex digit class; the backslash must be doubled inside
    # an R string literal.
    text = str_extract_all(text, "(?<==)\\d|NA(?=,)"),
    text = map(text, setNames, c("DH", "SZ", "RM", "FO", "GP", "BC"))
  ) %>%
  unnest_wider(text) %>%
  # The string "NA" becomes a numeric NA (with a coercion warning).
  mutate(across(c("DH", "SZ", "RM", "FO", "GP", "BC"), as.numeric))
我有一个文本列,我想将其数字分成几列:
# Example data: a single character column `text`; each entry encodes six
# key=value pairs (DH, SZ, RM, FO, GP, BC) whose values are a digit or NA.
df <- data.frame(
  text = c(
    "((i: DH=0, SZ=0, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=0, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=1, SZ=0, RM=0, FO=NA, GP=0, BC=0))",
    "((i: DH=1, SZ=0, RM=NA, FO=0, GP=0, BC=0))",
    "((i: DH=1, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=0, RM=1, FO=0, GP=0, BC=1))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=1, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=1))",
    "((i: DH=0, SZ=NA, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=0, RM=0, FO=0, GP=0, BC=0))"
  ),
  stringsAsFactors = FALSE
)
我可以用 `str_extract` 提取这些数字,但无法使用 `tidyr` 的函数 `separate` 将它们拆分成多列:
library(stringr)
library(tidyr)
library(dplyr)
# Attempt from the question: extract the value after each "=" and try to
# split the result into one column per key. `str_extract_all()` returns a
# list column here, and `separate()` does not split it as intended (see the
# printed result that follows).
df %>%
  # "\\d" is the regex digit class; the backslash must be doubled inside an
  # R string literal, otherwise the code does not parse.
  mutate(text = str_extract_all(text, "(?<==)\\d|NA(?=,)")) %>%
  separate(
    col = text,
    into = c("DH", "SZ", "RM", "FO", "GP", "BC"),
    sep = ", ",
    convert = TRUE
  )
DH SZ RM FO GP BC
1 c("0" "0" "0" "0" "0" "0")
2 c("0" "0" "0" "0" "0" "0")
3 c("1" "0" "0" "NA" "0" "0")
4 c("1" "0" "NA" "0" "0" "0")
5 c("1" "1" "0" "0" "0" "0")
6 c("0" "1" "0" "0" "0" "0")
7 c("0" "0" "1" "0" "0" "1")
8 c("0" "1" "0" "0" "0" "0")
9 c("0" "1" "0" "0" "0" "0")
10 c("0" "1" "0" "1" "0" "0")
11 c("0" "1" "0" "0" "0" "1")
12 c("0" "NA" "0" "0" "0" "0")
13 c("0" "0" "0" "0" "0" "0")
我的直觉是,这是因为 `str_extract_all` 返回的是列表,而 `separate` 无法处理列表。这里的解决方案是什么?
这正是 `simplify` 参数的用途:将其设为 TRUE 后,输出会被强制转换为矩阵。
# With `simplify = TRUE`, `str_extract_all()` returns a character matrix
# (one row per input string) instead of a list, so `text` becomes a matrix
# column.
# "\\d" is the regex digit class; the backslash must be doubled inside an
# R string literal. `TRUE` is spelled out because `T` can be reassigned.
df %>%
  mutate(text = str_extract_all(text, "(?<==)\\d|NA(?=,)", simplify = TRUE))
text.1 text.2 text.3 text.4 text.5 text.6
1 0 0 0 0 0 0
2 0 0 0 0 0 0
3 1 0 0 NA 0 0
4 1 0 NA 0 0 0
5 1 1 0 0 0 0
6 0 1 0 0 0 0
7 0 0 1 0 0 1
8 0 1 0 0 0 0
9 0 1 0 0 0 0
10 0 1 0 1 0 0
11 0 1 0 0 0 1
12 0 NA 0 0 0 0
13 0 0 0 0 0 0
您可以使用 `unnest_wider` 代替 `separate`,然后再配合一些额外的 tidyverse 技巧。注意:您会收到警告,因为强制类型转换引入了 NA,但在您的情况下这是预期的。
# Spread the extracted values into one column each, convert them to numeric,
# then give the columns their real names.
# NOTE(review): recent tidyr releases may require `names_sep` when unnesting
# unnamed vectors -- confirm against the installed version.
df %>%
  # "\\d" is the regex digit class; the backslash must be doubled inside an
  # R string literal.
  mutate(text = str_extract_all(text, "(?<==)\\d|NA(?=,)")) %>%
  unnest_wider(text) %>%
  # The unnested columns are selected by their auto-generated "..." prefix.
  # Converting the string "NA" to numeric yields NA with a coercion warning,
  # which is expected here.
  mutate(across(starts_with("..."), as.numeric)) %>%
  rename_with(
    .cols = starts_with("..."),
    .fn = ~ c("DH", "SZ", "RM", "FO", "GP", "BC")
  )
给出:
# A tibble: 13 x 6
DH SZ RM FO GP BC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0
2 0 0 0 0 0 0
3 1 0 0 NA 0 0
4 1 0 NA 0 0 0
5 1 1 0 0 0 0
6 0 1 0 0 0 0
7 0 0 1 0 0 1
8 0 1 0 0 0 0
9 0 1 0 0 0 0
10 0 1 0 1 0 0
11 0 1 0 0 0 1
12 0 NA 0 0 0 0
13 0 0 0 0 0 0
另一种方法(在重命名列方面)是直接在列表中设置名称属性,这解决了所有重命名问题:
library(tidyverse)
# Name each extracted vector before unnesting, so `unnest_wider()` creates
# the final column names directly and no renaming step is needed.
df %>%
  mutate(
    # "\\d" is the regex digit class; the backslash must be doubled inside
    # an R string literal.
    text = str_extract_all(text, "(?<==)\\d|NA(?=,)"),
    text = map(text, setNames, c("DH", "SZ", "RM", "FO", "GP", "BC"))
  ) %>%
  unnest_wider(text) %>%
  # The string "NA" becomes a numeric NA (with a coercion warning).
  mutate(across(c("DH", "SZ", "RM", "FO", "GP", "BC"), as.numeric))