取消列出数据框列并将它们粘贴在一起
Unlist data frame column and pasting them together
我有一个定义如下的数据框:
# Example input: 19 rows, each with an ID and a free-text MEDICATION entry
# (includes a "0" entry and one missing value).
df <- structure(
  list(
    ID = 1:19,
    MEDICATION = c(
      "0",
      "NOVOMIX 26 BF, 20 D",
      "NOVOMIX 14 D",
      "NOVOMIX 34 BF 22 D",
      "MIXTARD 52 BF 20 D",
      "MIXTARD 40 BF 24 D",
      "MIXTARD 10 BF 8 D",
      "MIXTARD 42 BF 24 D",
      "MIXTARD 20 BF 18 D",
      "MIXTARD 82 BF 46 D",
      "MIXTARD 14 BF 10 D",
      "NOVOMIX 15 BF 15 D",
      "MIXTARD",
      NA,
      "MIXTARD 10 BF 4 D",
      "NOVOMIX",
      "MIXTARD --> NOVOMIX",
      "NOT GIVEN ANY DIABETES MEDICATION INPATIENT PATIENT NORMALLY ON METFORMIN",
      "GIVEN ASPART"
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -19L),
  .Names = c("ID", "MEDICATION")
)
我想从数据框的 MEDICATION 变量中提取所有药物(即 NOVOMIX、MIXTARD、METFORMIN、ASPART),并把它们粘贴在一起。我的代码是这样写的:
library(tidyverse)
library(rebus)
# Extract every listed drug name from MEDICATION, expand the matches to one
# row each, then collapse them per ID into a single " - "-separated string
# and keep the first row of every ID.
# NOTE(review): rows whose match list is empty (e.g. MEDICATION == "0") appear
# to be lost at the unnest() step -- this is the problem the question describes.
df %>%
  mutate(
    MEDICATION2 = str_extract_all(
      MEDICATION,
      pattern = or1(c("NOVOMIX", "MIXTARD", "METFORMIN", "ASPART"))
    )
  ) %>%
  unnest(MEDICATION2) %>%
  group_by(ID) %>%
  mutate(
    MEDICATION2 = str_c(unlist(MEDICATION2), collapse = " - ")
  ) %>%
  slice(1)
我的预期输出是:
# Desired result: the input columns plus MEDICATION2, which holds the drug
# names found in MEDICATION joined by " - " (NA where none is found).
df_out <- structure(
  list(
    ID = as.numeric(1:19),
    MEDICATION = c(
      "0",
      "NOVOMIX 26 BF, 20 D",
      "NOVOMIX 14 D",
      "NOVOMIX 34 BF 22 D",
      "MIXTARD 52 BF 20 D",
      "MIXTARD 40 BF 24 D",
      "MIXTARD 10 BF 8 D",
      "MIXTARD 42 BF 24 D",
      "MIXTARD 20 BF 18 D",
      "MIXTARD 82 BF 46 D",
      "MIXTARD 14 BF 10 D",
      "NOVOMIX 15 BF 15 D",
      "MIXTARD",
      NA,
      "MIXTARD 10 BF 4 D",
      "NOVOMIX",
      "MIXTARD --> NOVOMIX",
      "NOT GIVEN ANY DIABETES MEDICATION INPATIENT PATIENT NORMALLY ON METFORMIN",
      "GIVEN ASPART"
    ),
    MEDICATION2 = c(
      NA,
      rep("NOVOMIX", 3),
      rep("MIXTARD", 7),
      "NOVOMIX",
      "MIXTARD",
      NA,
      "MIXTARD",
      "NOVOMIX",
      "MIXTARD - NOVOMIX",
      "METFORMIN",
      "ASPART"
    )
  ),
  .Names = c("ID", "MEDICATION", "MEDICATION2"),
  row.names = c(NA, -19L),
  class = "data.frame"
)
问题是代码删除了带有 MEDICATION == 0
的行,我认为我的代码对于简单的字符串提取来说太长了。如果您知道如何缩短此代码(如果可能),我想寻求帮助。
我们可以使用 stringi
包中的 stri_extract_all_regex
来提取所有匹配模式的单词。
library(stringi)
# Alternation regex that matches any of the drug names of interest.
med_pattern <- "NOVOMIX|MIXTARD|METFORMIN|ASPART"
# Store all matches per row; the result is a list, so MEDICATION2 becomes a
# list column at this point.
df$MEDICATION2 <- stri_extract_all_regex(
  df$MEDICATION,
  pattern = med_pattern
)
如 @mt1022 所述,新列是一个列表。我们可以用 paste 把它们粘贴在一起:
# Naive attempt: paste() coerces the list with as.character(), so an element
# holding several matches comes out as deparsed text such as
# c("MIXTARD", "NOVOMIX") rather than a joined string.
df$MEDICATION2 <- paste(
  stri_extract_all_regex(df$MEDICATION, pattern = med_pattern)
)
但是,对于包含 1 个以上元素的列表项,它会产生一些不需要的字符。下面的代码应该会给你预期的输出。
# All drug-name matches per row, as a list of character vectors.
# stri_extract_all_regex() yields a single NA for rows without a match and
# for missing input.
chars <- stri_extract_all_regex(df$MEDICATION, pattern = med_pattern)
# Collapse each row's matches into one string. vapply() guarantees a character
# vector result; rows without any match stay a real NA (not the string "NA"),
# and " - " is the separator used in the expected output.
df$MEDICATION2 <- vapply(
  chars,
  function(x) {
    if (all(is.na(x))) {
      NA_character_
    } else {
      paste(x, collapse = " - ")
    }
  },
  character(1)
)
df$MEDICATION2
# Printed result (console line wrapping may differ):
#[1] NA "NOVOMIX" "NOVOMIX" "NOVOMIX"
#[5] "MIXTARD" "MIXTARD" "MIXTARD" "MIXTARD"
#[9] "MIXTARD" "MIXTARD" "MIXTARD" "NOVOMIX"
#[13] "MIXTARD" NA "MIXTARD" "NOVOMIX"
#[17] "MIXTARD - NOVOMIX" "METFORMIN" "ASPART"
您也可以在一行中执行此操作:
# Same extraction and collapse in a single statement: rows without any match
# stay a real NA (not the string "NA"), multiple matches are joined with
# " - ", and vapply() guarantees a character vector result.
df$MEDICATION2 <- vapply(
  stri_extract_all_regex(df$MEDICATION, pattern = med_pattern),
  function(x) {
    if (all(is.na(x))) NA_character_ else paste(x, collapse = " - ")
  },
  character(1)
)
我有一个定义如下的数据框:
# Example input: 19 rows, each with an ID and a free-text MEDICATION entry
# (includes a "0" entry and one missing value).
df <- structure(
  list(
    ID = 1:19,
    MEDICATION = c(
      "0",
      "NOVOMIX 26 BF, 20 D",
      "NOVOMIX 14 D",
      "NOVOMIX 34 BF 22 D",
      "MIXTARD 52 BF 20 D",
      "MIXTARD 40 BF 24 D",
      "MIXTARD 10 BF 8 D",
      "MIXTARD 42 BF 24 D",
      "MIXTARD 20 BF 18 D",
      "MIXTARD 82 BF 46 D",
      "MIXTARD 14 BF 10 D",
      "NOVOMIX 15 BF 15 D",
      "MIXTARD",
      NA,
      "MIXTARD 10 BF 4 D",
      "NOVOMIX",
      "MIXTARD --> NOVOMIX",
      "NOT GIVEN ANY DIABETES MEDICATION INPATIENT PATIENT NORMALLY ON METFORMIN",
      "GIVEN ASPART"
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -19L),
  .Names = c("ID", "MEDICATION")
)
我想从数据框的 MEDICATION 变量中提取所有药物(即 NOVOMIX、MIXTARD、METFORMIN、ASPART),并把它们粘贴在一起。我的代码是这样写的:
library(tidyverse)
library(rebus)
# Extract every listed drug name from MEDICATION, expand the matches to one
# row each, then collapse them per ID into a single " - "-separated string
# and keep the first row of every ID.
# NOTE(review): rows whose match list is empty (e.g. MEDICATION == "0") appear
# to be lost at the unnest() step -- this is the problem the question describes.
df %>%
  mutate(
    MEDICATION2 = str_extract_all(
      MEDICATION,
      pattern = or1(c("NOVOMIX", "MIXTARD", "METFORMIN", "ASPART"))
    )
  ) %>%
  unnest(MEDICATION2) %>%
  group_by(ID) %>%
  mutate(
    MEDICATION2 = str_c(unlist(MEDICATION2), collapse = " - ")
  ) %>%
  slice(1)
我的预期输出是:
# Desired result: the input columns plus MEDICATION2, which holds the drug
# names found in MEDICATION joined by " - " (NA where none is found).
df_out <- structure(
  list(
    ID = as.numeric(1:19),
    MEDICATION = c(
      "0",
      "NOVOMIX 26 BF, 20 D",
      "NOVOMIX 14 D",
      "NOVOMIX 34 BF 22 D",
      "MIXTARD 52 BF 20 D",
      "MIXTARD 40 BF 24 D",
      "MIXTARD 10 BF 8 D",
      "MIXTARD 42 BF 24 D",
      "MIXTARD 20 BF 18 D",
      "MIXTARD 82 BF 46 D",
      "MIXTARD 14 BF 10 D",
      "NOVOMIX 15 BF 15 D",
      "MIXTARD",
      NA,
      "MIXTARD 10 BF 4 D",
      "NOVOMIX",
      "MIXTARD --> NOVOMIX",
      "NOT GIVEN ANY DIABETES MEDICATION INPATIENT PATIENT NORMALLY ON METFORMIN",
      "GIVEN ASPART"
    ),
    MEDICATION2 = c(
      NA,
      rep("NOVOMIX", 3),
      rep("MIXTARD", 7),
      "NOVOMIX",
      "MIXTARD",
      NA,
      "MIXTARD",
      "NOVOMIX",
      "MIXTARD - NOVOMIX",
      "METFORMIN",
      "ASPART"
    )
  ),
  .Names = c("ID", "MEDICATION", "MEDICATION2"),
  row.names = c(NA, -19L),
  class = "data.frame"
)
问题是代码删除了带有 MEDICATION == 0
的行,我认为我的代码对于简单的字符串提取来说太长了。如果您知道如何缩短此代码(如果可能),我想寻求帮助。
我们可以使用 stringi
包中的 stri_extract_all_regex
来提取所有匹配模式的单词。
library(stringi)
# Alternation regex that matches any of the drug names of interest.
med_pattern <- "NOVOMIX|MIXTARD|METFORMIN|ASPART"
# Store all matches per row; the result is a list, so MEDICATION2 becomes a
# list column at this point.
df$MEDICATION2 <- stri_extract_all_regex(
  df$MEDICATION,
  pattern = med_pattern
)
如 @mt1022 所述,新列是一个列表。我们可以用 paste 把它们粘贴在一起:
# Naive attempt: paste() coerces the list with as.character(), so an element
# holding several matches comes out as deparsed text such as
# c("MIXTARD", "NOVOMIX") rather than a joined string.
df$MEDICATION2 <- paste(
  stri_extract_all_regex(df$MEDICATION, pattern = med_pattern)
)
但是,对于包含 1 个以上元素的列表项,它会产生一些不需要的字符。下面的代码应该会给你预期的输出。
# All drug-name matches per row, as a list of character vectors.
# stri_extract_all_regex() yields a single NA for rows without a match and
# for missing input.
chars <- stri_extract_all_regex(df$MEDICATION, pattern = med_pattern)
# Collapse each row's matches into one string. vapply() guarantees a character
# vector result; rows without any match stay a real NA (not the string "NA"),
# and " - " is the separator used in the expected output.
df$MEDICATION2 <- vapply(
  chars,
  function(x) {
    if (all(is.na(x))) {
      NA_character_
    } else {
      paste(x, collapse = " - ")
    }
  },
  character(1)
)
df$MEDICATION2
# Printed result (console line wrapping may differ):
#[1] NA "NOVOMIX" "NOVOMIX" "NOVOMIX"
#[5] "MIXTARD" "MIXTARD" "MIXTARD" "MIXTARD"
#[9] "MIXTARD" "MIXTARD" "MIXTARD" "NOVOMIX"
#[13] "MIXTARD" NA "MIXTARD" "NOVOMIX"
#[17] "MIXTARD - NOVOMIX" "METFORMIN" "ASPART"
您也可以在一行中执行此操作:
# Same extraction and collapse in a single statement: rows without any match
# stay a real NA (not the string "NA"), multiple matches are joined with
# " - ", and vapply() guarantees a character vector result.
df$MEDICATION2 <- vapply(
  stri_extract_all_regex(df$MEDICATION, pattern = med_pattern),
  function(x) {
    if (all(is.na(x))) NA_character_ else paste(x, collapse = " - ")
  },
  character(1)
)