在 R 中按查找 table 中的值拆分列
Splitting columns by values in a lookup table in R
我有一个 table,每个 hpo_term 占一行,因此同一个患者 ID 可能对应很多行。
ID hpo_term
123 kidney failure
123 hand tremor
123 kidney transplant
432 hypertension
432 exotropia
432 scissor gait
我还有另外两个 tables,一个是肾脏术语,另一个是非肾脏术语,肾脏的看起来像这样:
kidney failure
kidney transplant
hypertension
非肾脏的长这样:
hand tremor
exotropia
scissor gait
我想要得到这样的结果 table:
ID kidney_hpo_term non_kidney_hpo_term
123 kidney failure;kidney transplant hand tremor
432 hypertension exotropia;scissor gait
现实中有成百上千的患者和成百上千的 HPO 术语。
我可以使用基础 R 和 dplyr,但我真的不知道如何解决这个问题。
非常感谢您的帮助。
非常感谢
编辑:
实际的 table1 还包含一些与此无关的额外列,这些列的值对同一个 ID 始终相同,我也想把它们一并保留到结果中。例如:
ID hpo_term year_of_birth affected_relative genome
123 kidney failure 2000 Y 38
123 hand tremor 2000 Y 38
123 kidney transplant 2000 Y 38
432 hypertension 1980 N 37
432 exotropia 1980 N 37
432 scissor gait 1980 N 37
这是一个 dplyr 解决方案:
# dplyr/tidyr solution: label every term as kidney or non-kidney, collapse the
# terms of each (ID, term_type) pair into one ";"-separated string, then turn
# the two term types into separate columns.
library(dplyr)
library(tidyr)  # the reshaping step (pivot_wider) comes from tidyr, not dplyr

table1 <- data.frame(
  ID = c(123, 123, 123, 432, 432, 432),
  hpo_term = c("kidney failure", "hand tremor", "kidney transplant",
               "hypertension", "exotropia", "scissor gait")
)
kid_terms <- c("kidney failure", "kidney transplant", "hypertension")
nonkid_terms <- c("hand tremor", "exotropia", "scissor gait")

# Terms found in neither lookup vector keep NA as their type.
table1$term_type <- NA_character_
table1$term_type[table1$hpo_term %in% kid_terms] <- "kidney_hpo_term"
table1$term_type[table1$hpo_term %in% nonkid_terms] <- "non_kidney_hpo_term"

table2 <- table1 %>%
  group_by(ID, term_type) %>%
  # .groups = "drop" returns an ungrouped result, so no grouping leaks downstream.
  summarize(term_list = paste(hpo_term, collapse = ";"), .groups = "drop") %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = term_type, values_from = term_list)
> table2
ID kidney_hpo_term non_kidney_hpo_term
1 123 kidney failure;kidney transplant hand tremor
2 432 hypertension exotropia;scissor gait
这是一个 data.table 解决方案:
# data.table solution: label every term, collapse the terms of each
# (ID, term_type) pair into one ";"-separated string, then cast to wide format.
library(data.table)

table1 <- data.table(
  ID = c(123, 123, 123, 432, 432, 432),
  hpo_term = c("kidney failure", "hand tremor", "kidney transplant",
               "hypertension", "exotropia", "scissor gait")
)
kid_terms <- c("kidney failure", "kidney transplant", "hypertension")
nonkid_terms <- c("hand tremor", "exotropia", "scissor gait")

# `:=` adds/updates the column by reference instead of copying the table, which
# is what `table1$term_type[...] = ...` does on a data.table.
# Terms found in neither lookup vector keep NA as their type.
table1[, term_type := NA_character_]
table1[hpo_term %in% kid_terms, term_type := "kidney_hpo_term"]
table1[hpo_term %in% nonkid_terms, term_type := "non_kidney_hpo_term"]

# One row per (ID, term_type) holding the collapsed term list.
table2 <- table1[, .(term_list = paste(hpo_term, collapse = ";")),
                 by = .(ID, term_type)]
# One row per ID, one column per term type.
table3 <- dcast(table2, ID ~ term_type, value.var = "term_list")
> table3
ID kidney_hpo_term non_kidney_hpo_term
1: 123 kidney failure;kidney transplant hand tremor
2: 432 hypertension exotropia;scissor gait
library(dplyr); library(tidyr)
# Attach the term type to every patient row; the join key is left implicit, so
# dplyr reports the column it joined on.
labelled <- left_join(patients, terms)
# Collapse the terms of each (ID, type) pair into one comma-separated string.
collapsed <- summarize(
  group_by(labelled, ID, type),
  ID.hpo_term = paste(ID.hpo_term, collapse = ", "),
  .groups = "drop"
)
# One row per ID, one column per term type.
tidyr::pivot_wider(collapsed, names_from = type, values_from = ID.hpo_term)
结果
Joining, by = "ID.hpo_term"
# A tibble: 2 x 3
ID kidney_hpo_term non_kidney_hpo_term
<dbl> <chr> <chr>
1 123 kidney failure, kidney transplant hand tremor
2 432 hypertension exotropia, scissor gait
输入数据
# Example patient table: one row per (ID, term) pair.
patients <- data.frame(
  stringsAsFactors = FALSE,
  ID = c(123, 123, 123, 432, 432, 432),
  ID.hpo_term = c("kidney failure",
                  "hand tremor", "kidney transplant", "hypertension",
                  "exotropia", "scissor gait")
)
# Lookup table mapping each term to its type (first three kidney, last three
# non-kidney). The data.frame() call is now closed; it was left unterminated.
terms <- data.frame(
  stringsAsFactors = FALSE,
  type = rep(c("kidney_hpo_term", "non_kidney_hpo_term"), each = 3),
  ID.hpo_term = c("kidney failure", "kidney transplant",
                  "hypertension",
                  "hand tremor", "exotropia", "scissor gait")
)
这里有另一种方法:使用 tidyr::pivot_wider 的 values_fn 参数,在转换为宽格式的同时完成汇总,而不必单独进行汇总:
library(dplyr); library(tidyr)
pt.data %>%
  # Label each row with its final column name up front. Renaming by position
  # afterwards is fragile: pivot_wider() orders new columns by first appearance,
  # so data starting with a non-kidney term would get the labels swapped.
  mutate(kidney = if_else(hpo_term %in% kidney.hpo, "Kidney", "Non.kidney")) %>%
  # values_fn collapses all terms of one (ID, label) cell into a single string.
  pivot_wider(names_from = kidney, values_from = hpo_term,
              values_fn = function(x) paste(x, collapse = ";"),
              values_fill = NA) %>%
  # Fix the column order by name; any_of() tolerates a label that never occurs,
  # and everything() keeps any additional ID columns.
  select(ID, any_of(c("Kidney", "Non.kidney")), everything())
## A tibble: 2 x 3
# ID Kidney Non.kidney
# <int> <chr> <chr>
#1 123 kidney failure;kidney transplant hand tremor
#2 432 hypertension exotropia;scissor gait
数据:
# Example patient table: one row per (ID, hpo_term) pair, integer IDs.
pt.data <- data.frame(
  ID = rep(c(123L, 432L), each = 3L),
  hpo_term = c("kidney failure", "hand tremor", "kidney transplant",
               "hypertension", "exotropia", "scissor gait"),
  stringsAsFactors = FALSE
)
# Lookup vector of the terms that count as kidney-related.
kidney.hpo <- c("kidney failure", "kidney transplant", "hypertension")
我有一个 table,每个 hpo_term 占一行,因此同一个患者 ID 可能对应很多行。
ID hpo_term
123 kidney failure
123 hand tremor
123 kidney transplant
432 hypertension
432 exotropia
432 scissor gait
我还有另外两个 tables,一个是肾脏术语,另一个是非肾脏术语,肾脏的看起来像这样:
kidney failure
kidney transplant
hypertension
非肾脏的长这样:
hand tremor
exotropia
scissor gait
我想要得到这样的结果 table:
ID kidney_hpo_term non_kidney_hpo_term
123 kidney failure;kidney transplant hand tremor
432 hypertension exotropia;scissor gait
现实中有成百上千的患者和成百上千的 HPO 术语。
我可以使用基础 R 和 dplyr,但我真的不知道如何解决这个问题。
非常感谢您的帮助。
非常感谢
编辑:
实际的 table1 还包含一些与此无关的额外列,这些列的值对同一个 ID 始终相同,我也想把它们一并保留到结果中。例如:
ID hpo_term year_of_birth affected_relative genome
123 kidney failure 2000 Y 38
123 hand tremor 2000 Y 38
123 kidney transplant 2000 Y 38
432 hypertension 1980 N 37
432 exotropia 1980 N 37
432 scissor gait 1980 N 37
这是一个 dplyr 解决方案:
# dplyr/tidyr solution: label every term as kidney or non-kidney, collapse the
# terms of each (ID, term_type) pair into one ";"-separated string, then turn
# the two term types into separate columns.
library(dplyr)
library(tidyr)  # the reshaping step (pivot_wider) comes from tidyr, not dplyr

table1 <- data.frame(
  ID = c(123, 123, 123, 432, 432, 432),
  hpo_term = c("kidney failure", "hand tremor", "kidney transplant",
               "hypertension", "exotropia", "scissor gait")
)
kid_terms <- c("kidney failure", "kidney transplant", "hypertension")
nonkid_terms <- c("hand tremor", "exotropia", "scissor gait")

# Terms found in neither lookup vector keep NA as their type.
table1$term_type <- NA_character_
table1$term_type[table1$hpo_term %in% kid_terms] <- "kidney_hpo_term"
table1$term_type[table1$hpo_term %in% nonkid_terms] <- "non_kidney_hpo_term"

table2 <- table1 %>%
  group_by(ID, term_type) %>%
  # .groups = "drop" returns an ungrouped result, so no grouping leaks downstream.
  summarize(term_list = paste(hpo_term, collapse = ";"), .groups = "drop") %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = term_type, values_from = term_list)
> table2
ID kidney_hpo_term non_kidney_hpo_term
1 123 kidney failure;kidney transplant hand tremor
2 432 hypertension exotropia;scissor gait
这是一个 data.table 解决方案:
# data.table solution: label every term, collapse the terms of each
# (ID, term_type) pair into one ";"-separated string, then cast to wide format.
library(data.table)

table1 <- data.table(
  ID = c(123, 123, 123, 432, 432, 432),
  hpo_term = c("kidney failure", "hand tremor", "kidney transplant",
               "hypertension", "exotropia", "scissor gait")
)
kid_terms <- c("kidney failure", "kidney transplant", "hypertension")
nonkid_terms <- c("hand tremor", "exotropia", "scissor gait")

# `:=` adds/updates the column by reference instead of copying the table, which
# is what `table1$term_type[...] = ...` does on a data.table.
# Terms found in neither lookup vector keep NA as their type.
table1[, term_type := NA_character_]
table1[hpo_term %in% kid_terms, term_type := "kidney_hpo_term"]
table1[hpo_term %in% nonkid_terms, term_type := "non_kidney_hpo_term"]

# One row per (ID, term_type) holding the collapsed term list.
table2 <- table1[, .(term_list = paste(hpo_term, collapse = ";")),
                 by = .(ID, term_type)]
# One row per ID, one column per term type.
table3 <- dcast(table2, ID ~ term_type, value.var = "term_list")
> table3
ID kidney_hpo_term non_kidney_hpo_term
1: 123 kidney failure;kidney transplant hand tremor
2: 432 hypertension exotropia;scissor gait
library(dplyr); library(tidyr)
# Attach the term type to every patient row; the join key is left implicit, so
# dplyr reports the column it joined on.
labelled <- left_join(patients, terms)
# Collapse the terms of each (ID, type) pair into one comma-separated string.
collapsed <- summarize(
  group_by(labelled, ID, type),
  ID.hpo_term = paste(ID.hpo_term, collapse = ", "),
  .groups = "drop"
)
# One row per ID, one column per term type.
tidyr::pivot_wider(collapsed, names_from = type, values_from = ID.hpo_term)
结果
Joining, by = "ID.hpo_term"
# A tibble: 2 x 3
ID kidney_hpo_term non_kidney_hpo_term
<dbl> <chr> <chr>
1 123 kidney failure, kidney transplant hand tremor
2 432 hypertension exotropia, scissor gait
输入数据
# Example patient table: one row per (ID, term) pair.
patients <- data.frame(
  stringsAsFactors = FALSE,
  ID = c(123, 123, 123, 432, 432, 432),
  ID.hpo_term = c("kidney failure",
                  "hand tremor", "kidney transplant", "hypertension",
                  "exotropia", "scissor gait")
)
# Lookup table mapping each term to its type (first three kidney, last three
# non-kidney). The data.frame() call is now closed; it was left unterminated.
terms <- data.frame(
  stringsAsFactors = FALSE,
  type = rep(c("kidney_hpo_term", "non_kidney_hpo_term"), each = 3),
  ID.hpo_term = c("kidney failure", "kidney transplant",
                  "hypertension",
                  "hand tremor", "exotropia", "scissor gait")
)
这里有另一种方法:使用 tidyr::pivot_wider 的 values_fn 参数,在转换为宽格式的同时完成汇总,而不必单独进行汇总:
library(dplyr); library(tidyr)
pt.data %>%
  # Label each row with its final column name up front. Renaming by position
  # afterwards is fragile: pivot_wider() orders new columns by first appearance,
  # so data starting with a non-kidney term would get the labels swapped.
  mutate(kidney = if_else(hpo_term %in% kidney.hpo, "Kidney", "Non.kidney")) %>%
  # values_fn collapses all terms of one (ID, label) cell into a single string.
  pivot_wider(names_from = kidney, values_from = hpo_term,
              values_fn = function(x) paste(x, collapse = ";"),
              values_fill = NA) %>%
  # Fix the column order by name; any_of() tolerates a label that never occurs,
  # and everything() keeps any additional ID columns.
  select(ID, any_of(c("Kidney", "Non.kidney")), everything())
## A tibble: 2 x 3
# ID Kidney Non.kidney
# <int> <chr> <chr>
#1 123 kidney failure;kidney transplant hand tremor
#2 432 hypertension exotropia;scissor gait
数据:
# Example patient table: one row per (ID, hpo_term) pair, integer IDs.
pt.data <- data.frame(
  ID = rep(c(123L, 432L), each = 3L),
  hpo_term = c("kidney failure", "hand tremor", "kidney transplant",
               "hypertension", "exotropia", "scissor gait"),
  stringsAsFactors = FALSE
)
# Lookup vector of the terms that count as kidney-related.
kidney.hpo <- c("kidney failure", "kidney transplant", "hypertension")