在 R 中按查找 table 中的值拆分列

Splitting columns by values in a lookup table in R

我有一个 table,其中每个 hpo_term 占一行,因此同一个患者(同一个 ID)可能对应多行。

ID hpo_term
123 kidney failure
123 hand tremor
123 kidney transplant
432 hypertension
432 exotropia
432 scissor gait

我还有另外两个 tables,一个是肾脏术语,另一个是非肾脏术语,肾脏的看起来像这样:

kidney failure
kidney transplant
hypertension

非肾脏的长这样:

hand tremor
exotropia
scissor gait

我想要得到这样的结果 table:

ID kidney_hpo_term                   non_kidney_hpo_term
123 kidney failure;kidney transplant hand tremor
432 hypertension                     exotropia;scissor gait

现实中有成百上千的患者和成百上千的 HPO 术语。

我可以使用基础 R 和 dplyr,但我真的不知道如何解决这个问题。

非常感谢您的帮助。

非常感谢

编辑:

真实的 table1 还有更多不相关的额外列,这些列的值对同一个 ID 始终相同,我也想把它们一并保留到结果中。例如:

 ID hpo_term              year_of_birth  affected_relative   genome
    123 kidney failure    2000               Y                38
    123 hand tremor       2000               Y                38
    123 kidney transplant 2000               Y                38
    432 hypertension      1980               N                37
    432 exotropia         1980               N                37
    432 scissor gait      1980               N                37

这是一个 dplyr 解决方案:

library(dplyr)
library(tidyr) # pivot_wider() is a tidyr function; dplyr alone does not provide it

# Long-format input: one row per (patient ID, HPO term).
table1 <- data.frame(
  ID = c(123, 123, 123, 432, 432, 432),
  hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  ),
  stringsAsFactors = FALSE
)

# Lookup vectors: which HPO terms are kidney-related and which are not.
kid_terms <- c("kidney failure", "kidney transplant", "hypertension")
nonkid_terms <- c("hand tremor", "exotropia", "scissor gait")

# Label every row with the name of the output column it should end up in.
# Terms found in neither lookup vector keep an NA label.
table1$term_type <- NA_character_
table1$term_type[table1$hpo_term %in% kid_terms] <- "kidney_hpo_term"
table1$term_type[table1$hpo_term %in% nonkid_terms] <- "non_kidney_hpo_term"

# Collapse the terms of each (ID, term_type) pair into one ";"-separated
# string, then spread the two term types into their own columns.
table2 <- table1 %>%
  group_by(ID, term_type) %>%
  summarize(term_list = paste(hpo_term, collapse = ";"), .groups = "drop") %>%
  pivot_wider(names_from = term_type, values_from = term_list)

> table2
    ID kidney_hpo_term                  non_kidney_hpo_term   
1   123 kidney failure;kidney transplant hand tremor           
2   432 hypertension                     exotropia;scissor gait

这是一个 data.table 解决方案:

library(data.table)

# Long-format input: one row per (patient ID, HPO term).
table1 <- data.table(
  ID = c(123, 123, 123, 432, 432, 432),
  hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  )
)

# Lookup vectors: which HPO terms are kidney-related and which are not.
kid_terms <- c("kidney failure", "kidney transplant", "hypertension")
nonkid_terms <- c("hand tremor", "exotropia", "scissor gait")

# Label every row with the name of its target column. `:=` adds/updates the
# column by reference instead of going through `$<-`, which copies the table.
# Terms found in neither lookup vector keep an NA label.
table1[, term_type := NA_character_]
table1[hpo_term %in% kid_terms, term_type := "kidney_hpo_term"]
table1[hpo_term %in% nonkid_terms, term_type := "non_kidney_hpo_term"]

# One ";"-separated string of terms per (ID, term_type) pair.
table2 <- table1[,
  .(term_list = paste(hpo_term, collapse = ";")),
  by = .(ID, term_type)
]

# Long -> wide: one row per ID, one column per term type.
table3 <- dcast(table2, ID ~ term_type, value.var = "term_list")

> table3
    ID                  kidney_hpo_term    non_kidney_hpo_term
1: 123 kidney failure;kidney transplant            hand tremor
2: 432                     hypertension exotropia;scissor gait
library(dplyr)
library(tidyr)

# Attach the term type to every patient row via the lookup table, collapse
# the terms of each (ID, type) pair into one comma-separated string, and
# finally give each term type its own column.
# The join key is deliberately left implicit, so dplyr reports the shared
# column it joined on.
patients %>%
  left_join(terms) %>%
  group_by(ID, type) %>%
  summarise(
    ID.hpo_term = paste(ID.hpo_term, collapse = ", "),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = type, values_from = ID.hpo_term)

结果

Joining, by = "ID.hpo_term"
# A tibble: 2 x 3
     ID kidney_hpo_term                   non_kidney_hpo_term    
  <dbl> <chr>                             <chr>                  
1   123 kidney failure, kidney transplant hand tremor            
2   432 hypertension                      exotropia, scissor gait

输入数据

# Example input in long format: one row per (patient ID, HPO term).
patients <- data.frame(
  ID = rep(c(123, 432), each = 3),
  ID.hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  ),
  stringsAsFactors = FALSE
)


# Lookup table mapping each HPO term to the output column ("type") it
# belongs to: the first three terms are kidney terms, the last three are not.
terms <- data.frame(
  stringsAsFactors = FALSE,
  type = rep(c("kidney_hpo_term", "non_kidney_hpo_term"), each = 3),
  ID.hpo_term = c(
    "kidney failure", "kidney transplant", "hypertension",
    "hand tremor", "exotropia", "scissor gait"
  )
)

这里有另一种方法:利用 tidyr::pivot_wider 的 values_fn 参数,在转换为宽表的同时完成汇总,而不必事先单独汇总:

library(dplyr)
library(tidyr)

# Label each row with the name of its destination column *before* pivoting.
# Renaming the pivoted columns by position afterwards would silently swap
# the labels whenever the first row of the data is a non-kidney term, and
# would fail when one of the two groups is absent.
pt.data %>%
  mutate(
    kidney = if_else(hpo_term %in% kidney.hpo, "Kidney", "Non.kidney")
  ) %>%
  pivot_wider(
    names_from = kidney,
    values_from = hpo_term,
    # Collapse all terms that land in the same cell into one string.
    values_fn = function(x) paste(x, collapse = ";"),
    values_fill = NA,
    # Keep a stable column order (Kidney, Non.kidney) regardless of row order.
    names_sort = TRUE
  )
## A tibble: 2 x 3
#     ID Kidney                           Non.kidney            
#  <int> <chr>                            <chr>                 
#1   123 kidney failure;kidney transplant hand tremor           
#2   432 hypertension                     exotropia;scissor gait

数据:

# Example input in long format: one row per (patient ID, HPO term).
pt.data <- data.frame(
  ID = rep(c(123L, 432L), each = 3L),
  hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  ),
  stringsAsFactors = FALSE
)
# HPO terms that count as kidney-related.
kidney.hpo <- c("kidney failure", "kidney transplant", "hypertension")