在 R 中按查找 table 中的值拆分列

Question

我有一个 table，其中每个 hpo_term 占一行，因此同一个患者（同一个 ID）可能对应很多行。

ID hpo_term
123 kidney failure
123 hand tremor
123 kidney transplant
432 hypertension
432 exotropia
432 scissor gait

我还有另外两个 tables，一个是肾脏术语，另一个是非肾脏术语，肾脏的看起来像这样：

kidney failure
kidney transplant
hypertension

非肾脏的长这样：

hand tremor
exotropia
scissor gait

我想要的结果是这样一张 table：

ID kidney_hpo_term                   non_kidney_hpo_term
123 kidney failure;kidney transplant hand tremor
432 hypertension                     exotropia;scissor gait

现实中有成百上千的患者和成百上千的 HPO 术语。

我可以使用基础 R 和 dplyr，但我真的不知道该如何解决这个问题。

非常感谢您的帮助。

非常感谢

编辑：

真实的 table1 还有更多与此无关的额外列，这些列的值对同一个 ID 始终相同，我也希望把它们一并保留到结果中。例如：

 ID hpo_term              year_of_birth  affected_relative   genome
    123 kidney failure    2000               Y                38
    123 hand tremor       2000               Y                38
    123 kidney transplant 2000               Y                38
    432 hypertension      1980               N                37
    432 exotropia         1980               N                37
    432 scissor gait      1980               N                37

Answer 1

这是一个 dplyr 解决方案：

library(dplyr)
library(tidyr)  # pivot_wider() lives in tidyr; dplyr alone does not provide it

# Example input: one row per (patient ID, HPO term) pair.
table1 <- data.frame(
  ID = c(123, 123, 123, 432, 432, 432),
  hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  ),
  stringsAsFactors = FALSE
)

# Lookup vectors: which terms count as kidney-related and which do not.
kid_terms <- c("kidney failure", "kidney transplant", "hypertension")
nonkid_terms <- c("hand tremor", "exotropia", "scissor gait")

# Tag every row with the name of the output column it belongs to.
# Terms found in neither lookup keep NA as their type.
table1$term_type <- NA_character_
table1$term_type[table1$hpo_term %in% kid_terms] <- "kidney_hpo_term"
table1$term_type[table1$hpo_term %in% nonkid_terms] <- "non_kidney_hpo_term"

# Collapse the terms of each (ID, type) pair into one ";"-separated string,
# then spread the two types out into their own columns (one row per ID).
table2 <- table1 %>%
  group_by(ID, term_type) %>%
  summarize(term_list = paste(hpo_term, collapse = ";"), .groups = "drop") %>%
  pivot_wider(names_from = term_type, values_from = term_list)

> table2
    ID kidney_hpo_term                  non_kidney_hpo_term   
1   123 kidney failure;kidney transplant hand tremor           
2   432 hypertension                     exotropia;scissor gait

这是一个 data.table 解决方案：

library(data.table)

# Example input: one row per (patient ID, HPO term) pair.
table1 <- data.table(
  ID = c(123, 123, 123, 432, 432, 432),
  hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  )
)

# Lookup vectors for the two term categories.
kid_terms <- c("kidney failure", "kidney transplant", "hypertension")
nonkid_terms <- c("hand tremor", "exotropia", "scissor gait")

# Tag each row with its target column name; unmatched terms stay NA.
table1[, term_type := NA_character_]
table1[hpo_term %in% kid_terms, term_type := "kidney_hpo_term"]
table1[hpo_term %in% nonkid_terms, term_type := "non_kidney_hpo_term"]

# One ";"-joined string per (ID, term_type) pair.
table2 <- table1[
  ,
  list(term_list = paste(hpo_term, collapse = ";")),
  by = c("ID", "term_type")
]

# Long -> wide: one row per ID, one column per term type.
table3 <- dcast(table2, ID ~ term_type, value.var = "term_list")

> table3
    ID                  kidney_hpo_term    non_kidney_hpo_term
1: 123 kidney failure;kidney transplant            hand tremor
2: 432                     hypertension exotropia;scissor gait

Answer 2

library(dplyr)
library(tidyr)

# Attach the term category to every patient row; the join key is the
# column the two tables have in common (dplyr reports it in a message).
labelled <- left_join(patients, terms)

# Collapse the terms of each (ID, type) pair into one ", "-separated string.
collapsed <- summarize(
  group_by(labelled, ID, type),
  ID.hpo_term = paste(ID.hpo_term, collapse = ", "),
  .groups = "drop"
)

# One row per ID, one column per term type.
tidyr::pivot_wider(collapsed, names_from = type, values_from = ID.hpo_term)

结果

Joining, by = "ID.hpo_term"
# A tibble: 2 x 3
     ID kidney_hpo_term                   non_kidney_hpo_term    
  <dbl> <chr>                             <chr>                  
1   123 kidney failure, kidney transplant hand tremor            
2   432 hypertension                      exotropia, scissor gait

输入数据

# Example patient table: one row per (ID, HPO term) pair.
patients <- data.frame(
  ID = rep(c(123, 432), each = 3),
  ID.hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  ),
  stringsAsFactors = FALSE
)


# Lookup table mapping each HPO term to the output column it belongs to.
# The first three terms are kidney-related, the last three are not.
terms <- data.frame(
  stringsAsFactors = FALSE,
  type = rep(c("kidney_hpo_term", "non_kidney_hpo_term"), each = 3),
  ID.hpo_term = c(
    "kidney failure", "kidney transplant", "hypertension",
    "hand tremor", "exotropia", "scissor gait"
  )
)

Answer 3

这里有一种不同的方法：利用 tidyr::pivot_wider 的 values_fn 参数在转换为宽表的同时完成汇总，而不是先单独汇总：

library(dplyr)
library(tidyr)

# Label each row with its output column name *before* pivoting, so the
# column names come from the data itself rather than from an assumed
# TRUE/FALSE column order. names_sort = TRUE keeps "Kidney" ahead of
# "Non.kidney" no matter which kind of term appears first in the input.
pt.data %>%
  mutate(
    kidney = if_else(hpo_term %in% kidney.hpo, "Kidney", "Non.kidney")
  ) %>%
  pivot_wider(
    names_from = kidney,
    values_from = hpo_term,
    # Summarise while widening: join all terms of a cell with ";".
    values_fn = function(x) paste(x, collapse = ";"),
    values_fill = NA,
    names_sort = TRUE
  )
## A tibble: 2 x 3
#     ID Kidney                           Non.kidney            
#  <int> <chr>                            <chr>                 
#1   123 kidney failure;kidney transplant hand tremor           
#2   432 hypertension                     exotropia;scissor gait

数据：

# Example patient table: one row per (ID, HPO term) pair, integer IDs.
pt.data <- data.frame(
  ID = rep(c(123L, 432L), each = 3),
  hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  ),
  stringsAsFactors = FALSE
)

# Terms that count as kidney-related; every other term is non-kidney.
kidney.hpo <- c("kidney failure", "kidney transplant", "hypertension")

在 R 中按查找 table 中的值拆分列

Splitting columns by values in a lookup table in R

lookup

merge

r

data.table