使用 dplyr 重新编码变量

Question

我的数据集 (xs) 中，DX1 到 DX25 列存放着 ICD 9 诊断代码，其中包括肺栓塞 (PE) 的代码。我想为肺栓塞创建一个单独的列：只要 DX1 到 DX25 中任意一列出现肺栓塞的 ICD 9 代码，就编码为 1，否则编码为 0。我不想使用 for 循环，因为我的数据集有 30,000 行、大小为 7GB，循环会使我的计算机崩溃。目前我使用以下代码来实现：

# Build the pulmonary embolism indicator xs$PE.
# Start with every row set to NA; rows with a matching code are set to 1 below.
xs$PE = NA

# One statement per diagnosis column (DX1 ... DX25): rows whose code in that
# column is one of the four listed ICD-9 codes get PE = 1. `%in%` never returns
# NA, so rows with a missing code in a column are simply not selected.
xs[which(xs$DX1%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX2%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX3%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1      
xs[which(xs$DX4%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX5%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1           
xs[which(xs$DX6%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX7%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1           
xs[which(xs$DX8%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1           
xs[which(xs$DX9%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1           
xs[which(xs$DX10%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX11%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX12%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX13%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX14%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX15%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX16%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX17%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1          
xs[which(xs$DX18%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1     
xs[which(xs$DX19%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX20%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX21%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1  
xs[which(xs$DX22%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1          
xs[which(xs$DX23%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1          
xs[which(xs$DX24%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1          
xs[which(xs$DX25%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1  

# Rows still NA matched none of the codes in any DX column: recode them to 0.
xs <- xs %>%
      mutate(PE = ifelse(is.na(PE),0,PE))

谁能帮我用更聪明的方法（使用 dplyr）完成这种重新编码？[我不想输入 100 行代码，因为我还有其他诊断需要编码并创建新列]

Answer 1

当我过去做过类似的工作时，我结合使用了 tidyr 和 dplyr。

我的第一步是把数据重塑为长格式，然后在其中查找诊断代码。之后可以将数据集汇总为每个 id 一行。如果需要，可以再将结果合并回原始数据集。但是，我还没有在像您这样大的数据集上尝试过这种方法。

library(tidyr)
library(dplyr)

# Toy data: one row per id and three diagnosis-code columns. The codes are kept
# as character strings so the leading zeros are preserved.
dat <- data.frame(
  id = c(1, 2, 3),
  diagnosis1 = c("001", "001", "005"),
  diagnosis2 = rep("002", 3),
  diagnosis3 = c("003", "005", "004"),
  stringsAsFactors = FALSE
)

dat
  id diagnosis1 diagnosis2 diagnosis3
1  1        001        002        003
2  2        001        002        005
3  3        005        002        004

# Reshape to long format: every column except `id` is stacked, with the source
# column name stored in `diagnosis_code` and the code itself in `string`.
long_dat <- dat %>%
  gather(key = "diagnosis_code", value = "string", -id)
head(long_dat)
  id diagnosis_code string
1  1     diagnosis1    001
2  2     diagnosis1    001
3  3     diagnosis1    005
4  1     diagnosis2    002
5  2     diagnosis2    002
6  3     diagnosis2    002

# Flag each long-format row: 1 when its code belongs to the diagnosis of
# interest, 0 otherwise.
# `%in%` is used for both flags because it never returns NA. With `==`, a
# missing code would produce NA, and the per-id `max()` taken afterwards would
# turn the whole id's flag into NA instead of 0/1.
long_dat <- long_dat %>%
  mutate(
    has_001 = as.numeric(string %in% "001"),
    has_4_or_5 = as.numeric(string %in% c("004", "005"))
  )

# Collapse to one row per id. Each flag is 0/1, so the per-id maximum is 1
# exactly when at least one of that id's diagnosis rows was flagged.
long_dat <- summarise(
  group_by(long_dat, id),
  has_001 = max(has_001),
  has_4_or_5 = max(has_4_or_5)
)
long_dat
# A tibble: 3 x 3
     id has_001 has_4_or_5
  <dbl>   <dbl>      <dbl>
1     1       1          0
2     2       1          1
3     3       0          1

# Attach the per-id flags back onto the original wide data.
# The key is stated explicitly so the join does not depend on whichever column
# names the two tables happen to share (and does not emit a "Joining" message).
dat <- left_join(dat, long_dat, by = "id")
dat
  id diagnosis1 diagnosis2 diagnosis3 has_001 has_4_or_5
1  1        001        002        003       1          0
2  2        001        002        005       1          1
3  3        005        002        004       0          1

Answer 2

我就是这样编码的。

根据 ICD 9 代码列 [DX1 到 DX25] 为不同的疾病创建新列：只要任意一列出现相应代码，新列即为 1，否则为 0。下面以 DTV [深静脉血栓形成] 为例。

# Flag deep vein thrombosis: xs$DTV is 1 when any of DX1..DX25 holds one of the
# listed ICD-9 codes, otherwise 0.
dvt_codes <- c(
  "4532", "4533", "45340", "45341", "45342", "45382", "45383",
  "45384", "45385", "45386", "45387", "45388", "45389", "4539"
)
y <- xs[paste0("DX", 1:25)]
# Match column by column rather than through as.matrix(). On a data frame with
# mixed column types, as.matrix() runs format() on numeric columns, which pads
# values (e.g. " 4532") so they no longer match; it also builds a full
# character copy of all 25 columns at once.
y[] <- lapply(y, function(dx) dx %in% dvt_codes)
# y now holds one logical column per DX column; a row is positive when any is TRUE.
xs$DTV <- 1 * (rowSums(y) > 0)

使用 dplyr 重新编码变量

Re-coding a Variable using dplyr

r

dplyr

tidyverse