使用 dplyr 重新编码变量
Re-coding a Variable using dplyr
我的数据集中有肺栓塞的 ICD 9 代码,分布在 DX1 到 DX25 列中。我想在我的数据集 (xs) 中根据这些 ICD 9 代码为肺栓塞 (PE) 创建一个单独的列:只要这些 ICD 9 代码出现在 DX1 到 DX25 的任意一列中,就将其编码为 1,否则编码为 0。我也不想使用 for 循环,因为我的数据集有 30,000 行、大小为 7GB,for 循环会使我的计算机崩溃。
目前我正在使用以下代码执行此操作:
# Build a 0/1 indicator column PE: 1 when any of DX1..DX25 holds one of the
# four listed ICD 9 pulmonary embolism codes, otherwise 0.
# Start with NA in every row; rows that never match are recoded to 0 at the end.
xs$PE = NA
# Each statement below checks a single DX column: `%in%` builds a logical mask,
# which() converts it to row indices, and those rows get PE set to 1.
# The same statement is repeated once per column, DX1 through DX25.
xs[which(xs$DX1%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX2%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX3%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX4%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX5%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX6%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX7%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX8%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX9%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX10%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX11%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX12%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX13%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX14%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX15%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX16%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX17%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX18%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX19%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX20%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX21%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX22%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX23%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX24%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX25%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
# Rows still NA matched none of the codes in any DX column: recode them to 0.
xs <- xs %>%
mutate(PE = ifelse(is.na(PE),0,PE))
谁能用更聪明的方法帮助我使用 dplyr 重新编码?[我不想输入 100 行代码,因为我还有其他诊断需要编码并创建新列]
当我过去做过类似的工作时,我结合使用了 tidyr 和 dplyr。
我的第一步是将数据重塑为长格式,然后查找所需的诊断代码。
然后可以将数据集减少为每个 id 一行。
如果需要,可以将其合并回原始数据集。
但是,我还没有对像您这样大的数据集进行过尝试。
library(tidyr)
library(dplyr)
# Toy example: one row per id with three diagnosis-code columns. The codes are
# kept as character strings so their leading zeros are preserved.
dat <- data.frame(
  id = c(1, 2, 3),
  diagnosis1 = c("001", "001", "005"),
  diagnosis2 = rep("002", 3),
  diagnosis3 = c("003", "005", "004"),
  stringsAsFactors = FALSE
)
dat
id diagnosis1 diagnosis2 diagnosis3
1 1 001 002 003
2 2 001 002 005
3 3 005 002 004
# Reshape to long format: every column except id is stacked, giving one row
# per (id, diagnosis column) pair. The source column name goes into
# `diagnosis_code` and the code itself into `string`.
long_dat <- dat %>%
  gather(key = "diagnosis_code", value = "string", -id)
head(long_dat)
id diagnosis_code string
1 1 diagnosis1 001
2 2 diagnosis1 001
3 3 diagnosis1 005
4 1 diagnosis2 002
5 2 diagnosis2 002
6 3 diagnosis2 002
# Flag each long-format row: 1 when its code is in the target set, else 0.
# Both flags use `%in%` because it never returns NA. With `==`, a missing
# diagnosis code would produce NA, and a later max() over that id would
# return NA instead of 0 or 1.
long_dat <- long_dat %>%
  mutate(
    has_001 = ifelse(string %in% "001", 1, 0),
    has_4_or_5 = ifelse(string %in% c("004", "005"), 1, 0)
  )
# Collapse the long table to one row per id. max() over the 0/1 flags is 1
# when at least one of that id's rows was flagged.
per_id <- group_by(long_dat, id)
long_dat <- summarise(
  per_id,
  has_001 = max(has_001),
  has_4_or_5 = max(has_4_or_5)
)
long_dat
# A tibble: 3 x 3
id has_001 has_4_or_5
<dbl> <dbl> <dbl>
1 1 1 0
2 2 1 1
3 3 0 1
# Attach the per-id flags to the original wide data. The key is named
# explicitly so the join does not rely on auto-detected common columns and
# does not print a "Joining, by" message.
dat <- left_join(dat, long_dat, by = "id")
dat
id diagnosis1 diagnosis2 diagnosis3 has_001 has_4_or_5
1 1 001 002 003 1 0
2 2 001 002 005 1 1
3 3 005 002 004 0 1
我就是这样编码的。
为 ICD 9 代码列 [DX1 到 DX25] 中的另一种疾病创建新列:DTV [深静脉血栓形成]。
# Build a 0/1 indicator column DTV: 1 when any of DX1..DX25 holds one of the
# listed codes, otherwise 0. All 25 columns are tested in a single `%in%`
# call on the matrix form of the data instead of one column at a time.
dvt_codes <- c(
  "4532", "4533", "45340", "45341", "45342", "45382", "45383",
  "45384", "45385", "45386", "45387", "45388", "45389", "4539"
)
dx_values <- as.matrix(xs[paste0("DX", 1:25)])
# `%in%` drops the matrix shape, so restore it before counting hits per row.
dx_hits <- array(
  dx_values %in% dvt_codes,
  dim = dim(dx_values),
  dimnames = dimnames(dx_values)
)
xs$DTV <- 1 * (rowSums(dx_hits) > 0)
我的数据集中有肺栓塞的 ICD 9 代码,分布在 DX1 到 DX25 列中。我想在我的数据集 (xs) 中根据这些 ICD 9 代码为肺栓塞 (PE) 创建一个单独的列:只要这些 ICD 9 代码出现在 DX1 到 DX25 的任意一列中,就将其编码为 1,否则编码为 0。我也不想使用 for 循环,因为我的数据集有 30,000 行、大小为 7GB,for 循环会使我的计算机崩溃。目前我正在使用以下代码执行此操作:
# Build a 0/1 indicator column PE: 1 when any of DX1..DX25 holds one of the
# four listed ICD 9 pulmonary embolism codes, otherwise 0.
# Start with NA in every row; rows that never match are recoded to 0 at the end.
xs$PE = NA
# Each statement below checks a single DX column: `%in%` builds a logical mask,
# which() converts it to row indices, and those rows get PE set to 1.
# The same statement is repeated once per column, DX1 through DX25.
xs[which(xs$DX1%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX2%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX3%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX4%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX5%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX6%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX7%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX8%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX9%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX10%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX11%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX12%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX13%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX14%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX15%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX16%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX17%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX18%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX19%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX20%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX21%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX22%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX23%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX24%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
xs[which(xs$DX25%in% c("41511", "41512", "41513", "41519")), "PE"] <- 1
# Rows still NA matched none of the codes in any DX column: recode them to 0.
xs <- xs %>%
mutate(PE = ifelse(is.na(PE),0,PE))
谁能用更聪明的方法帮助我使用 dplyr 重新编码?[我不想输入 100 行代码,因为我还有其他诊断需要编码并创建新列]
当我过去做过类似的工作时,我结合使用了 tidyr 和 dplyr。
我的第一步是将数据重塑为长格式,然后查找所需的诊断代码。 然后可以将数据集减少为每个 id 一行。 如果需要,可以将其合并回原始数据集。 但是,我还没有对像您这样大的数据集进行过尝试。
library(tidyr)
library(dplyr)
# Toy example: one row per id with three diagnosis-code columns. The codes are
# kept as character strings so their leading zeros are preserved.
dat <- data.frame(
  id = c(1, 2, 3),
  diagnosis1 = c("001", "001", "005"),
  diagnosis2 = rep("002", 3),
  diagnosis3 = c("003", "005", "004"),
  stringsAsFactors = FALSE
)
dat
id diagnosis1 diagnosis2 diagnosis3
1 1 001 002 003
2 2 001 002 005
3 3 005 002 004
# Reshape to long format: every column except id is stacked, giving one row
# per (id, diagnosis column) pair. The source column name goes into
# `diagnosis_code` and the code itself into `string`.
long_dat <- dat %>%
  gather(key = "diagnosis_code", value = "string", -id)
head(long_dat)
id diagnosis_code string
1 1 diagnosis1 001
2 2 diagnosis1 001
3 3 diagnosis1 005
4 1 diagnosis2 002
5 2 diagnosis2 002
6 3 diagnosis2 002
# Flag each long-format row: 1 when its code is in the target set, else 0.
# Both flags use `%in%` because it never returns NA. With `==`, a missing
# diagnosis code would produce NA, and a later max() over that id would
# return NA instead of 0 or 1.
long_dat <- long_dat %>%
  mutate(
    has_001 = ifelse(string %in% "001", 1, 0),
    has_4_or_5 = ifelse(string %in% c("004", "005"), 1, 0)
  )
# Collapse the long table to one row per id. max() over the 0/1 flags is 1
# when at least one of that id's rows was flagged.
per_id <- group_by(long_dat, id)
long_dat <- summarise(
  per_id,
  has_001 = max(has_001),
  has_4_or_5 = max(has_4_or_5)
)
long_dat
# A tibble: 3 x 3
id has_001 has_4_or_5
<dbl> <dbl> <dbl>
1 1 1 0
2 2 1 1
3 3 0 1
# Attach the per-id flags to the original wide data. The key is named
# explicitly so the join does not rely on auto-detected common columns and
# does not print a "Joining, by" message.
dat <- left_join(dat, long_dat, by = "id")
dat
id diagnosis1 diagnosis2 diagnosis3 has_001 has_4_or_5
1 1 001 002 003 1 0
2 2 001 002 005 1 1
3 3 005 002 004 0 1
我就是这样编码的。
为 ICD 9 代码列 [DX1 到 DX25] 中的另一种疾病创建新列:DTV [深静脉血栓形成]。
# Build a 0/1 indicator column DTV: 1 when any of DX1..DX25 holds one of the
# listed codes, otherwise 0. All 25 columns are tested in a single `%in%`
# call on the matrix form of the data instead of one column at a time.
dvt_codes <- c(
  "4532", "4533", "45340", "45341", "45342", "45382", "45383",
  "45384", "45385", "45386", "45387", "45388", "45389", "4539"
)
dx_values <- as.matrix(xs[paste0("DX", 1:25)])
# `%in%` drops the matrix shape, so restore it before counting hits per row.
dx_hits <- array(
  dx_values %in% dvt_codes,
  dim = dim(dx_values),
  dimnames = dimnames(dx_values)
)
xs$DTV <- 1 * (rowSums(dx_hits) > 0)