将列值转换成它们自己的二进制编码列（虚拟变量）

Question

我有很多 CSV 文件，其中包含性别、年龄、诊断等列。

目前，它们的编码如下：

ID, gender, age, diagnosis
1,  male,   42,  asthma
1,  male,   42,  anxiety
2,  male,   19,  asthma
3,  female, 23,  diabetes
4,  female, 61,  diabetes
4,  female, 61,  copd

目标是将此数据转换成此目标格式:

旁注：如果可能的话，最好也将原始列名添加到新列名之前，例如 'age_42' 或 'gender_female'。

ID, male, female, 42, 19, 23, 61, asthma, anxiety, diabetes, copd
1,  1,    0,      1,  0,  0,  0,  1,      1,       0,        0
2,  1,    0,      0,  1,  0,  0,  1,      0,       0,        0
3,  0,    1,      0,  0,  1,  0,  0,      0,       1,        0
4,  0,    1,      0,  0,  0,  1,  0,      0,       1,        1

我尝试使用 reshape2 的 dcast() 函数，但得到的组合导致矩阵极其稀疏。这是一个仅包含年龄和性别的简化示例：

# Cast with gender and age crossed on the right-hand side of the formula:
# this yields one column per gender/age combination (see the output below),
# which is why the result is so sparse.
data.train  <- dcast(data.raw, formula = id ~ gender + age, fun.aggregate = length)

ID, male19, male23, male42, male61, female19, female23, female42, female61
1,  0,      0,      1,      0,      0,        0,        0,        0
2,  1,      0,      0,      0,      0,        0,        0,        0
3,  0,      0,      0,      0,      0,        1,        0,        0
4,  0,      0,      0,      0,      0,        0,        0,        1

鉴于这是机器学习数据准备中相当常见的任务，我想可能还有其他库（我不知道）能够执行此转换。

Answer 1

您需要一个 melt/dcast 组合（即 recast），先把所有列融合成一列，从而避免产生各列取值的交叉组合：

library(reshape2)
# recast() melts and casts in one call. Column 1 (ID) is kept as the id
# variable; ID ~ value spreads every distinct cell value into its own column.
# The aggregate maps "at least one row in this cell" to 1L and "none" to 0L.
recast(df, ID ~ value, id.var = 1, fun.aggregate = function(x) (length(x) > 0) + 0L)
#   ID 19 23 42 61 anxiety asthma copd diabetes female male
# 1  1  0  0  1  0       1      1    0        0      0    1
# 2  2  1  0  0  0       0      1    0        0      0    1
# 3  3  0  1  0  0       0      0    0        1      1    0
# 4  4  0  0  0  1       0      0    1        1      1    0

根据您的旁注，您可以在公式中加入 variable，这样原始列名也会加到新列名之前：

# Putting variable + value on the right-hand side prefixes each new column
# with the name of the column the value came from (e.g. gender_female).
# The aggregate still yields 1L for a non-empty cell and 0L for an empty one.
recast(df, ID ~ variable + value, id.var = 1, fun.aggregate = function(x) (length(x) > 0) + 0L)
#   ID gender_female gender_male age_19 age_23 age_42 age_61 diagnosis_anxiety diagnosis_asthma diagnosis_copd
# 1  1             0           1      0      0      1      0                 1                1              0
# 2  2             0           1      1      0      0      0                 0                1              0
# 3  3             1           0      0      1      0      0                 0                0              0
# 4  4             1           0      0      0      0      1                 0                0              1
#   diagnosis_diabetes
# 1                  0
# 2                  0
# 3                  1
# 4                  1

Answer 2

下面是使用 dcast() 和 merge() 的稍长一点的方法。由于同一个 ID 的性别和年龄会在多行中重复出现，按长度聚合会得到大于 1 的计数，因此创建了函数 dum()，把长度转换为 0/1 虚拟变量。另一方面，诊断在每个 ID 内不会重复，只需调整公式、直接按长度计数即可。

library(reshape2)
# Build the example data inline. TRUE is spelled out because T is an ordinary
# variable that can be reassigned, whereas TRUE is a reserved word.
# Note: strip.white is left at its default, so the character fields keep the
# padding spaces that follow each comma in the text below.
data.raw <- read.table(header = TRUE, sep = ",", text = "
id, gender, age, diagnosis
1,  male,   42,  asthma
1,  male,   42,  anxiety
2,  male,   19,  asthma
3,  female, 23,  diabetes
4,  female, 61,  diabetes
4,  female, 61,  copd")

# Presence indicator used as an aggregate: returns 1 when x has at least one
# element and 0 when x is empty.
dum <- function(x) {
  as.numeric(length(x) > 0)
}

# Count diagnoses per id/gender/age, one column per diagnosis, then drop
# columns 2 and 3 (gender and age) so only id and the diagnosis columns remain.
diag <- dcast(data.raw, formula = id + gender + age ~ diagnosis, fun.aggregate = length)[,-c(2,3)]

# Gender indicators per id; dum() collapses repeated rows to a single 1.
gen <- dcast(data.raw, formula = id ~ gender, fun.aggregate = dum)

# Age indicators per id, again collapsed to 0/1 by dum().
age <- dcast(data.raw, formula = id ~ age, fun.aggregate = dum)

# Join the three tables on id.
merge(merge(gen, age, by = "id"), diag, by = "id")
#  id   female   male 19 23 42 61   anxiety   asthma   copd   diabetes
#1  1        0      1  0  0  1  0         1        1      0          0
#2  2        0      1  1  0  0  0         0        1      0          0
#3  3        1      0  0  1  0  0         0        0      0          1
#4  4        1      0  0  0  0  1         0        0      1          1

实际上我不太了解你的模型，但你这样做可能有些多余，因为 R 会通过公式对象自动处理因子。例如，如果性别是响应变量（response），R 会在内部生成如下矩阵。所以，只要不打算自己手动拟合，适当设置数据类型和公式就足够了。

# Treat age as categorical so that model.matrix() expands it into indicators.
data.raw[["age"]] <- as.factor(data.raw[["age"]])
# Drop the first column (id) and build the design matrix with gender as the
# response and every remaining column as a predictor.
model.matrix(gender ~ ., data = data.raw[-1])
#(Intercept) age23 age42 age61 diagnosis  asthma diagnosis  copd diagnosis  diabetes
#1           1     0     1     0                 1               0                   0
#2           1     0     1     0                 0               0                   0
#3           1     0     0     0                 1               0                   0
#4           1     1     0     0                 0               0                   1
#5           1     0     0     1                 0               0                   1
#6           1     0     0     1                 0               1                   0

如果您需要每个变量的所有水平，可以去掉 model.matrix 中的截距项，并使用 “all-levels-of-a-factor-in-a-model-matrix-in-r” 这一问答中的小技巧来实现：

# Starting from Akrun's df1: turn every column except ID into a factor.
df1[-1] <- lapply(df1[-1], function(col) factor(col))

# Dummy-code with model.matrix(): "0 +" removes the intercept, and passing
# full (non-reduced) contrasts for each factor keeps all of its levels.
m <- data.frame(
  model.matrix(
    ~ 0 + .,
    data = df1,
    contrasts.arg = lapply(
      df1[-1],
      function(f) contrasts(f, contrasts = FALSE)
    )
  )
)

# One row per ID: take the column-wise maximum over that ID's rows.
aggregate(. ~ ID, data = m, FUN = max)

Answer 3

使用来自基础 R 的 reshape：

# Widen the data once per variable: each call uses the same column as both
# timevar and v.names, producing <name>_<value> columns (sep = "_").
d <- reshape(df, idvar="ID", timevar="diagnosis", direction="wide", v.names="diagnosis", sep="_")
a <- reshape(df, idvar="ID", timevar="age", direction="wide", v.names="age", sep="_")
g <- reshape(df, idvar="ID", timevar="gender", direction="wide", v.names="gender", sep="_")


# Assemble ID plus the "_"-named columns taken from each widened table.
new.dat <- cbind(ID=d["ID"],
    g[,grepl("_", names(g))],
    a[,grepl("_", names(a))],
    d[,grepl("_", names(d))])

# Convert factor columns to character (if necessary).
# Taken from an answer by @Marek (the link is missing from this copy).
new.dat[sapply(new.dat, is.factor)] <- lapply(new.dat[sapply(new.dat, is.factor)], as.character)

# Cells left NA by the reshape become 0; after that, every remaining non-zero
# cell outside the ID column becomes 1.
new.dat[which(is.na(new.dat), arr.ind=TRUE)] <- 0
new.dat[-1][which(new.dat[-1] != 0, arr.ind=TRUE)] <- 1

#  ID gender_male gender_female age_42 age_19 age_23 age_61 diagnosis_asthma
#1  1           1             0      1      0      0      0                1
#3  2           1             0      0      1      0      0                1
#4  3           0             1      0      0      1      0                0
#5  4           0             1      0      0      0      1                0
#  diagnosis_anxiety diagnosis_diabetes diagnosis_copd
#1                 1                  0              0
#3                 0                  0              0
#4                 0                  1              0
#5                 0                  1              1

Answer 4

caret 包中有一个用于将数据“dummify”（即转换为虚拟变量）的函数。

library(caret)
library(dplyr)
# Convert every column to a factor so dummyVars() sees each one as
# categorical, then apply the resulting encoder to df.
# mutate(across(...)) replaces the deprecated mutate_each()/funs() pair and
# performs the same column-wise as.factor conversion.
predict(
  dummyVars(~ ., data = mutate(df, across(everything(), as.factor))),
  newdata = df
)

Answer 5

一个base R选项是

 # stack() puts all non-ID columns into one "values" column ([-2] drops the
 # source-column label); table() then cross-tabulates ID against those values.
 # !! turns the counts into TRUE/FALSE and *1L turns that into 0/1.
 (!!table(cbind(df1[1],stack(df1[-1])[-2])))*1L
 #     values
 #ID  19 23 42 61 anxiety asthma copd diabetes female male
 # 1  0  0  1  0       1      1    0        0      0    1
 # 2  1  0  0  0       0      1    0        0      0    1
 # 3  0  1  0  0       0      0    0        1      1    0
 # 4  0  0  0  1       0      0    1        1      1    0

如果还需要在新列名中保留原始列名：

 # [2:1] reorders the stacked columns to (ind, values) so that paste() builds
 # "<column name>_<value>" labels in Val; table() cross-tabulates ID against
 # those labels, and (!!x)*1L reduces the counts to 0/1.
 (!!table(cbind(df1[1],Val=do.call(paste, c(stack(df1[-1])[2:1], sep="_")))))*1L
 #   Val
 #ID  age_19 age_23 age_42 age_61 diagnosis_anxiety diagnosis_asthma
 #1      0      0      1      0                 1                1
 #2      1      0      0      0                 0                1
 #3      0      1      0      0                 0                0
 #4      0      0      0      1                 0                0
 #  Val
 #ID  diagnosis_copd diagnosis_diabetes gender_female gender_male
 #1              0                  0             0           1
 #2              0                  0             0           1
 #3              0                  1             1           0
 #4              1                  1             1           0

数据

# Example data frame referred to as df1 in the answers.
df1 <- data.frame(
  ID = c(1L, 1L, 2L, 3L, 4L, 4L),
  gender = c("male", "male", "male", "female", "female", "female"),
  age = c(42L, 42L, 19L, 23L, 61L, 61L),
  diagnosis = c("asthma", "anxiety", "asthma", "diabetes", "diabetes", "copd"),
  stringsAsFactors = FALSE
)

将列值转换成它们自己的二进制编码列（虚拟变量）

Converting Column Values into Their Own Binary Encoded Columns (Dummy Variables)

r

sparse-matrix

reshape2

数据