在 R 中对高频类别和 NA 进行独热编码(One-Hot Encoding),其余类别归入 'others'
One Hot Encoding for top categories, NA, and remaining subsumed as 'others' in R
我只想对变量中的高频类别、NA 以及 'others'(其余类别)进行独热编码。
因此在这个简化的例子中,需要对 b 中出现频次大于 1 的类别以及 NA 进行独热编码:
# Toy data for the question: `b` is the categorical column to encode (it has
# one NA and "B" occurs only once); `c` is a numeric column with one NA.
id <- as.numeric(seq_len(6))
b <- c(NA_character_, "A", "C", "A", "B", "C")
c <- c(2, 3, 6, NA, 4, 7)
df <- data.frame(id = id, b = b, c = c)
id b c
1 1 <NA> 2
2 2 A 3
3 3 C 6
4 4 A NA
5 5 B 4
6 6 C 7
table <- as.data.frame(table(df$b))
Var1 Freq
1 A 2
2 B 1
3 C 2
table_top <- table[table$Freq > 1,]
Var1 Freq
1 A 2
3 C 2
现在,我想要这样的东西
id b_NA c b_A b_C b_Others
1 1 2 0 0 0
2 0 3 1 0 0
3 0 6 0 1 0
4 0 NA 1 0 0
5 0 4 0 0 1
6 0 7 0 1 0
我试过子集 df
# Reduce `table_top` to a plain vector of the frequent category labels, then
# keep only the rows of `df` whose `b` value is one of them.
table_top <- as.vector(table_top[["Var1"]])
table_only_top <- df[df$b %in% table_top, ]
table_only_top
a b c
2 1 A 3
3 2 C 6
4 2 A NA
6 3 C 7
但是,现在我不知道如何得到想要的输出。我的真实数据中的类别比这里多得多,因此无法在代码里逐个写出输出列的名称;其中需要归入 'others' 的类别也有很多。
非常感谢任何提示:)
绝对不是一个优雅的解决方案,但它应该有效:
# One-hot encode column `b` of `df`: values of `b` occurring more than once
# get their own b_<value> column, missing values are flagged in b_NA, and rows
# matching neither are flagged in b_OTHERS.
library(tidyverse) # was misspelled "tideverse", which fails to load
library(reshape2)

df %>%
  # Long format: one row per (id, variable, value).
  gather(var, val, -id) %>%
  # n = number of rows sharing the same variable/value pair.
  add_count(var, val) %>%
  mutate(
    res = ifelse(var == "b" & n > 1, 1, 0),
    val = paste0("b_", val)
  ) %>%
  # Keep only the rows of `b` whose value does not occur exactly once.
  filter(var == "b" & n != 1) %>%
  # Wide format: one indicator column per remaining value.
  dcast(id ~ val, value.var = "res") %>%
  # Bring back the ids removed by the filter, plus the original columns.
  full_join(df, by = c("id" = "id")) %>%
  mutate(b_NA = ifelse(is.na(b), 1, 0)) %>%
  # Indicators that are NA after the join become 0. A lambda is used instead
  # of the deprecated funs() helper.
  mutate_at(vars(contains("b_")), ~ replace(.x, is.na(.x), 0)) %>%
  # A row counts as "other" when none of the b_ indicator columns is set.
  mutate(b_OTHERS = ifelse(rowSums(.[grep("b_", names(.))]) != 0, 0, 1))
id b_A b_C b c b_NA b_OTHERS
1 2 1 0 A 3 0 0
2 3 0 1 C 6 0 0
3 4 1 0 A NA 0 0
4 6 0 1 C 7 0 0
5 1 0 0 <NA> 2 1 0
6 5 0 0 B 4 0 1
您可以按照不同的条件分别构造 data.frame,再用 cbind 把它们合并起来。
# Build indicator columns for several conditions separately, then cbind them
# onto the original data.
# NOTE(review): `df_orig` is not defined in this snippet; presumably it is the
# original data frame with columns id, b, c - confirm before running.

# simple conditions -------------------------------------------------------
df <- df_orig[, -1] # drop the first column
df_na <- is.na(df)
colnames(df_na) <- paste0(colnames(df), "_NA")
df_A <- df == "A"
colnames(df_A) <- paste0(colnames(df), "_A")
df_C <- df == "C"
colnames(df_C) <- paste0(colnames(df), "_C")

# for counts you can use vapply with one loop -----------------------------
# Each cell is replaced by the number of rows in the same column that hold
# the same value. NA comparisons are removed, so an NA cell gets a count of 0.
df_counts <- df
for (j in seq_len(ncol(df))) {
  counts <- vapply(
    seq_len(nrow(df)),
    function(x) sum(df[x, j] == df[, j], na.rm = TRUE),
    integer(1)
  )
  df_counts[, j] <- counts
}
# The original reassigned `df_counts <- df` here, which discarded the counts
# computed by the loop; that line has been removed.

# or avoid explicit loops altogether --------------------------------------
df_counts2 <- vapply(
  seq_len(ncol(df)),
  function(y) {
    vapply(
      seq_len(nrow(df)),
      function(x) sum(df[x, y] == df[, y], na.rm = TRUE),
      integer(1)
    )
  },
  integer(nrow(df))
)
colnames(df_counts2) <- paste0(colnames(df), "_counts")

# cbind df's -------------------------------------------------------------
df_full <- cbind(df_orig, df_na, df_A, df_C, df_counts2)

# check if frequency greater than 1 or NA ---------------------------------
# Columns are looked up by name instead of by hard-coded position, so the
# check still works when the number of input columns changes.
count_cols <- paste0(colnames(df), "_counts")
na_cols <- paste0(colnames(df), "_NA")
df_full$result <- df_full[, count_cols, drop = FALSE] >= 2 |
  df_full[, na_cols, drop = FALSE]
df_full
比较难的部分应该是计算频率,这里我给出了两种方法。结果是:
id b c b_NA c_NA b_A c_A b_C c_C b_counts c_counts result.b_NA result.c_NA
1 1 <NA> 2 FALSE FALSE FALSE FALSE FALSE FALSE 1 1 FALSE FALSE
2 2 A 3 FALSE FALSE TRUE FALSE FALSE FALSE 2 1 TRUE FALSE
3 3 C 6 FALSE FALSE FALSE FALSE TRUE FALSE 2 1 TRUE FALSE
4 4 A NA FALSE TRUE TRUE NA FALSE NA 2 0 TRUE TRUE
5 5 B 4 FALSE FALSE FALSE FALSE FALSE FALSE 1 1 FALSE FALSE
6 6 C 7 FALSE FALSE FALSE FALSE TRUE FALSE 2 1 TRUE FALSE
您可以根据您的条件修改列。希望有帮助
使用 data.table 和 mltools,快速而性感:
> one_hot(dt, naCols = TRUE, sparsifyNAs = TRUE)
id cat_NA cat_A cat_C cat_Others freq
1: 1 1 0 0 0 2
2: 2 0 1 0 0 3
3: 3 0 0 1 0 6
4: 4 0 1 0 0 NA
5: 5 0 0 0 1 4
6: 6 0 0 1 0 7
代码
加载库
library(dplyr)
library(data.table)
library(mltools)
转换数据
# Collapse every non-missing category that occurs at most once into "Others",
# then hand the result to data.table for one_hot().
df <- df %>%
  # Count rows per category of the column that will be one-hot encoded.
  group_by(cat) %>%
  mutate(count = n()) %>%
  # Drop the grouping so later steps work on the whole table.
  ungroup() %>%
  # Keep NA and categories seen more than once; everything else becomes
  # "Others". `cat` must still be character here: on a factor, ifelse() would
  # return the numeric codes. Only afterwards is it turned into a factor,
  # which is what one_hot() expects.
  mutate(
    cat = as.factor(ifelse(is.na(cat) | count > 1, cat, "Others"))
  ) %>%
  # Keep the original columns only; this drops the helper count column.
  select(id, cat, freq)

# Convert to data.table.
dt <- as.data.table(df)
检查中间结果
> dt
id cat freq
1: 1 <NA> 2
2: 2 A 3
3: 3 C 6
4: 4 A NA
5: 5 Others 4
6: 6 C 7
数据
# Example data: `cat` is the categorical column to one-hot encode, `freq` is
# a numeric column with one NA.
id <- as.numeric(seq_len(6))
cat <- c(NA_character_, "A", "C", "A", "B", "C")
freq <- c(2, 3, 6, NA, 4, 7)

# Only the variable(s) you want to one-hot encode should be factors, so
# automatic string-to-factor conversion is switched off here.
df <- data.frame(
  id = id,
  cat = cat,
  freq = freq,
  stringsAsFactors = FALSE
)
> df
id cat freq
1 1 <NA> 2
2 2 A 3
3 3 C 6
4 4 A NA
5 5 B 4
6 6 C 7
我只想对变量中的高频类别、NA 以及 'others'(其余类别)进行独热编码。
因此在这个简化的例子中,需要对 b 中出现频次大于 1 的类别以及 NA 进行独热编码:
# Toy data for the question: `b` is the categorical column to encode (it has
# one NA and "B" occurs only once); `c` is a numeric column with one NA.
id <- as.numeric(seq_len(6))
b <- c(NA_character_, "A", "C", "A", "B", "C")
c <- c(2, 3, 6, NA, 4, 7)
df <- data.frame(id = id, b = b, c = c)
id b c
1 1 <NA> 2
2 2 A 3
3 3 C 6
4 4 A NA
5 5 B 4
6 6 C 7
table <- as.data.frame(table(df$b))
Var1 Freq
1 A 2
2 B 1
3 C 2
table_top <- table[table$Freq > 1,]
Var1 Freq
1 A 2
3 C 2
现在,我想要这样的东西
id b_NA c b_A b_C b_Others
1 1 2 0 0 0
2 0 3 1 0 0
3 0 6 0 1 0
4 0 NA 1 0 0
5 0 4 0 0 1
6 0 7 0 1 0
我试过子集 df
# Reduce `table_top` to a plain vector of the frequent category labels, then
# keep only the rows of `df` whose `b` value is one of them.
table_top <- as.vector(table_top[["Var1"]])
table_only_top <- df[df$b %in% table_top, ]
table_only_top
a b c
2 1 A 3
3 2 C 6
4 2 A NA
6 3 C 7
但是,现在我不知道如何得到想要的输出。我的真实数据中的类别比这里多得多,因此无法在代码里逐个写出输出列的名称;其中需要归入 'others' 的类别也有很多。
非常感谢任何提示:)
绝对不是一个优雅的解决方案,但它应该有效:
# One-hot encode column `b` of `df`: values of `b` occurring more than once
# get their own b_<value> column, missing values are flagged in b_NA, and rows
# matching neither are flagged in b_OTHERS.
library(tidyverse) # was misspelled "tideverse", which fails to load
library(reshape2)

df %>%
  # Long format: one row per (id, variable, value).
  gather(var, val, -id) %>%
  # n = number of rows sharing the same variable/value pair.
  add_count(var, val) %>%
  mutate(
    res = ifelse(var == "b" & n > 1, 1, 0),
    val = paste0("b_", val)
  ) %>%
  # Keep only the rows of `b` whose value does not occur exactly once.
  filter(var == "b" & n != 1) %>%
  # Wide format: one indicator column per remaining value.
  dcast(id ~ val, value.var = "res") %>%
  # Bring back the ids removed by the filter, plus the original columns.
  full_join(df, by = c("id" = "id")) %>%
  mutate(b_NA = ifelse(is.na(b), 1, 0)) %>%
  # Indicators that are NA after the join become 0. A lambda is used instead
  # of the deprecated funs() helper.
  mutate_at(vars(contains("b_")), ~ replace(.x, is.na(.x), 0)) %>%
  # A row counts as "other" when none of the b_ indicator columns is set.
  mutate(b_OTHERS = ifelse(rowSums(.[grep("b_", names(.))]) != 0, 0, 1))
id b_A b_C b c b_NA b_OTHERS
1 2 1 0 A 3 0 0
2 3 0 1 C 6 0 0
3 4 1 0 A NA 0 0
4 6 0 1 C 7 0 0
5 1 0 0 <NA> 2 1 0
6 5 0 0 B 4 0 1
您可以按照不同的条件分别构造 data.frame,再用 cbind 把它们合并起来。
# Build indicator columns for several conditions separately, then cbind them
# onto the original data.
# NOTE(review): `df_orig` is not defined in this snippet; presumably it is the
# original data frame with columns id, b, c - confirm before running.

# simple conditions -------------------------------------------------------
df <- df_orig[, -1] # drop the first column
df_na <- is.na(df)
colnames(df_na) <- paste0(colnames(df), "_NA")
df_A <- df == "A"
colnames(df_A) <- paste0(colnames(df), "_A")
df_C <- df == "C"
colnames(df_C) <- paste0(colnames(df), "_C")

# for counts you can use vapply with one loop -----------------------------
# Each cell is replaced by the number of rows in the same column that hold
# the same value. NA comparisons are removed, so an NA cell gets a count of 0.
df_counts <- df
for (j in seq_len(ncol(df))) {
  counts <- vapply(
    seq_len(nrow(df)),
    function(x) sum(df[x, j] == df[, j], na.rm = TRUE),
    integer(1)
  )
  df_counts[, j] <- counts
}
# The original reassigned `df_counts <- df` here, which discarded the counts
# computed by the loop; that line has been removed.

# or avoid explicit loops altogether --------------------------------------
df_counts2 <- vapply(
  seq_len(ncol(df)),
  function(y) {
    vapply(
      seq_len(nrow(df)),
      function(x) sum(df[x, y] == df[, y], na.rm = TRUE),
      integer(1)
    )
  },
  integer(nrow(df))
)
colnames(df_counts2) <- paste0(colnames(df), "_counts")

# cbind df's -------------------------------------------------------------
df_full <- cbind(df_orig, df_na, df_A, df_C, df_counts2)

# check if frequency greater than 1 or NA ---------------------------------
# Columns are looked up by name instead of by hard-coded position, so the
# check still works when the number of input columns changes.
count_cols <- paste0(colnames(df), "_counts")
na_cols <- paste0(colnames(df), "_NA")
df_full$result <- df_full[, count_cols, drop = FALSE] >= 2 |
  df_full[, na_cols, drop = FALSE]
df_full
比较难的部分应该是计算频率,这里我给出了两种方法。结果是:
id b c b_NA c_NA b_A c_A b_C c_C b_counts c_counts result.b_NA result.c_NA
1 1 <NA> 2 FALSE FALSE FALSE FALSE FALSE FALSE 1 1 FALSE FALSE
2 2 A 3 FALSE FALSE TRUE FALSE FALSE FALSE 2 1 TRUE FALSE
3 3 C 6 FALSE FALSE FALSE FALSE TRUE FALSE 2 1 TRUE FALSE
4 4 A NA FALSE TRUE TRUE NA FALSE NA 2 0 TRUE TRUE
5 5 B 4 FALSE FALSE FALSE FALSE FALSE FALSE 1 1 FALSE FALSE
6 6 C 7 FALSE FALSE FALSE FALSE TRUE FALSE 2 1 TRUE FALSE
您可以根据您的条件修改列。希望有帮助
使用 data.table 和 mltools,快速而性感:
> one_hot(dt, naCols = TRUE, sparsifyNAs = TRUE)
id cat_NA cat_A cat_C cat_Others freq
1: 1 1 0 0 0 2
2: 2 0 1 0 0 3
3: 3 0 0 1 0 6
4: 4 0 1 0 0 NA
5: 5 0 0 0 1 4
6: 6 0 0 1 0 7
代码
加载库
library(dplyr)
library(data.table)
library(mltools)
转换数据
# Collapse every non-missing category that occurs at most once into "Others",
# then hand the result to data.table for one_hot().
df <- df %>%
  # Count rows per category of the column that will be one-hot encoded.
  group_by(cat) %>%
  mutate(count = n()) %>%
  # Drop the grouping so later steps work on the whole table.
  ungroup() %>%
  # Keep NA and categories seen more than once; everything else becomes
  # "Others". `cat` must still be character here: on a factor, ifelse() would
  # return the numeric codes. Only afterwards is it turned into a factor,
  # which is what one_hot() expects.
  mutate(
    cat = as.factor(ifelse(is.na(cat) | count > 1, cat, "Others"))
  ) %>%
  # Keep the original columns only; this drops the helper count column.
  select(id, cat, freq)

# Convert to data.table.
dt <- as.data.table(df)
检查中间结果
> dt
id cat freq
1: 1 <NA> 2
2: 2 A 3
3: 3 C 6
4: 4 A NA
5: 5 Others 4
6: 6 C 7
数据
# Example data: `cat` is the categorical column to one-hot encode, `freq` is
# a numeric column with one NA.
id <- as.numeric(seq_len(6))
cat <- c(NA_character_, "A", "C", "A", "B", "C")
freq <- c(2, 3, 6, NA, 4, 7)

# Only the variable(s) you want to one-hot encode should be factors, so
# automatic string-to-factor conversion is switched off here.
df <- data.frame(
  id = id,
  cat = cat,
  freq = freq,
  stringsAsFactors = FALSE
)
> df
id cat freq
1 1 <NA> 2
2 2 A 3
3 3 C 6
4 4 A NA
5 5 B 4
6 6 C 7