从 R 中具有不同行长度的数据框创建关联矩阵
Creating an incidence matrix from data frame with distinct row lengths in R
我有一个以空格分隔的文件,类似于:
# 1_1 AAA ABA AAB BBA
# 1_2 ABA AAA ABB BAA
# 1_3 ABA BAA
# 1_4 AAA BBB ABB
但是,行的内容(我们称它们为“单词”)包含更多字符(大约 20 个)。每行中的单词数各不相同(从少于 10 个到几十个)。我想使用 R 创建一个关联矩阵,其中每个唯一的单词都是一列,行名称将保留为行名称,如下所示:
# AAA AAB ABA ABB BAA BBA BBB
# 1_1 1 1 1 0 0 1 0
# 1_2 1 0 1 1 1 0 0
# 1_3 0 0 1 0 1 0 0
# 1_4 1 0 0 1 0 0 1
列顺序不相关。非常感谢任何建议,欢迎使用基础解决方案和包解决方案。谢谢!
tidyverse
library(tidyverse)
# Example input: one row per id; the col_* columns hold that row's words and
# are padded with NA where a row has fewer than four words.
df <- data.frame(
  id    = c("1_1", "1_2", "1_3", "1_4"),
  col_1 = c("AAA", "ABA", "ABA", "AAA"),
  col_2 = c("ABA", "AAA", "BAA", "BBB"),
  col_3 = c("AAB", "ABB", NA, "ABB"),
  col_4 = c("BBA", "BAA", NA, NA),
  stringsAsFactors = FALSE
)
# Build a 0/1 incidence table: one row per id, one column per distinct word.
df %>%
  # Long format: one (id, value) row per word; the NA padding is dropped.
  pivot_longer(-id, values_drop_na = TRUE) %>%
  # Keep each (id, word) pair once so that a word repeated within a row still
  # yields 1 rather than its occurrence count.
  distinct(id, value) %>%
  # Spread the words into columns; absent (id, word) pairs are filled with 0.
  pivot_wider(
    id_cols = id,
    names_from = value,
    values_from = value,
    values_fn = length,
    values_fill = 0
  )
#> # A tibble: 4 x 8
#> id AAA ABA AAB BBA ABB BAA BBB
#> <chr> <int> <int> <int> <int> <int> <int> <int>
#> 1 1_1 1 1 1 1 0 0 0
#> 2 1_2 1 1 0 0 1 1 0
#> 3 1_3 0 1 0 0 0 1 0
#> 4 1_4 1 0 0 0 1 0 1
由 reprex package (v2.0.1)
于 2021-12-13 创建
data.table
library(data.table)
library(magrittr)
# Convert df to a data.table by reference (df itself is modified).
setDT(df)
# Melt to one (id, value) row per word, drop the NA padding, keep each
# (id, word) pair once so repeated words count as 1, then cast to wide form
# with one column per word (missing pairs become 0 via length of empty input).
melt(df, id.vars = "id") %>%
  na.omit() %>%
  unique(by = c("id", "value")) %>%
  dcast(formula = id ~ value, fun.aggregate = length)
#> id AAA AAB ABA ABB BAA BBA BBB
#> 1: 1_1 1 1 1 0 0 1 0
#> 2: 1_2 1 0 1 1 1 0 0
#> 3: 1_3 0 0 1 0 1 0 0
#> 4: 1_4 1 0 0 1 0 0 1
由 reprex package (v2.0.1)
于 2021-12-13 创建
基础
# Stack all word columns (everything except id) into a single `values` column.
df_stack <- stack(df, select = -id)
# stack() concatenates the columns one after another, so the ids repeat once
# per stacked column.
df_stack$id <- rep(df$id, ncol(df) - 1)
# table() drops the NA padding; `> 0` caps repeated words at one and the unary
# `+` turns the logical table back into 0/1 integers.
+(with(df_stack, table(id, values)) > 0)
#> values
#> id AAA AAB ABA ABB BAA BBA BBB
#> 1_1 1 1 1 0 0 1 0
#> 1_2 1 0 1 1 1 0 0
#> 1_3 0 0 1 0 1 0 0
#> 1_4 1 0 0 1 0 0 1
由 reprex package (v2.0.1)
于 2021-12-13 创建
试试下面的代码
# Use the ids as row names, then transpose so that each id becomes a column.
words_by_id <- as.data.frame(t(`row.names<-`(df[-1], df$id)))
# stack() yields (values, ind) pairs; table() drops the NA padding.
word_counts <- table(stack(words_by_id))
# Transpose to ids x words; `> 0` caps repeats at one and the unary `+`
# converts TRUE/FALSE to 1/0.
+(t(word_counts) > 0)
values
ind AAA AAB ABA ABB BAA BBA BBB
1_1 1 1 1 0 0 1 0
1_2 1 0 1 1 1 0 0
1_3 0 0 1 0 1 0 0
1_4 1 0 0 1 0 0 1
另一个选项:unite / cSplit_e
library(dplyr)
library(tidyr)
library(splitstackshape)
library(stringr)
# Join each row's words into one "_"-separated string (NA padding dropped),
# expand that string into one 0/1 indicator column per word, then strip the
# "new_" prefix from the indicator column names.
df %>%
  unite(col = new, starts_with("col"), na.rm = TRUE) %>%
  cSplit_e(
    split.col = "new",
    sep = "_",
    type = "character",
    fill = 0,
    drop = TRUE
  ) %>%
  rename_with(.fn = function(nm) str_remove(nm, "new_"), .cols = -id)
id AAA AAB ABA ABB BAA BBA BBB
1 1_1 1 1 1 0 0 1 0
2 1_2 1 0 1 1 1 0 0
3 1_3 0 0 1 0 1 0 0
4 1_4 1 0 0 1 0 0 1
我有一个以空格分隔的文件,类似于:
# 1_1 AAA ABA AAB BBA
# 1_2 ABA AAA ABB BAA
# 1_3 ABA BAA
# 1_4 AAA BBB ABB
但是,行的内容(我们称它们为“单词”)包含更多字符(大约 20 个)。每行中的单词数各不相同(从少于 10 个到几十个)。我想使用 R 创建一个关联矩阵,其中每个唯一的单词都是一列,行名称将保留为行名称,如下所示:
# AAA AAB ABA ABB BAA BBA BBB
# 1_1 1 1 1 0 0 1 0
# 1_2 1 0 1 1 1 0 0
# 1_3 0 0 1 0 1 0 0
# 1_4 1 0 0 1 0 0 1
列顺序不相关。非常感谢任何建议,欢迎使用基础解决方案和包解决方案。谢谢!
tidyverse
library(tidyverse)
# Example input: one row per id; the col_* columns hold that row's words and
# are padded with NA where a row has fewer than four words.
df <- data.frame(
  id    = c("1_1", "1_2", "1_3", "1_4"),
  col_1 = c("AAA", "ABA", "ABA", "AAA"),
  col_2 = c("ABA", "AAA", "BAA", "BBB"),
  col_3 = c("AAB", "ABB", NA, "ABB"),
  col_4 = c("BBA", "BAA", NA, NA),
  stringsAsFactors = FALSE
)
# Build a 0/1 incidence table: one row per id, one column per distinct word.
df %>%
  # Long format: one (id, value) row per word; the NA padding is dropped.
  pivot_longer(-id, values_drop_na = TRUE) %>%
  # Keep each (id, word) pair once so that a word repeated within a row still
  # yields 1 rather than its occurrence count.
  distinct(id, value) %>%
  # Spread the words into columns; absent (id, word) pairs are filled with 0.
  pivot_wider(
    id_cols = id,
    names_from = value,
    values_from = value,
    values_fn = length,
    values_fill = 0
  )
#> # A tibble: 4 x 8
#> id AAA ABA AAB BBA ABB BAA BBB
#> <chr> <int> <int> <int> <int> <int> <int> <int>
#> 1 1_1 1 1 1 1 0 0 0
#> 2 1_2 1 1 0 0 1 1 0
#> 3 1_3 0 1 0 0 0 1 0
#> 4 1_4 1 0 0 0 1 0 1
由 reprex package (v2.0.1) 于 2021-12-13 创建
data.table
library(data.table)
library(magrittr)
# Convert df to a data.table by reference (df itself is modified).
setDT(df)
# Melt to one (id, value) row per word, drop the NA padding, keep each
# (id, word) pair once so repeated words count as 1, then cast to wide form
# with one column per word (missing pairs become 0 via length of empty input).
melt(df, id.vars = "id") %>%
  na.omit() %>%
  unique(by = c("id", "value")) %>%
  dcast(formula = id ~ value, fun.aggregate = length)
#> id AAA AAB ABA ABB BAA BBA BBB
#> 1: 1_1 1 1 1 0 0 1 0
#> 2: 1_2 1 0 1 1 1 0 0
#> 3: 1_3 0 0 1 0 1 0 0
#> 4: 1_4 1 0 0 1 0 0 1
由 reprex package (v2.0.1) 于 2021-12-13 创建
基础
# Stack all word columns (everything except id) into a single `values` column.
df_stack <- stack(df, select = -id)
# stack() concatenates the columns one after another, so the ids repeat once
# per stacked column.
df_stack$id <- rep(df$id, ncol(df) - 1)
# table() drops the NA padding; `> 0` caps repeated words at one and the unary
# `+` turns the logical table back into 0/1 integers.
+(with(df_stack, table(id, values)) > 0)
#> values
#> id AAA AAB ABA ABB BAA BBA BBB
#> 1_1 1 1 1 0 0 1 0
#> 1_2 1 0 1 1 1 0 0
#> 1_3 0 0 1 0 1 0 0
#> 1_4 1 0 0 1 0 0 1
由 reprex package (v2.0.1) 于 2021-12-13 创建
试试下面的代码
# Use the ids as row names, then transpose so that each id becomes a column.
words_by_id <- as.data.frame(t(`row.names<-`(df[-1], df$id)))
# stack() yields (values, ind) pairs; table() drops the NA padding.
word_counts <- table(stack(words_by_id))
# Transpose to ids x words; `> 0` caps repeats at one and the unary `+`
# converts TRUE/FALSE to 1/0.
+(t(word_counts) > 0)
values
ind AAA AAB ABA ABB BAA BBA BBB
1_1 1 1 1 0 0 1 0
1_2 1 0 1 1 1 0 0
1_3 0 0 1 0 1 0 0
1_4 1 0 0 1 0 0 1
另一个选项:unite / cSplit_e
library(dplyr)
library(tidyr)
library(splitstackshape)
library(stringr)
# Join each row's words into one "_"-separated string (NA padding dropped),
# expand that string into one 0/1 indicator column per word, then strip the
# "new_" prefix from the indicator column names.
df %>%
  unite(col = new, starts_with("col"), na.rm = TRUE) %>%
  cSplit_e(
    split.col = "new",
    sep = "_",
    type = "character",
    fill = 0,
    drop = TRUE
  ) %>%
  rename_with(.fn = function(nm) str_remove(nm, "new_"), .cols = -id)
id AAA AAB ABA ABB BAA BBA BBB
1 1_1 1 1 1 0 0 1 0
2 1_2 1 0 1 1 1 0 0
3 1_3 0 0 1 0 1 0 0
4 1_4 1 0 0 1 0 0 1