变换数据框

Transform data frame

我有一份问卷,其中包含一个开放式问题,例如 "Please name up to ten animals",它为我提供了以下数据框(其中每个字母代表一种动物):

nrow <- 1000
list <- vector("list", nrow)

for(i in 1:nrow){
  na <- rep(NA, sample(1:10, 1))
  list[[i]] <- sample(c(letters, na), 10, replace=FALSE)
}

df <- data.frame()
df <- rbind(df, do.call(rbind, list))

head(df)
# V1   V2 V3 V4   V5 V6   V7 V8 V9  V10
# 1  r <NA>  a  j    w  e    i  h  u    z
# 2  t    o  e  x    d  v <NA>  z  n    c
# 3  f    y  e  s    n  c    z  i  u    k
# 4  y <NA>  v  j    h  z    p  i  c    q
# 5  w    s  v  f <NA>  c    g  b  x    e
# 6  p <NA>  a  h    v  x    k  z  o <NA>

如何将此数据框转换为如下所示的数据框?请记住,我实际上并不知道列名。

 r <- 1000
 c <- length(letters)
 t1 <- matrix(rbinom(r*c,1,0.5),r,c)
 colnames(t1) <- letters
 head(t1)
 #      a b c d e f g h i j k l m n o p q r s t u v w x y z
 # [1,] 0 1 0 1 0 0 0 1 0 0 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0
 # [2,] 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 0 1 1 1 0 0 1 0 1 0 1
 # [3,] 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
 # [4,] 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 1 0 1 1 0 0
 # [5,] 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0
 # [6,] 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 1 0 1
td <-  data.frame(t(apply(df, 1, function(x) as.numeric( unique(unlist(df)) %in% x))))
colnames (td) <- unique(unlist(df))

letters 可以替换为动物名称向量 colnames(t1).

您可以使用 tidyr 执行以下操作,这可能比其他方法快得多,但我非常喜欢@germcd 的方法。您可能需要修补 select,删除 NA 以及空白 space,这可能是您提供的模拟数据的产物:

require(tidyr)

##  Add an ID for each record:
df$id <- 1:nrow(df)

out <- (df %>% 
  gather(column, animal, -id) %>% 
  filter(animal != " ") %>% 
  spread(animal, column)
)

head(out)

此代码将未命名的列收集成长格式,删除任何空列或缺失数据,然后按 animal 列的唯一值展开。这也有可能需要 属性 保留动物命名的列顺序。如果不需要,那么您可以轻松地将生成的动物列转换为数字:

out_num <- out
out_num[,-1] <- as.numeric((!is.na(out[,-1])))
head(out_num)

您可以尝试 "qdapTools" 包中的 mtabulate

library(qdapTools)
head(mtabulate(as.data.frame(t(df))))
#   c d i l m o r v x y a f s t k p u b h j n q e g w z
# 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 2 0 1 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
# 3 0 0 1 0 0 0 1 0 1 1 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0
# 4 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0
# 5 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 0 0 0
# 6 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 0 0

当然还有很多其他选择。

例如,我的 "splitstackshape" 包中的 cSplit_e(缺点是效率低下,您需要先将值粘贴在一起,然后才能拆分它们):

library(splitstackshape)
library(dplyr)

作为 1 和 0:

df %>%
  mutate(combined = apply(., 1, function(x) paste(na.omit(x), collapse = ","))) %>%
  cSplit_e("combined", ",", mode = "binary", type = "character", fill = 0) %>%
  select(starts_with("combined_")) %>%
  head
#   combined_a combined_b combined_c combined_d combined_e combined_f combined_g combined_h combined_i
# 1          0          0          1          1          0          0          0          0          1
# 2          1          0          0          1          0          1          0          0          0
# 3          1          0          0          0          0          0          0          0          1
# 4          0          1          1          0          0          0          0          1          1
# 5          0          1          0          1          0          0          0          1          0
# 6          0          1          0          0          0          0          0          0          0
#   combined_j combined_k combined_l combined_m combined_n combined_o combined_p combined_q combined_r
# 1          0          0          1          1          0          1          0          0          1
# 2          0          0          0          1          0          0          0          0          0
# 3          0          1          0          0          0          0          1          0          1
# 4          1          0          1          0          1          0          0          0          0
# 5          0          1          0          0          1          0          1          1          1
# 6          1          1          0          1          0          0          0          1          0
#   combined_s combined_t combined_u combined_v combined_w combined_x combined_y combined_z
# 1          0          0          0          1          0          1          1          0
# 2          1          1          0          0          0          0          0          0
# 3          0          1          1          0          0          1          1          0
# 4          0          0          1          0          0          0          1          0
# 5          1          0          0          0          0          0          0          0
# 6          1          1          1          0          0          0          0          0

作为原始值:

df %>%
  mutate(combined = apply(., 1, function(x) paste(na.omit(x), collapse = ","))) %>%
  cSplit_e("combined", ",", mode = "value", type = "character", fill = "") %>%
  select(starts_with("combined_")) %>%
  head
#   combined_a combined_b combined_c combined_d combined_e combined_f combined_g combined_h combined_i
# 1                                c          d                                                      i
# 2          a                                d                     f                                 
# 3          a                                                                                       i
# 4                     b          c                                                      h          i
# 5                     b                     d                                           h           
# 6                     b                                                                             
#   combined_j combined_k combined_l combined_m combined_n combined_o combined_p combined_q combined_r
# 1                                l          m                     o                                r
# 2                                           m                                                       
# 3                     k                                                      p                     r
# 4          j                     l                     n                                            
# 5                     k                                n                     p          q          r
# 6          j          k                     m                                           q           
#   combined_s combined_t combined_u combined_v combined_w combined_x combined_y combined_z
# 1                                           v                     x          y           
# 2          s          t                                                                  
# 3                     t          u                                x          y           
# 4                                u                                           y           
# 5          s                                                                             
# 6          s          t          u   

或者,您可以使用 "reshape2":

library(reshape2)

## The values
dcast(melt(as.matrix(df), na.rm = TRUE), 
      Var1 ~ value, value.var = "value")

## ones and zeroes
dcast(melt(as.matrix(df), na.rm = TRUE), 
      Var1 ~ value, value.var = "value", fun.aggregate = length)