将所有行粘贴在一起的最快方法

Fastest way to paste all rows together

我想将所有行按列粘贴到同一个单元格中

比如我有一个如下的表格:

library(tibble)

# Sample data from the question: three character columns, three rows.
tibble::tibble(
  Col1 = c("AA", "AB", "BC"),
  Col2 = c("AA", "AB", "BB"),
  Col3 = c("AB", "BB", "AA")
)
Col1  Col2  Col3

AA     AA    AB

AB     AB    BB
 
BC     BB    AA

我想要的输出是一个 3×1 的表格,如下所示:

Col1 AAABBC

Col2 AAABBB

Col3 ABBBAA 

然而,实际情况更复杂,因为我的原始table有600,000行和2000列。我想知道实现这一目标的最快方法是什么。我尝试了循环,但它花了很长时间才完成按列粘贴行。

感谢任何帮助,谢谢!

library(data.table)

dt <- fread('Col1  Col2  Col3
AA     AA    AB
AB     AB    BB
BC     BB    AA')

# After transposing, each original column becomes one row; pasting the
# transposed columns together element-wise glues every row into one string.
flipped <- transpose(dt)
flipped[, .(result = do.call(paste0, .SD))]
#>    result
#> 1: AAABBC
#> 2: AAABBB
#> 3: ABBBAA

#or


dt <- fread('Col1  Col2  Col3
AA     AA    AB
AB     AB    BB
BC     BB    AA')
# Collapse every column to a single string in one aggregation, then transpose
# the one-row result into one row per original column. Aggregating in `j`
# works for any number of columns (no hard-coded 1:3) and leaves `dt`
# unmodified, because no helper columns are added by reference.
transpose(dt[, lapply(.SD, paste, collapse = "")])
#>        V1
#> 1: AAABBC
#> 2: AAABBB
#> 3: ABBBAA

reprex package (v0.3.0)

于 2021-03-18 创建

第二种方法应该比第一种方法快。

# Collapse each column of `df` into a single string; the result is a list
# with one element per column.
lapply(df, function(column) paste(column, collapse = ""))

这会返回一个列表。如果您想要向量,请使用 sapply 而不是 lapply。如果您想要一个数据框,请将整个调用包装在 data.frame() 中。

如果您有足够的内存来存储数据的多个实例,这种使用 doParallel 包的方法可能会奏效。这里我使用的是 tidyverse 系列包。


library(tidyverse)
library(doParallel)

# Sample table from the question; the benchmark data below is built from it.
data <- tribble(
  ~Col1, ~Col2, ~Col3,
  "AA",  "AA",  "AB",
  "AB",  "AB",  "BB",
  "BC",  "BB",  "AA"
)

n <- 1000
# Stack the 3 sample rows n times (3,000 rows), then double the column count
# 10 times (3 * 2^10 = 3,072 columns).
big_table <- do.call("rbind", replicate(n, data, simplify = FALSE))
for (i in seq_len(10)) {
  big_table <- bind_cols(big_table, big_table)
}

# Get the list of column names
col_list <- names(big_table)
# Define number of cores you want to process
number_of_parallel_cores <- 4
# Split the column names into one contiguous group per core
col_group <- split(
  col_list,
  sort(rep_len(seq_len(number_of_parallel_cores), length(col_list)))
)

# Running the code with timer (worker registration is included in the timing)
system.time({
  registerDoParallel(number_of_parallel_cores)
  combine_data <- bind_rows(
    # .packages makes the dplyr/tidyr verbs available on workers that do not
    # inherit the parent session's attached packages.
    foreach(i_col_group = col_group, .packages = c("dplyr", "tidyr")) %dopar% {
      big_table %>%
        select(all_of(i_col_group)) %>%
        summarize(across(everything(), ~ paste(.x, collapse = ""))) %>%
        pivot_longer(cols = everything(), names_to = "col_names", values_to = "values")
    }
  )
})
# Release the workers registered above.
stopImplicitCluster()

耗时

   user  system elapsed 
  1.291   0.291   0.898 

输出

   col_names values                                                                                                                                                                                                               
   <chr>     <chr>                                                                                                                                                                                                                
 1 Col1...1  AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
 2 Col2...2  AAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAA…
 3 Col3...3  ABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAAB…
 4 Col1...4  AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
 5 Col2...5  AAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAA…
 6 Col3...6  ABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAAB…
 7 Col1...7  AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
 8 Col2...8  AAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAA…
 9 Col3...9  ABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAAB…
10 Col1...10 AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
# … with 3,062 more rows

但是我发现,与并行设置相比,不做并行、直接一次性处理所有列要快得多。我猜对于这个操作而言,并行带来的额外开销并不划算。

# Single-process baseline: collapse every column in one summarize() call and
# reshape to one row per column.
system.time(
  big_table %>%
    select(all_of(col_list)) %>%
    summarize(across(everything(), ~ paste(.x, collapse = ""))) %>%
    pivot_longer(cols = everything(), names_to = "col_names", values_to = "values")
)

   user  system elapsed 
  0.021   0.000   0.022 

我们可以使用 collapse 中的 dapply,它针对行操作进行了优化

library(collapse)
# MARGIN = 2 (the default) applies paste() down each column, which is the
# column-wise collapse the question asks for; MARGIN = 1 would instead paste
# across each row ("AAAAAB" "ABABBB" "BCBBAA").
dapply(df1, paste, collapse = "", MARGIN = 2)
#     Col1     Col2     Col3 
# "AAABBC" "AAABBB" "ABBBAA" 

根据?dapply

dapply efficiently applies functions to columns or rows of matrix-like objects and by default returns an object of the same type and with the same attributes. Alternatively it is possible to return the result in a plain matrix or data.frame. A simple parallelism is also available.