使用 for 循环更高效地创建新变量
More efficient creation of a new variable using a for loop
我想知道是否有更有效的方法来做到这一点,因为我有一个数百万长的数据集,在这一步卡住了好几天。
# Builds `df` by turning each element of `Properties` into a one-row data
# frame and appending it with rbind(). Growing an object with rbind() inside
# a loop copies the accumulated result on every iteration, which is why this
# becomes extremely slow for millions of elements.
for (i in 1:32000000){
# NOTE(review): exists("df") is TRUE whenever any object named `df` is
# visible on the search path (presumably including stats::df in a default
# session) -- confirm the first branch is actually taken.
if (!exists("df")){
# First element: named vector -> one-column data frame -> transposed row.
df <- as.data.frame(Properties[[i]])
df <- as.data.frame(t(df))
}else{
# Later elements: build the one-row data frame and append it to `df`.
temp_dataset <- as.data.frame(Properties[[i]])
temp_dataset <- as.data.frame(t(temp_dataset))
df <- rbind(df, temp_dataset)
# Drop the temporary so it does not linger in the workspace.
rm(temp_dataset)
}
}
这段代码所做的基本上是创建一个新变量,并随着变量 i 遍历 1:32000000 不断向其追加新行。但是,正如我所说,这需要很多时间,所以我需要一种更有效的方法。
Properties
看起来像:
List of 32000000
$ : Named num [1:3] -0.85 -0.544 0.208
..- attr(*, "names")= chr [1:3] "PP1" "PP2" "PP3"
$ : Named num [1:3] -0.332 -0.698 0.264
..- attr(*, "names")= chr [1:3] "PP1" "PP2" "PP3"
$ : Named num [1:3] -0.768 -0.486 0.184
..- attr(*, "names")= chr [1:3] "PP1" "PP2" "PP3"
$ : Named num [1:3] -0.458 -0.57 -0.054
..- attr(*, "names")= chr [1:3] "PP1" "PP2" "PP3"
$ : Named num [1:3] -0.536 -0.458 0.348
..- attr(*, "names")= chr [1:3] "PP1" "PP2" "PP3"
$ : Named num [1:3] -0.47 -0.776 0.06
您可以尝试使用 data.table
中的 transpose()
。这应该很快。
示例数据:
# Example data: an unnamed list of n numeric vectors, each of length 3 and
# carrying the names PP1, PP2, PP3.
n <- 100000
Properties <- lapply(
  seq_len(n),
  function(k) structure(runif(3), names = c("PP1", "PP2", "PP3"))
)
# Show the first three elements.
head(Properties, 3)
# [[1]]
# PP1 PP2 PP3
# 0.8036237 0.9423731 0.9593770
#
# [[2]]
# PP1 PP2 PP3
# 0.1906879 0.5571697 0.9718734
#
# [[3]]
# PP1 PP2 PP3
# 0.7542362 0.3420677 0.4541527
堆叠代码:
# Stack the list into a data frame: data.table::transpose() regroups the
# per-row vectors into per-column vectors, and as.data.frame() assembles them.
# Column names come from the first element rather than being hard-coded, so
# the same line works for any set of element names.
df <- as.data.frame(data.table::transpose(Properties),
                    col.names = names(Properties[[1]]))
基准:
# Time base rbind stacking against data.table::transpose() on the same list.
microbenchmark::microbenchmark(
  do.call = do.call(rbind, Properties),
  data.table = as.data.frame(
    data.table::transpose(Properties),
    col.names = c("PP1", "PP2", "PP3")
  )
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# do.call 74.2183 83.29040 107.001017 96.63925 113.61070 322.4556 100
# data.table 4.6864 5.06845 6.163916 5.30285 5.56845 73.3627 100
一种方法是在 do.call
中使用 rbind
。
# Bind every element of Properties row-wise in one rbind() call instead of
# appending inside a loop.
# NOTE(review): for a list of named numeric vectors this presumably returns a
# matrix, not a data frame -- wrap in as.data.frame() if one is required.
do.call(rbind, Properties)
基准:(基于@Adam)
# Reproducible example data: n named numeric vectors of length 3.
set.seed(42)
n <- 1e5
Properties <- replicate(
  n,
  setNames(runif(3), c("PP1", "PP2", "PP3")),
  simplify = FALSE
)
# Time four ways of stacking the list. Result checking is switched off
# because the expressions do not all return the same kind of object.
bench::mark(
  dplyr = dplyr::bind_rows(Properties),
  rbind = do.call(rbind, Properties),
  data.table = setNames(
    data.table::transpose(Properties),
    names(Properties[[1]])
  ),
  unlist = matrix(
    unlist(Properties, recursive = FALSE, use.names = FALSE),
    ncol = length(Properties[[1]]),
    byrow = TRUE,
    dimnames = list(NULL, names(Properties[[1]]))
  ),
  check = FALSE
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 dplyr 1.53s 1.53s 0.652 8.78MB 10.4 1 16 1.53s
#2 rbind 74.19ms 86.81ms 6.92 2.29MB 3.46 4 2 578.4ms
#3 data.table 4.31ms 5.28ms 185. 4.58MB 12.0 93 6 501.56ms
#4 unlist 2.8ms 3.38ms 256. 4.58MB 22.0 128 11 500.31ms
# Same comparison with result checking left on: every expression is coerced
# to a matrix with the same column names so that all of them return the same
# object.
bench::mark(
  dplyr = as.matrix(dplyr::bind_rows(Properties)),
  rbind = do.call(rbind, Properties),
  data.table = do.call(
    cbind,
    setNames(data.table::transpose(Properties), names(Properties[[1]]))
  ),
  unlist = matrix(
    unlist(Properties, recursive = FALSE, use.names = FALSE),
    ncol = length(Properties[[1]]),
    byrow = TRUE,
    dimnames = list(NULL, names(Properties[[1]]))
  )
)
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 dplyr 1.49s 1.49s 0.673 11.06MB 8.75 1 13 1.49s
#2 rbind 73.26ms 87.91ms 7.68 2.29MB 5.12 6 4 781.51ms
#3 data.table 4.95ms 5.7ms 166. 6.87MB 19.7 84 10 507.06ms
#4 unlist 3.02ms 3.35ms 276. 4.58MB 32.0 138 16 500.4ms
在这种情况下,使用 unlist
和 matrix
是最快的方法。
我想知道是否有更有效的方法来做到这一点,因为我有一个数百万长的数据集,在这一步卡住了好几天。
# Builds `df` by turning each element of `Properties` into a one-row data
# frame and appending it with rbind(). Growing an object with rbind() inside
# a loop copies the accumulated result on every iteration, which is why this
# becomes extremely slow for millions of elements.
for (i in 1:32000000){
# NOTE(review): exists("df") is TRUE whenever any object named `df` is
# visible on the search path (presumably including stats::df in a default
# session) -- confirm the first branch is actually taken.
if (!exists("df")){
# First element: named vector -> one-column data frame -> transposed row.
df <- as.data.frame(Properties[[i]])
df <- as.data.frame(t(df))
}else{
# Later elements: build the one-row data frame and append it to `df`.
temp_dataset <- as.data.frame(Properties[[i]])
temp_dataset <- as.data.frame(t(temp_dataset))
df <- rbind(df, temp_dataset)
# Drop the temporary so it does not linger in the workspace.
rm(temp_dataset)
}
}
这段代码所做的基本上是创建一个新变量,并随着变量 i 遍历 1:32000000 不断向其追加新行。但是,正如我所说,这需要很多时间,所以我需要一种更有效的方法。
Properties
看起来像:
List of 32000000
$ : Named num [1:3] -0.85 -0.544 0.208
..- attr(*, "names")= chr [1:3] "PP1" "PP2" "PP3"
$ : Named num [1:3] -0.332 -0.698 0.264
..- attr(*, "names")= chr [1:3] "PP1" "PP2" "PP3"
$ : Named num [1:3] -0.768 -0.486 0.184
..- attr(*, "names")= chr [1:3] "PP1" "PP2" "PP3"
$ : Named num [1:3] -0.458 -0.57 -0.054
..- attr(*, "names")= chr [1:3] "PP1" "PP2" "PP3"
$ : Named num [1:3] -0.536 -0.458 0.348
..- attr(*, "names")= chr [1:3] "PP1" "PP2" "PP3"
$ : Named num [1:3] -0.47 -0.776 0.06
您可以尝试使用 data.table
中的 transpose()
。这应该很快。
示例数据:
# Example data: an unnamed list of n numeric vectors, each of length 3 and
# carrying the names PP1, PP2, PP3.
n <- 100000
Properties <- lapply(
  seq_len(n),
  function(k) structure(runif(3), names = c("PP1", "PP2", "PP3"))
)
# Show the first three elements.
head(Properties, 3)
# [[1]]
# PP1 PP2 PP3
# 0.8036237 0.9423731 0.9593770
#
# [[2]]
# PP1 PP2 PP3
# 0.1906879 0.5571697 0.9718734
#
# [[3]]
# PP1 PP2 PP3
# 0.7542362 0.3420677 0.4541527
堆叠代码:
# Stack the list into a data frame: data.table::transpose() regroups the
# per-row vectors into per-column vectors, and as.data.frame() assembles them.
# Column names come from the first element rather than being hard-coded, so
# the same line works for any set of element names.
df <- as.data.frame(data.table::transpose(Properties),
                    col.names = names(Properties[[1]]))
基准:
# Time base rbind stacking against data.table::transpose() on the same list.
microbenchmark::microbenchmark(
  do.call = do.call(rbind, Properties),
  data.table = as.data.frame(
    data.table::transpose(Properties),
    col.names = c("PP1", "PP2", "PP3")
  )
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# do.call 74.2183 83.29040 107.001017 96.63925 113.61070 322.4556 100
# data.table 4.6864 5.06845 6.163916 5.30285 5.56845 73.3627 100
一种方法是在 do.call
中使用 rbind
。
# Bind every element of Properties row-wise in one rbind() call instead of
# appending inside a loop.
# NOTE(review): for a list of named numeric vectors this presumably returns a
# matrix, not a data frame -- wrap in as.data.frame() if one is required.
do.call(rbind, Properties)
基准:(基于@Adam)
# Reproducible example data: n named numeric vectors of length 3.
set.seed(42)
n <- 1e5
Properties <- replicate(
  n,
  setNames(runif(3), c("PP1", "PP2", "PP3")),
  simplify = FALSE
)
# Time four ways of stacking the list. Result checking is switched off
# because the expressions do not all return the same kind of object.
bench::mark(
  dplyr = dplyr::bind_rows(Properties),
  rbind = do.call(rbind, Properties),
  data.table = setNames(
    data.table::transpose(Properties),
    names(Properties[[1]])
  ),
  unlist = matrix(
    unlist(Properties, recursive = FALSE, use.names = FALSE),
    ncol = length(Properties[[1]]),
    byrow = TRUE,
    dimnames = list(NULL, names(Properties[[1]]))
  ),
  check = FALSE
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 dplyr 1.53s 1.53s 0.652 8.78MB 10.4 1 16 1.53s
#2 rbind 74.19ms 86.81ms 6.92 2.29MB 3.46 4 2 578.4ms
#3 data.table 4.31ms 5.28ms 185. 4.58MB 12.0 93 6 501.56ms
#4 unlist 2.8ms 3.38ms 256. 4.58MB 22.0 128 11 500.31ms
# Same comparison with result checking left on: every expression is coerced
# to a matrix with the same column names so that all of them return the same
# object.
bench::mark(
  dplyr = as.matrix(dplyr::bind_rows(Properties)),
  rbind = do.call(rbind, Properties),
  data.table = do.call(
    cbind,
    setNames(data.table::transpose(Properties), names(Properties[[1]]))
  ),
  unlist = matrix(
    unlist(Properties, recursive = FALSE, use.names = FALSE),
    ncol = length(Properties[[1]]),
    byrow = TRUE,
    dimnames = list(NULL, names(Properties[[1]]))
  )
)
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 dplyr 1.49s 1.49s 0.673 11.06MB 8.75 1 13 1.49s
#2 rbind 73.26ms 87.91ms 7.68 2.29MB 5.12 6 4 781.51ms
#3 data.table 4.95ms 5.7ms 166. 6.87MB 19.7 84 10 507.06ms
#4 unlist 3.02ms 3.35ms 276. 4.58MB 32.0 138 16 500.4ms
在这种情况下,使用 unlist
和 matrix
是最快的方法。