R data.table 按行填充行 NA
R data.table fill row NAs, by row
我想用 'locf'(用前一个非缺失值向后填充)的方式填充 data.table 中各行的 NA,并且每一行要单独处理。但下面的代码似乎得不到想要的结果:
library(data.table)  # library() fails loudly if the package is missing
set.seed(456)
# some dummy data: ten value columns (a..j) plus xx, which must be left alone
dt <- data.table(
  a = sample(1:4, 6, replace = TRUE), b = sample(1:4, 6, replace = TRUE),
  c = sample(1:4, 6, replace = TRUE), d = sample(1:4, 6, replace = TRUE),
  e = sample(1:4, 6, replace = TRUE), f = sample(1:4, 6, replace = TRUE),
  g = sample(1:4, 6, replace = TRUE), h = sample(1:4, 6, replace = TRUE),
  i = sample(1:4, 6, replace = TRUE), j = sample(1:4, 6, replace = TRUE),
  xx = sample(1:4, 6, replace = TRUE)
)
# blank out a few cells so there is something to fill
dt[4, c := NA]
dt[1, g := NA]
dt[1, h := NA]
# set colnames: every column except xx is to be filled
cols <- setdiff(names(dt), "xx")
# use nafill over rows (one group per row)
# NOTE(review): nafill() fills along each column, so with a single row per
# group there is presumably nothing to carry forward -- which would explain
# why the table comes back unchanged.
dt[, (cols) := nafill(.SD, type = "locf"), by = seq_len(nrow(dt)), .SDcols = cols]
结果与原始的 data.table 没有任何区别,我漏掉了什么?
a b c d e f g h i j xx
1: 1 3 3 2 3 1 NA NA 4 3 1
2: 1 1 2 2 1 2 2 1 2 4 1
3: 3 2 3 1 1 4 3 3 2 1 2
4: 2 3 NA 1 2 2 1 4 3 4 2
5: 1 2 3 4 4 3 2 2 2 4 3
6: 4 1 4 2 1 4 4 3 3 4 3
(注意:实际数据有 1200 万行,不知这是否会对性能有影响。)
一种方法是使用 for 循环。它并不是逐行处理,而是对 cols 中的每一列,一次性处理“该列中值为 NA 的所有行”。
# Walk the columns left to right. Each column borrows from its left-hand
# neighbour, so a run of consecutive NAs is filled by cascading.
for (k in seq_along(cols)[-1]) {
  left <- cols[k - 1]
  target <- cols[k]
  # only rows where the target column is NA are touched
  dt[is.na(get(target)), (target) := fcoalesce(get(target), get(left))]
}
dt
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 3 3 2 3 1 1 1 4 3 1
# 2: 1 1 2 2 1 2 2 1 2 4 1
# 3: 3 2 3 1 1 4 3 3 2 1 2
# 4: 2 3 3 1 2 2 1 4 3 4 2
# 5: 1 2 3 4 4 3 2 2 2 4 3
# 6: 4 1 4 2 1 4 4 3 3 4 3
诚然,get(.) 的用法并不完美,但我认为通常没有问题。
另一种方法,速度差不多(取决于数据的大小):
# Fold over the columns left to right: each column is coalesced with the
# already-filled column before it, and accumulate = TRUE keeps every step.
fill_from_left <- function(prev, this) fcoalesce(this, prev)
dt[, (cols) := Reduce(fill_from_left, .SD, accumulate = TRUE), .SDcols = cols]
# same results
基准测试:既然你提到有 12M 行,性能就很重要。
我将使用 2M 行,并改进随机生成 NA 的方法。
library(data.table)
set.seed(456)
n <- 2e6 # switch to 6e5 for the smaller side-by-side comparison
dt <- data.table(
  a = sample(1:4, n, replace = TRUE), b = sample(1:4, n, replace = TRUE),
  c = sample(1:4, n, replace = TRUE), d = sample(1:4, n, replace = TRUE),
  e = sample(1:4, n, replace = TRUE), f = sample(1:4, n, replace = TRUE),
  g = sample(1:4, n, replace = TRUE), h = sample(1:4, n, replace = TRUE),
  i = sample(1:4, n, replace = TRUE), j = sample(1:4, n, replace = TRUE),
  xx = sample(1:4, n, replace = TRUE)
)
# Draw random (row, column) pairs, drop duplicate pairs, and blank those cells.
n_draws <- ceiling(n * 11 / 20)
mtx <- cbind(
  sample(nrow(dt), n_draws, replace = TRUE),
  sample(ncol(dt), n_draws, replace = TRUE)
)
mtx <- mtx[!duplicated(mtx), ]
dt[mtx] <- NA
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 2 3 2 1 2 3 3 2 2
# 2: 1 3 4 1 4 4 3 2 4 3 3
# 3: 3 4 2 2 3 4 2 2 1 NA 1
# 4: 2 1 4 1 2 3 NA 4 4 4 3
# 5: 1 2 3 3 4 3 3 NA 1 4 1
# 6: 4 3 4 2 2 NA 4 1 2 4 2
不幸的是,在这个数据量下 transpose 方法失败了:
system.time({
  # transpose, fill down each column, then transpose back
  dt2 <- transpose(dt)
  setnafill(dt2, type = "locf")
  dt2 <- transpose(dt2)
  # the round trip loses the column names, so restore them
  setnames(dt2, names(dt))
})
# Error: cannot allocate vector of size 30.6 Gb
但是 for 循环(顺便说一句,Reduce 也一样)可以正常工作:
# Fill every column, xx included, so the result is comparable with the
# transpose approach (which fills all columns). copy() detaches the vector
# from the table's own names.
cols <- copy(names(dt))
system.time({
  for (i in seq_along(cols)[-1]) {
    prevcol <- cols[i - 1]
    thiscol <- cols[i]
    # rows with NA in this column take the value from the column to the left
    dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol))]
  }
})
# user system elapsed
# 0.14 0.00 0.11
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 2 3 2 1 2 3 3 2 2
# 2: 1 3 4 1 4 4 3 2 4 3 3
# 3: 3 4 2 2 3 4 2 2 1 1 1
# 4: 2 1 4 1 2 3 3 4 4 4 3
# 5: 1 2 3 3 4 3 3 3 1 4 1
# 6: 4 3 4 2 2 2 4 1 2 4 2
如果我把数据规模缩小到 600K 行,两种方法就都能运行。(我不知道我的系统的临界点在哪里……也许是 1M,谁知道呢,我只是想把它们并排比较一下。)使用 n <- 6e5 重新生成 dt 后,我得到以下数据和简单的计时结果:
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 3 1 3 4 NA 3 3 3 3
# 2: 1 3 2 2 4 3 1 2 2 4 1
# 3: 3 4 2 1 1 1 1 4 2 4 2
# 4: 2 4 1 NA 1 4 3 1 4 1 1
# 5: 1 NA 4 2 NA NA 4 4 2 2 NA
# 6: 4 1 4 4 1 2 3 3 1 1 2
sum(is.na(dt))
# [1] 321782
system.time({
  # transpose, fill down each column, transpose back, restore the names
  dt2 <- transpose(dt)
  setnafill(dt2, type = "locf")
  dt2 <- transpose(dt2)
  setnames(dt2, names(dt))
})
# user system elapsed
# 4.27 4.50 7.74
sum(is.na(dt)) # 'dt' is unchanged, only important here to compare the 'for' loop
# [1] 321782
sum(is.na(dt2)) # rows with leading columns having 'NA', nothing to coalesce, not surprising
# [1] 30738
# Fill every column, xx included, so the result can be compared with dt2
# (the transpose approach fills all columns).
cols <- copy(names(dt))
system.time({
  for (i in seq_along(cols)[-1]) {
    prevcol <- cols[i - 1]
    thiscol <- cols[i]
    dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol))]
  }
})
# user system elapsed
# 0.10 0.03 0.06
identical(dt, dt2)
# [1] TRUE
### regenerate `dt` so it has `NA`s again
system.time({
  dt[, (cols) := Reduce(function(prev, this) fcoalesce(this, prev), .SD, accumulate = TRUE),
     .SDcols = cols]
})
# user system elapsed
# 0.03 0.00 0.03
identical(dt, dt2)
# [1] TRUE
像 bench::mark 这样更严谨的基准测试,会因为每次迭代都需要 copy(dt) 而受到一些影响。虽然这个开销并不大:
# Measure what a single copy(dt) costs on its own.
bench::mark(copy(dt))
# # A tibble: 1 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 copy(dt) 7.77ms 20.9ms 45.1 25.2MB 0 23 0 510ms <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [14 x 3]> <bch:tm [23]> <tibble [23 x 3]>
但它仍然是额外的开销。因此,我将对 transpose 代码测试两次,一次带 copy(dt),一次不带,以便更公平地与 for 和 reduce 两种答案进行比较。(注意:bench::mark 默认会验证所有输出都 identical。这项检查可以禁用,但我没有这样做,因此所有代码块返回的结果都是相同的。)
bench::mark(
  # transpose approach reading `dt` directly (no copy)
  transpose1 = {
    dt2 <- transpose(dt)
    setnafill(dt2, type = "locf")
    dt2 <- transpose(dt2)
    setnames(dt2, names(dt))
    dt2
  },
  # same approach, but paying for copy(dt) like the in-place methods below
  transpose2 = {
    dt0 <- copy(dt)
    dt2 <- transpose(dt0)
    setnafill(dt2, type = "locf")
    dt2 <- transpose(dt2)
    setnames(dt2, names(dt0))
    dt2
  },
  # column-by-column loop; works on a copy because := modifies in place
  forloop = {
    dt0 <- copy(dt)
    for (k in seq_along(cols)[-1]) {
      left <- cols[k - 1]
      target <- cols[k]
      dt0[is.na(get(target)), (target) := fcoalesce(get(target), get(left))]
    }
    dt0
  },
  # single Reduce() pass over the columns, also on a copy
  reduce = {
    dt0 <- copy(dt)
    dt0[, (cols) := Reduce(function(prev, this) fcoalesce(this, prev), .SD, accumulate = TRUE),
        .SDcols = cols]
  },
  min_iterations = 10
)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 transpose1 4.94s 5.48s 0.154 1.28GB 0.201 10 13 1.08m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [33,008 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 2 transpose2 5.85s 6.29s 0.130 1.3GB 0.259 10 20 1.29m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [15,316 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 3 forloop 48.37ms 130.91ms 2.87 71.14MB 0 10 0 3.49s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [191 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 4 reduce 48.08ms 75.82ms 4.70 71MB 0.470 10 1 2.13s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [38 x 3]> <bch:tm [10]> <tibble [10 x 3]>
由此可见:
- 时间:统一换算为毫秒后,4940ms 对比 48ms,transpose 方法表现很差;
- 内存:1.28GB 对比 71MB,同样表现很差。
(编辑:将基准测试的最小迭代次数增加到 10。)
把 data.table 转置过去、填充后再转置回来,怎么样?
# After transposing, each original row is a column, so a column-wise
# LOCF fill becomes a row-wise fill of the original table.
dt2 <- transpose(dt)
setnafill(dt2, type = "locf")
dt2 <- transpose(dt2)
# the round trip loses the column names, so restore them
setnames(dt2, names(dt))
注意:从@r2evans 的回答可以看出,这个解决方案相当慢。
解决此问题的另一种方法是使用 set 函数。该解决方案既快速又非常节省内存。我还在一个 12M 行的 data.table 上,将它与 @r2evans 的 forloop 和 Reduce 方案进行了比较。
我还考虑了 @r2evans 回答中 forloop 方案的一个修改版本(下面的 forloop1)。新版本只是去掉了 data.table 参数 i 中的表达式 (is.na(get(thiscol)))。与原始版本相比,这一改动降低了内存占用并提升了性能。
library(data.table)
# Walk the columns left to right and overwrite each one, by reference, with
# itself coalesced against its left-hand neighbour. Reading from the same
# table that is being written lets fills cascade across consecutive NAs.
# Columns are addressed by name, so the loop does not depend on where the
# `cols` columns sit inside `dt`.
for (cl in seq_along(cols)[-1L]) {
  set(dt, j = cols[cl], value = fcoalesce(dt[[cols[cl]]], dt[[cols[cl - 1L]]]))
}
基准
n <- 12e6
set.seed(0123456789)
# Seven columns, each drawn from 1:4 and NA with weights 5:1 / 15.
d <- setDT(
  replicate(
    7,
    sample(c(1:4, NA), size = n, replace = TRUE, prob = (5:1) / 15),
    simplify = FALSE
  )
)
setnames(d, c(letters[1:6], "xx"))
# every column except xx is to be filled
cols <- setdiff(names(d), "xx")
# one private table per benchmarked method
dt0 <- copy(d)
dt1 <- copy(d)
dt2 <- copy(d)
dt3 <- copy(d)
# NOTE(review): dt0..dt3 are updated by reference and are not re-copied
# inside the expressions, so iterations after the first appear to run on
# already-filled tables -- confirm this is intended.
bench::mark(
  # modified version
  forloop1 = {
    for (k in seq_along(cols)[-1]) {
      left <- cols[k - 1]
      target <- cols[k]
      # i not specified: every row is coalesced, no NA filter
      dt0[, (target) := fcoalesce(get(target), get(left))]
    }
    dt0
  },
  # original version
  forloop2 = {
    for (k in seq_along(cols)[-1]) {
      left <- cols[k - 1]
      target <- cols[k]
      dt1[is.na(get(target)), (target) := fcoalesce(get(target), get(left))]
    }
    dt1
  },
  # single Reduce() pass over the columns
  reduce = {
    dt2[, (cols) := Reduce(function(prev, this) fcoalesce(this, prev), .SD, accumulate = TRUE),
        .SDcols = cols]
  },
  # set() addressed by column position
  set = {
    for (cl in seq_along(cols)[-1L]) {
      set(dt3, j = cl, value = fcoalesce(dt3[[cl]], dt3[[cl - 1L]]))
    }
    dt3
  },
  min_iterations = 5L
)
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 forloop1 77.1ms 87.9ms 10.9 229MB 2.74 4 1 366ms <data.table [12,000,000 x 7]> <Rprofmem [134 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 2 forloop2 192.8ms 201.3ms 5.01 460MB 3.34 3 2 599ms <data.table [12,000,000 x 7]> <Rprofmem [183 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 3 reduce 114.5ms 130.2ms 7.76 458MB 5.17 3 2 387ms <data.table [12,000,000 x 7]> <Rprofmem [21 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 4 set 65.6ms 68.5ms 14.5 229MB 9.65 3 2 207ms <data.table [12,000,000 x 7]> <Rprofmem [76 x 3]> <bench_tm [5]> <tibble [5 x 3]>
使用 set 函数可以提高性能并降低内存占用。我个人更关心中位数时间(而不是 total_time)。
我想用 'locf'(用前一个非缺失值向后填充)的方式填充 data.table 中各行的 NA,并且每一行要单独处理。但下面的代码似乎得不到想要的结果:
library(data.table)  # library() fails loudly if the package is missing
set.seed(456)
# some dummy data: ten value columns (a..j) plus xx, which must be left alone
dt <- data.table(
  a = sample(1:4, 6, replace = TRUE), b = sample(1:4, 6, replace = TRUE),
  c = sample(1:4, 6, replace = TRUE), d = sample(1:4, 6, replace = TRUE),
  e = sample(1:4, 6, replace = TRUE), f = sample(1:4, 6, replace = TRUE),
  g = sample(1:4, 6, replace = TRUE), h = sample(1:4, 6, replace = TRUE),
  i = sample(1:4, 6, replace = TRUE), j = sample(1:4, 6, replace = TRUE),
  xx = sample(1:4, 6, replace = TRUE)
)
# blank out a few cells so there is something to fill
dt[4, c := NA]
dt[1, g := NA]
dt[1, h := NA]
# set colnames: every column except xx is to be filled
cols <- setdiff(names(dt), "xx")
# use nafill over rows (one group per row)
# NOTE(review): nafill() fills along each column, so with a single row per
# group there is presumably nothing to carry forward -- which would explain
# why the table comes back unchanged.
dt[, (cols) := nafill(.SD, type = "locf"), by = seq_len(nrow(dt)), .SDcols = cols]
结果与原始的 data.table 没有任何区别,我漏掉了什么?
a b c d e f g h i j xx
1: 1 3 3 2 3 1 NA NA 4 3 1
2: 1 1 2 2 1 2 2 1 2 4 1
3: 3 2 3 1 1 4 3 3 2 1 2
4: 2 3 NA 1 2 2 1 4 3 4 2
5: 1 2 3 4 4 3 2 2 2 4 3
6: 4 1 4 2 1 4 4 3 3 4 3
(注意:实际数据有 1200 万行,不知这是否会对性能有影响。)
一种方法是使用 for 循环。它并不是逐行处理,而是对 cols 中的每一列,一次性处理“该列中值为 NA 的所有行”。
# Walk the columns left to right. Each column borrows from its left-hand
# neighbour, so a run of consecutive NAs is filled by cascading.
for (k in seq_along(cols)[-1]) {
  left <- cols[k - 1]
  target <- cols[k]
  # only rows where the target column is NA are touched
  dt[is.na(get(target)), (target) := fcoalesce(get(target), get(left))]
}
dt
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 3 3 2 3 1 1 1 4 3 1
# 2: 1 1 2 2 1 2 2 1 2 4 1
# 3: 3 2 3 1 1 4 3 3 2 1 2
# 4: 2 3 3 1 2 2 1 4 3 4 2
# 5: 1 2 3 4 4 3 2 2 2 4 3
# 6: 4 1 4 2 1 4 4 3 3 4 3
诚然,get(.) 的用法并不完美,但我认为通常没有问题。
另一种方法,速度差不多(取决于数据的大小):
# Fold over the columns left to right: each column is coalesced with the
# already-filled column before it, and accumulate = TRUE keeps every step.
fill_from_left <- function(prev, this) fcoalesce(this, prev)
dt[, (cols) := Reduce(fill_from_left, .SD, accumulate = TRUE), .SDcols = cols]
# same results
基准测试:既然你提到有 12M 行,性能就很重要。
我将使用 2M 行,并改进随机生成 NA 的方法。
library(data.table)
set.seed(456)
n <- 2e6 # switch to 6e5 for the smaller side-by-side comparison
dt <- data.table(
  a = sample(1:4, n, replace = TRUE), b = sample(1:4, n, replace = TRUE),
  c = sample(1:4, n, replace = TRUE), d = sample(1:4, n, replace = TRUE),
  e = sample(1:4, n, replace = TRUE), f = sample(1:4, n, replace = TRUE),
  g = sample(1:4, n, replace = TRUE), h = sample(1:4, n, replace = TRUE),
  i = sample(1:4, n, replace = TRUE), j = sample(1:4, n, replace = TRUE),
  xx = sample(1:4, n, replace = TRUE)
)
# Draw random (row, column) pairs, drop duplicate pairs, and blank those cells.
n_draws <- ceiling(n * 11 / 20)
mtx <- cbind(
  sample(nrow(dt), n_draws, replace = TRUE),
  sample(ncol(dt), n_draws, replace = TRUE)
)
mtx <- mtx[!duplicated(mtx), ]
dt[mtx] <- NA
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 2 3 2 1 2 3 3 2 2
# 2: 1 3 4 1 4 4 3 2 4 3 3
# 3: 3 4 2 2 3 4 2 2 1 NA 1
# 4: 2 1 4 1 2 3 NA 4 4 4 3
# 5: 1 2 3 3 4 3 3 NA 1 4 1
# 6: 4 3 4 2 2 NA 4 1 2 4 2
不幸的是,在这个数据量下 transpose 方法失败了:
system.time({
  # transpose, fill down each column, then transpose back
  dt2 <- transpose(dt)
  setnafill(dt2, type = "locf")
  dt2 <- transpose(dt2)
  # the round trip loses the column names, so restore them
  setnames(dt2, names(dt))
})
# Error: cannot allocate vector of size 30.6 Gb
但是 for 循环(顺便说一句,Reduce 也一样)可以正常工作:
# Fill every column, xx included, so the result is comparable with the
# transpose approach (which fills all columns). copy() detaches the vector
# from the table's own names.
cols <- copy(names(dt))
system.time({
  for (i in seq_along(cols)[-1]) {
    prevcol <- cols[i - 1]
    thiscol <- cols[i]
    # rows with NA in this column take the value from the column to the left
    dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol))]
  }
})
# user system elapsed
# 0.14 0.00 0.11
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 2 3 2 1 2 3 3 2 2
# 2: 1 3 4 1 4 4 3 2 4 3 3
# 3: 3 4 2 2 3 4 2 2 1 1 1
# 4: 2 1 4 1 2 3 3 4 4 4 3
# 5: 1 2 3 3 4 3 3 3 1 4 1
# 6: 4 3 4 2 2 2 4 1 2 4 2
如果我把数据规模缩小到 600K 行,两种方法就都能运行。(我不知道我的系统的临界点在哪里……也许是 1M,谁知道呢,我只是想把它们并排比较一下。)使用 n <- 6e5 重新生成 dt 后,我得到以下数据和简单的计时结果:
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 3 1 3 4 NA 3 3 3 3
# 2: 1 3 2 2 4 3 1 2 2 4 1
# 3: 3 4 2 1 1 1 1 4 2 4 2
# 4: 2 4 1 NA 1 4 3 1 4 1 1
# 5: 1 NA 4 2 NA NA 4 4 2 2 NA
# 6: 4 1 4 4 1 2 3 3 1 1 2
sum(is.na(dt))
# [1] 321782
system.time({
  # transpose, fill down each column, transpose back, restore the names
  dt2 <- transpose(dt)
  setnafill(dt2, type = "locf")
  dt2 <- transpose(dt2)
  setnames(dt2, names(dt))
})
# user system elapsed
# 4.27 4.50 7.74
sum(is.na(dt)) # 'dt' is unchanged, only important here to compare the 'for' loop
# [1] 321782
sum(is.na(dt2)) # rows with leading columns having 'NA', nothing to coalesce, not surprising
# [1] 30738
# Fill every column, xx included, so the result can be compared with dt2
# (the transpose approach fills all columns).
cols <- copy(names(dt))
system.time({
  for (i in seq_along(cols)[-1]) {
    prevcol <- cols[i - 1]
    thiscol <- cols[i]
    dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol))]
  }
})
# user system elapsed
# 0.10 0.03 0.06
identical(dt, dt2)
# [1] TRUE
### regenerate `dt` so it has `NA`s again
system.time({
  dt[, (cols) := Reduce(function(prev, this) fcoalesce(this, prev), .SD, accumulate = TRUE),
     .SDcols = cols]
})
# user system elapsed
# 0.03 0.00 0.03
identical(dt, dt2)
# [1] TRUE
像 bench::mark 这样更严谨的基准测试,会因为每次迭代都需要 copy(dt) 而受到一些影响。虽然这个开销并不大:
# Measure what a single copy(dt) costs on its own.
bench::mark(copy(dt))
# # A tibble: 1 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 copy(dt) 7.77ms 20.9ms 45.1 25.2MB 0 23 0 510ms <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [14 x 3]> <bch:tm [23]> <tibble [23 x 3]>
但它仍然是额外的开销。因此,我将对 transpose 代码测试两次,一次带 copy(dt),一次不带,以便更公平地与 for 和 reduce 两种答案进行比较。(注意:bench::mark 默认会验证所有输出都 identical。这项检查可以禁用,但我没有这样做,因此所有代码块返回的结果都是相同的。)
bench::mark(
  # transpose approach reading `dt` directly (no copy)
  transpose1 = {
    dt2 <- transpose(dt)
    setnafill(dt2, type = "locf")
    dt2 <- transpose(dt2)
    setnames(dt2, names(dt))
    dt2
  },
  # same approach, but paying for copy(dt) like the in-place methods below
  transpose2 = {
    dt0 <- copy(dt)
    dt2 <- transpose(dt0)
    setnafill(dt2, type = "locf")
    dt2 <- transpose(dt2)
    setnames(dt2, names(dt0))
    dt2
  },
  # column-by-column loop; works on a copy because := modifies in place
  forloop = {
    dt0 <- copy(dt)
    for (k in seq_along(cols)[-1]) {
      left <- cols[k - 1]
      target <- cols[k]
      dt0[is.na(get(target)), (target) := fcoalesce(get(target), get(left))]
    }
    dt0
  },
  # single Reduce() pass over the columns, also on a copy
  reduce = {
    dt0 <- copy(dt)
    dt0[, (cols) := Reduce(function(prev, this) fcoalesce(this, prev), .SD, accumulate = TRUE),
        .SDcols = cols]
  },
  min_iterations = 10
)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 transpose1 4.94s 5.48s 0.154 1.28GB 0.201 10 13 1.08m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [33,008 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 2 transpose2 5.85s 6.29s 0.130 1.3GB 0.259 10 20 1.29m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [15,316 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 3 forloop 48.37ms 130.91ms 2.87 71.14MB 0 10 0 3.49s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [191 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 4 reduce 48.08ms 75.82ms 4.70 71MB 0.470 10 1 2.13s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [38 x 3]> <bch:tm [10]> <tibble [10 x 3]>
由此可见:
- 时间:统一换算为毫秒后,4940ms 对比 48ms,transpose 方法表现很差;
- 内存:1.28GB 对比 71MB,同样表现很差。
(编辑:将基准测试的最小迭代次数增加到 10。)
把 data.table 转置过去、填充后再转置回来,怎么样?
# After transposing, each original row is a column, so a column-wise
# LOCF fill becomes a row-wise fill of the original table.
dt2 <- transpose(dt)
setnafill(dt2, type = "locf")
dt2 <- transpose(dt2)
# the round trip loses the column names, so restore them
setnames(dt2, names(dt))
注意:从@r2evans 的回答可以看出,这个解决方案相当慢。
解决此问题的另一种方法是使用 set 函数。该解决方案既快速又非常节省内存。我还在一个 12M 行的 data.table 上,将它与 @r2evans 的 forloop 和 Reduce 方案进行了比较。
我还考虑了 @r2evans 回答中 forloop 方案的一个修改版本(下面的 forloop1)。新版本只是去掉了 data.table 参数 i 中的表达式 (is.na(get(thiscol)))。与原始版本相比,这一改动降低了内存占用并提升了性能。
library(data.table)
# Walk the columns left to right and overwrite each one, by reference, with
# itself coalesced against its left-hand neighbour. Reading from the same
# table that is being written lets fills cascade across consecutive NAs.
# Columns are addressed by name, so the loop does not depend on where the
# `cols` columns sit inside `dt`.
for (cl in seq_along(cols)[-1L]) {
  set(dt, j = cols[cl], value = fcoalesce(dt[[cols[cl]]], dt[[cols[cl - 1L]]]))
}
基准
n <- 12e6
set.seed(0123456789)
# Seven columns, each drawn from 1:4 and NA with weights 5:1 / 15.
d <- setDT(
  replicate(
    7,
    sample(c(1:4, NA), size = n, replace = TRUE, prob = (5:1) / 15),
    simplify = FALSE
  )
)
setnames(d, c(letters[1:6], "xx"))
# every column except xx is to be filled
cols <- setdiff(names(d), "xx")
# one private table per benchmarked method
dt0 <- copy(d)
dt1 <- copy(d)
dt2 <- copy(d)
dt3 <- copy(d)
# NOTE(review): dt0..dt3 are updated by reference and are not re-copied
# inside the expressions, so iterations after the first appear to run on
# already-filled tables -- confirm this is intended.
bench::mark(
  # modified version
  forloop1 = {
    for (k in seq_along(cols)[-1]) {
      left <- cols[k - 1]
      target <- cols[k]
      # i not specified: every row is coalesced, no NA filter
      dt0[, (target) := fcoalesce(get(target), get(left))]
    }
    dt0
  },
  # original version
  forloop2 = {
    for (k in seq_along(cols)[-1]) {
      left <- cols[k - 1]
      target <- cols[k]
      dt1[is.na(get(target)), (target) := fcoalesce(get(target), get(left))]
    }
    dt1
  },
  # single Reduce() pass over the columns
  reduce = {
    dt2[, (cols) := Reduce(function(prev, this) fcoalesce(this, prev), .SD, accumulate = TRUE),
        .SDcols = cols]
  },
  # set() addressed by column position
  set = {
    for (cl in seq_along(cols)[-1L]) {
      set(dt3, j = cl, value = fcoalesce(dt3[[cl]], dt3[[cl - 1L]]))
    }
    dt3
  },
  min_iterations = 5L
)
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 forloop1 77.1ms 87.9ms 10.9 229MB 2.74 4 1 366ms <data.table [12,000,000 x 7]> <Rprofmem [134 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 2 forloop2 192.8ms 201.3ms 5.01 460MB 3.34 3 2 599ms <data.table [12,000,000 x 7]> <Rprofmem [183 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 3 reduce 114.5ms 130.2ms 7.76 458MB 5.17 3 2 387ms <data.table [12,000,000 x 7]> <Rprofmem [21 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 4 set 65.6ms 68.5ms 14.5 229MB 9.65 3 2 207ms <data.table [12,000,000 x 7]> <Rprofmem [76 x 3]> <bench_tm [5]> <tibble [5 x 3]>
使用 set 函数可以提高性能并降低内存占用。我个人更关心中位数时间(而不是 total_time)。