传递给函数的数据帧操作非常慢
Operations on Data Frame passed to function are very slow
在我的示例中,我有一个 3D 点云并且想要找到每个 z 层的轮廓。
我目前的做法如下:
library(rgl) #just for 3D visualisation purposes of the cube
cube = data.frame(x = rep(1:10,1000),
y = rep(1:10, 100, each = 10),
z = rep(1:10,100,each = 100)) #3D point cloud
xyz_list = split(cube, cube[,3]) #split into layers by unique z-values
t0 = Sys.time()
outline = lapply(xyz_list, function(k){
xmax = merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
xmin = merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
ymax = merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
ymin = merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
})
t1 = Sys.time()
print(t1 - t0)
outline = do.call(rbind,outline)#merge lists
plot3d(cube)
plot3d(outline, col = "red", add = TRUE, size = 5)
这大约需要。 0.33 secs
现在我考虑将数据帧 (xyz_list
) 传递给 lapply
外部名为 of
的函数,并将所有代码从 lapply
内部移至该函数因为我以后需要重复操作几次:
of = function(df, dim1, dim2){
xmax = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = max, data = df), df)
xmin = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = min, data = df), df)
ymax = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = max, data = df), df)
ymin = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = min, data = df), df)
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
return(mm)
}
t0 = Sys.time()
outline = lapply(xyz_list, function(k){
mm = of(k, 2, 1)
})
t1 = Sys.time()
print(t1 - t0)
这大约需要 13 secs
。
我不明白为什么我的代码在第二个示例中变得如此慢。有什么方法可以使函数 of
更高效吗?
# dummy data
cube <- data.frame(x = rep(1:10,1000),
y = rep(1:10, 100, each = 10),
z = rep(1:10,100,each = 100)
)
# split into list
xyz_list <- split(cube, cube[,3])
op的原始方法(仅lapply)
outline <- lapply(xyz_list, function(k)
{
xmax = merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
xmin = merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
ymax = merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
ymin = merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
})
op 尝试创建函数(然后 lapply)
of <- function(df, dim1, dim2)
{
xmax = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = max, data = df), df)
xmin = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = min, data = df), df)
ymax = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = max, data = df), df)
ymin = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = min, data = df), df)
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
return(mm)
}
新的、改进的功能
of1 <- function(df, y, x)
{
y_x <- as.formula(paste(y, '~', x))
x_y <- as.formula(paste(x, '~', y))
xmax = merge(aggregate(y_x, FUN = max, data = df), df)
xmin = merge(aggregate(y_x, FUN = min, data = df), df)
ymax = merge(aggregate(x_y, FUN = max, data = df), df)
ymin = merge(aggregate(x_y, FUN = min, data = df), df)
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
return(mm)
}
微基准测试
library(microbenchmark)
library(ggplot2)
a <-
microbenchmark(original = {outline = lapply(xyz_list, function(k)
{
xmax = merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
xmin = merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
ymax = merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
ymin = merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
})}
, slow = {outline1 = lapply( xyz_list, function(k) { of(k, 2, 1) } )}
, improved = {outline2 = lapply( xyz_list, function(k) of1(k, 'y', 'x') )}
, times = 30
)
autoplot(a)
身份验证
identical(outline, outline2)
[1] TRUE
data.table
解法:
我建议只根据需要对较大的 data.table
进行子集化,而不是通过 z
层将其拆分为单独的 data.table
。但是如果确实需要 outline
作为 data.table
的列表,我们可以在汇总后将 z
层拆分出来:
library(data.table)
cube <- data.table(x = rep(1:10, 1000),
y = rep(1:10, 100, each = 10),
z = rep(1:10, 100, each = 100)) #3D point cloud
system.time({
nms <- c("x", "y", "z")
outline2 <- unique(rbindlist(lapply(1:2, function(i) setnames(cube[, .(range(.SD)), by = c(nms[-i]), .SDcols = nms[i]], "V1", nms[i])), use.names = TRUE))
setcolorder(outline2, nms)
outline2 <- split(outline2, outline2[[3]])
})
#> user system elapsed
#> 0.03 0.00 0.05
与原始 non-function 解决方案比较:
system.time({
xyz_list <- split(cube, cube[,3]) #split into layers by unique z-values
outline1 <- lapply(xyz_list, function(k){
xmax <- merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
xmin <- merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
ymax <- merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
ymin <- merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
mm <- rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm <- mm[!duplicated(mm),] #remove duplicate rows
})
})
#> user system elapsed
#> 0.64 0.01 0.66
如果确实需要对 pre-split 层列表进行操作的函数:
of <- function(dt, dim1, dim2) {
setcolorder(unique(rbindlist(lapply(c(dim1, dim2), function(i) setnames(dt[, .(range(.SD)), by = c(nms[-i]), .SDcols = nms[i]], "V1", nms[i])), use.names = TRUE)), nms)
}
system.time({
outline3 <- lapply(xyz_list, function(k) of(k, 1, 2))
})
#> user system elapsed
#> 0.06 0.00 0.06
我们将验证所有解决方案 return 具有相同的一组值。为了进行比较,我们需要将 outline1
data.frame
转换为 data.table
并重置它们的行名。我们还对所有 data.table
s.
进行了排序
for (i in 1:length(outline1)) {
setorder(setDT(outline1[[i]]))
setorder(outline2[[i]])
setorder(outline3[[i]])
rownames(outline1[[i]]) <- NULL
}
identical(outline1, outline2)
#> [1] TRUE
identical(outline1, outline3)
#> [1] TRUE
Created on 2022-01-31 by the reprex package (v2.0.1)
在我的示例中,我有一个 3D 点云并且想要找到每个 z 层的轮廓。 我目前的做法如下:
library(rgl) #just for 3D visualisation purposes of the cube
cube = data.frame(x = rep(1:10,1000),
y = rep(1:10, 100, each = 10),
z = rep(1:10,100,each = 100)) #3D point cloud
xyz_list = split(cube, cube[,3]) #split into layers by unique z-values
t0 = Sys.time()
outline = lapply(xyz_list, function(k){
xmax = merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
xmin = merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
ymax = merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
ymin = merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
})
t1 = Sys.time()
print(t1 - t0)
outline = do.call(rbind,outline)#merge lists
plot3d(cube)
plot3d(outline, col = "red", add = TRUE, size = 5)
这大约需要。 0.33 secs
现在我考虑将数据帧 (xyz_list
) 传递给 lapply
外部名为 of
的函数,并将所有代码从 lapply
内部移至该函数因为我以后需要重复操作几次:
of = function(df, dim1, dim2){
xmax = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = max, data = df), df)
xmin = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = min, data = df), df)
ymax = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = max, data = df), df)
ymin = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = min, data = df), df)
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
return(mm)
}
t0 = Sys.time()
outline = lapply(xyz_list, function(k){
mm = of(k, 2, 1)
})
t1 = Sys.time()
print(t1 - t0)
这大约需要 13 secs
。
我不明白为什么我的代码在第二个示例中变得如此慢。有什么方法可以使函数 of
更高效吗?
# dummy data
cube <- data.frame(x = rep(1:10,1000),
y = rep(1:10, 100, each = 10),
z = rep(1:10,100,each = 100)
)
# split into list
xyz_list <- split(cube, cube[,3])
op的原始方法(仅lapply)
outline <- lapply(xyz_list, function(k)
{
xmax = merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
xmin = merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
ymax = merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
ymin = merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
})
op 尝试创建函数(然后 lapply)
of <- function(df, dim1, dim2)
{
xmax = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = max, data = df), df)
xmin = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = min, data = df), df)
ymax = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = max, data = df), df)
ymin = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = min, data = df), df)
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
return(mm)
}
新的、改进的功能
of1 <- function(df, y, x)
{
y_x <- as.formula(paste(y, '~', x))
x_y <- as.formula(paste(x, '~', y))
xmax = merge(aggregate(y_x, FUN = max, data = df), df)
xmin = merge(aggregate(y_x, FUN = min, data = df), df)
ymax = merge(aggregate(x_y, FUN = max, data = df), df)
ymin = merge(aggregate(x_y, FUN = min, data = df), df)
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
return(mm)
}
微基准测试
library(microbenchmark)
library(ggplot2)
a <-
microbenchmark(original = {outline = lapply(xyz_list, function(k)
{
xmax = merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
xmin = merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
ymax = merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
ymin = merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm = mm[!duplicated(mm),] #remove duplicate rows
})}
, slow = {outline1 = lapply( xyz_list, function(k) { of(k, 2, 1) } )}
, improved = {outline2 = lapply( xyz_list, function(k) of1(k, 'y', 'x') )}
, times = 30
)
autoplot(a)
身份验证
identical(outline, outline2)
[1] TRUE
data.table
解法:
我建议只根据需要对较大的 data.table
进行子集化,而不是通过 z
层将其拆分为单独的 data.table
。但是如果确实需要 outline
作为 data.table
的列表,我们可以在汇总后将 z
层拆分出来:
library(data.table)
cube <- data.table(x = rep(1:10, 1000),
y = rep(1:10, 100, each = 10),
z = rep(1:10, 100, each = 100)) #3D point cloud
system.time({
nms <- c("x", "y", "z")
outline2 <- unique(rbindlist(lapply(1:2, function(i) setnames(cube[, .(range(.SD)), by = c(nms[-i]), .SDcols = nms[i]], "V1", nms[i])), use.names = TRUE))
setcolorder(outline2, nms)
outline2 <- split(outline2, outline2[[3]])
})
#> user system elapsed
#> 0.03 0.00 0.05
与原始 non-function 解决方案比较:
system.time({
xyz_list <- split(cube, cube[,3]) #split into layers by unique z-values
outline1 <- lapply(xyz_list, function(k){
xmax <- merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
xmin <- merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
ymax <- merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
ymin <- merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
mm <- rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
mm <- mm[!duplicated(mm),] #remove duplicate rows
})
})
#> user system elapsed
#> 0.64 0.01 0.66
如果确实需要对 pre-split 层列表进行操作的函数:
of <- function(dt, dim1, dim2) {
setcolorder(unique(rbindlist(lapply(c(dim1, dim2), function(i) setnames(dt[, .(range(.SD)), by = c(nms[-i]), .SDcols = nms[i]], "V1", nms[i])), use.names = TRUE)), nms)
}
system.time({
outline3 <- lapply(xyz_list, function(k) of(k, 1, 2))
})
#> user system elapsed
#> 0.06 0.00 0.06
我们将验证所有解决方案 return 具有相同的一组值。为了进行比较,我们需要将 outline1
data.frame
转换为 data.table
并重置它们的行名。我们还对所有 data.table
s.
for (i in 1:length(outline1)) {
setorder(setDT(outline1[[i]]))
setorder(outline2[[i]])
setorder(outline3[[i]])
rownames(outline1[[i]]) <- NULL
}
identical(outline1, outline2)
#> [1] TRUE
identical(outline1, outline3)
#> [1] TRUE
Created on 2022-01-31 by the reprex package (v2.0.1)