传递给函数的数据帧操作非常慢

Question

在我的示例中，我有一个 3D 点云并且想要找到每个 z 层的轮廓。我目前的做法如下：

library(rgl) #just for 3D visualisation purposes of the cube

cube = data.frame(x = rep(1:10,1000),
                  y = rep(1:10, 100, each = 10),
                  z = rep(1:10,100,each = 100)) #3D point cloud

xyz_list = split(cube, cube[,3]) #split into layers by unique z-values

t0 = Sys.time()
outline = lapply(xyz_list, function(k){
  xmax = merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
  xmin = merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
  ymax = merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
  ymin = merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
  mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
  mm = mm[!duplicated(mm),] #remove duplicate rows
})
t1 = Sys.time()
print(t1 - t0)

outline = do.call(rbind,outline)#merge lists
plot3d(cube)
plot3d(outline, col = "red", add = TRUE, size = 5)

这大约需要。 0.33 secs

现在我考虑将数据帧 (xyz_list) 传递给 lapply 外部名为 of 的函数，并将所有代码从 lapply 内部移至该函数因为我以后需要重复操作几次：

of = function(df, dim1, dim2){
  xmax = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = max, data = df), df) 
  xmin = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = min, data = df), df) 
  ymax = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = max, data = df), df) 
  ymin = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = min, data = df), df) 
  mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
  mm = mm[!duplicated(mm),] #remove duplicate rows
  return(mm)
}

t0 = Sys.time()
outline = lapply(xyz_list, function(k){
  mm = of(k, 2, 1)
})
t1 = Sys.time()
print(t1 - t0)

这大约需要 13 secs。

我不明白为什么我的代码在第二个示例中变得如此慢。有什么方法可以使函数 of 更高效吗？

Answer 1

# dummy data
cube <- data.frame(x = rep(1:10,1000),
                  y = rep(1:10, 100, each = 10),
                  z = rep(1:10,100,each = 100)
                  )

# split into list
xyz_list <- split(cube, cube[,3])

op的原始方法（仅lapply）

outline <- lapply(xyz_list, function(k)
  {
  xmax = merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
  xmin = merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
  ymax = merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
  ymin = merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
  mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
  mm = mm[!duplicated(mm),] #remove duplicate rows
  })

op 尝试创建函数（然后 lapply）

of <- function(df, dim1, dim2)
  {
  xmax = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = max, data = df), df) 
  xmin = merge(aggregate(df[,dim1] ~ df[,dim2], FUN = min, data = df), df) 
  ymax = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = max, data = df), df) 
  ymin = merge(aggregate(df[,dim2] ~ df[,dim1], FUN = min, data = df), df) 
  mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
  mm = mm[!duplicated(mm),] #remove duplicate rows
  return(mm)
  }

新的、改进的功能

of1 <- function(df, y, x)
{
  y_x <- as.formula(paste(y, '~', x))
  x_y <- as.formula(paste(x, '~', y))
  xmax = merge(aggregate(y_x, FUN = max, data = df), df) 
  xmin = merge(aggregate(y_x, FUN = min, data = df), df) 
  ymax = merge(aggregate(x_y, FUN = max, data = df), df) 
  ymin = merge(aggregate(x_y, FUN = min, data = df), df) 
  mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
  mm = mm[!duplicated(mm),] #remove duplicate rows
  return(mm)
}

微基准测试

library(microbenchmark)
library(ggplot2)

a <-
  microbenchmark(original = {outline = lapply(xyz_list, function(k)
                                        {
                                        xmax = merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
                                        xmin = merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
                                        ymax = merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
                                        ymin = merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
                                        mm = rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
                                        mm = mm[!duplicated(mm),] #remove duplicate rows
                                        })}
                 , slow = {outline1 = lapply( xyz_list, function(k) { of(k, 2, 1) } )}
                 , improved = {outline2 = lapply( xyz_list, function(k) of1(k, 'y', 'x') )}
                 , times = 30
                 )
autoplot(a)

身份验证

identical(outline, outline2)
[1] TRUE

Answer 2

data.table解法：

我建议只根据需要对较大的 data.table 进行子集化，而不是通过 z 层将其拆分为单独的 data.table。但是如果确实需要 outline 作为 data.table 的列表，我们可以在汇总后将 z 层拆分出来：

library(data.table)

cube <- data.table(x = rep(1:10, 1000),
                   y = rep(1:10, 100, each = 10),
                   z = rep(1:10, 100, each = 100)) #3D point cloud

system.time({
  nms <- c("x", "y", "z")
  outline2 <- unique(rbindlist(lapply(1:2, function(i) setnames(cube[, .(range(.SD)), by = c(nms[-i]), .SDcols = nms[i]], "V1", nms[i])), use.names = TRUE))
  setcolorder(outline2, nms)
  outline2 <- split(outline2, outline2[[3]])
})
#>    user  system elapsed 
#>    0.03    0.00    0.05

与原始 non-function 解决方案比较：

system.time({
  xyz_list <- split(cube, cube[,3]) #split into layers by unique z-values
  
  outline1 <- lapply(xyz_list, function(k){
    xmax <- merge(aggregate(y ~ x, FUN = max, data = k), k) #maximum y-value for each unique x-value
    xmin <- merge(aggregate(y ~ x, FUN = min, data = k), k) #minimum y-value for each unique x-value
    ymax <- merge(aggregate(x ~ y, FUN = max, data = k), k) #maximum x-value for each unique y-value
    ymin <- merge(aggregate(x ~ y, FUN = min, data = k), k) #minimum x-value for each unique y-value
    mm <- rbind(xmax,xmin,ymax,ymin) #collect all minimum and maximum values
    mm <- mm[!duplicated(mm),] #remove duplicate rows
  })
})
#>    user  system elapsed 
#>    0.64    0.01    0.66

如果确实需要对 pre-split 层列表进行操作的函数：

of <- function(dt, dim1, dim2) {
  setcolorder(unique(rbindlist(lapply(c(dim1, dim2), function(i) setnames(dt[, .(range(.SD)), by = c(nms[-i]), .SDcols = nms[i]], "V1", nms[i])), use.names = TRUE)), nms)
}

system.time({
  outline3 <- lapply(xyz_list, function(k) of(k, 1, 2))
})
#>    user  system elapsed 
#>    0.06    0.00    0.06

我们将验证所有解决方案 return 具有相同的一组值。为了进行比较，我们需要将 outline1 data.frame 转换为 data.table 并重置它们的行名。我们还对所有 data.tables.

进行了排序

for (i in 1:length(outline1)) {
  setorder(setDT(outline1[[i]]))
  setorder(outline2[[i]])
  setorder(outline3[[i]])
  rownames(outline1[[i]]) <- NULL
}

identical(outline1, outline2)
#> [1] TRUE
identical(outline1, outline3)
#> [1] TRUE

Created on 2022-01-31 by the reprex package (v2.0.1)

传递给函数的数据帧操作非常慢

Operations on Data Frame passed to function are very slow

performance

r

dataframe

op的原始方法（仅lapply）

op 尝试创建函数（然后 lapply）

新的、改进的功能

微基准测试

身份验证