将向量分成块,使每个块的总和近似恒定

Split a vector into chunks such that sum of each chunk is approximately constant

我有一个大型数据框,其中包含超过 100 000 条记录,其中的值已排序

例如,考虑以下虚拟数据集

df <- data.frame(values = c(1,1,2,2,3,4,5,6,6,7))

我想创建 3 组上述值(仅按顺序),使每组的总和大致相同

所以对于上面的组,如果我决定将排序后的 df 分成如下 3 组,它们的总和将为

1. 1 + 1 + 2 +2 + 3 + 4 = 13
2. 5 + 6 = 11
3. 6 + 7 = 13

如何在 R 中创建此优化?有逻辑吗?

N = 3
temp = floor(sum(df$values)/N)
inds = c(0, which(c(0, diff(cumsum(df$values) %% temp)) < 0)[1:(N-1)], NROW(df))
split(df$values, rep(1:N, ifelse(N == 1, NROW(df), diff(inds))))
#$`1`
#[1] 1 1 2 2 3 4

#$`2`
#[1] 5 6

#$`3`
#[1] 6 7

这是一种方法:

splitter <- function(values, N){
  inds = c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values))/N*i))))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
}

有多好:

# I calculate the mean and sd of the maximal difference of the sums in the 
#splits of 100 runs:

#split on 15 parts
set.seed(5)
z1 = as.data.frame(matrix(1:15, nrow=1))
repeat{
  values = sort(sample(1:1000, 1000000, replace = T))
  z = splitter(values, 15)
  z = lapply(z, sum)
  z = unlist(z)
  z1 = rbind(z1, z)
  if (nrow(z1)>101){
    break
    }
}

z1 = z1[-1,] 
mean(apply(z1, 1, function(x) max(x) - min(x)))
[1] 1004.158
sd(apply(z1, 1, function(x) max(x) - min(x)))
[1] 210.6653

#with less splits (4)
set.seed(5)
z1 = as.data.frame(matrix(1:4, nrow=1))
repeat{
  values = sort(sample(1:1000, 1000000, replace = T))
  z = splitter(values, 4)
  z = lapply(z, sum)
  z = unlist(z)
  z1 = rbind(z1, z)
  if (nrow(z1)>101){
    break
    }
}

z1 = z1[-1,] 
mean(apply(z1, 1, function(x) max(x) - min(x)))
#632.7723
sd(apply(z1, 1, function(x) max(x) - min(x)))
#260.9864


library(microbenchmark)
1M:
values = sort(sample(1:1000, 1000000, replace = T))

microbenchmark(
  sp_27 = splitter(values, 27),
  sp_3 = splitter(values, 3),
)

Unit: milliseconds
   expr      min       lq      mean    median        uq       max neval cld
  sp_27 897.7346 934.2360 1052.0972 1078.6713 1118.6203 1329.3044   100   b
   sp_3 108.3283 116.2223  209.4777  173.0522  291.8669  409.7050   100  a 

顺便说一句,F.Privé 是正确的,这个函数没有给出全局最优分割。它是贪婪的,这对于这样的问题来说不是一个好的特征。它将在向量的初始部分给出总和更接近全局 sum / n 的拆分,但这样做会损害向量后面部分的拆分。

这里是目前发布的三个功能的测试比较:

db = function(values, N){
  temp = floor(sum(values)/N)
  inds = c(0, which(c(0, diff(cumsum(values) %% temp)) < 0)[1:(N-1)], length(values))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
} #had to change it a bit since the posted one would not work - the core 
  #which calculates the splitting positions is the same

missuse <- function(values, N){
  inds = c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values))/N*i))))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
}

prive = function(v, N){ #added dummy N argument because of the tester function
  dummy = N
  computeD <- function(p, q, S) {
    n <- length(S)
    S.star <- S[n] / 3
    if (all(p < q)) {
      (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
    } else {
      stop("You shouldn't be here!")
    }
  }
  optiCut <- function(v, N) {
    S <- cumsum(v)
    n <- length(v)
    S_star <- S[n] / 3
    # good starting values
    p_star <- which.min((S - S_star)^2)
    q_star <- which.min((S - 2*S_star)^2)
    print(min <- computeD(p_star, q_star, S))

    count <- 0
    for (q in 2:(n-1)) {
      S3 <- S[n] - S[q] - S_star
      if (S3*S3 < min) {
        count <- count + 1
        D <- computeD(seq_len(q - 1), q, S)
        ind = which.min(D);
        if (D[ind] < min) {
          # Update optimal values
          p_star = ind;
          q_star = q;
          min = D[ind];
        }
      }
    }
    c(p_star, q_star, computeD(p_star, q_star, S), count)
  }
  z3 = optiCut(v)
  inds = c(0, z3[1:2], length(v))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(v, re))
} #added output to be more in line with the other two

测试函数:

tester = function(split, seed){
  set.seed(seed)
  z1 = as.data.frame(matrix(1:3, nrow=1))
  repeat{
    values = sort(sample(1:1000, 1000000, replace = T))
    z = split(values, 3)
    z = lapply(z, sum)
    z = unlist(z)
    z1 = rbind(z1, z)
    if (nrow(z1)>101){
      break
    }
  }
  m = mean(apply(z1, 1, function(x) max(x) - min(x)))
  s = sd(apply(z1, 1, function(x) max(x) - min(x)))
  return(c("mean" = m, "sd" = s))
} #tests 100 random 1M length vectors with elements drawn from 1:1000

tester(db, 5)
#mean       sd 
#779.5686 349.5717 

tester(missuse, 5)
#mean       sd 
#481.4804 216.9158 

tester(prive, 5)
#mean       sd 
#451.6765 174.6303 

prive 是明显的赢家 - 但是它比其他 2 个花费的时间要长得多,并且只能处理 3 个元素的拆分。

microbenchmark(
  missuse(values, 3),
  prive(values, 3),
  db(values, 3)
)
Unit: milliseconds
               expr        min        lq      mean    median        uq       max neval cld
 missuse(values, 3)  100.85978  111.1552  185.8199  120.1707  304.0303  393.4031   100  a 
   prive(values, 3) 1932.58682 1980.0515 2096.7516 2043.7133 2211.6294 2671.9357   100   b
      db(values, 3)   96.86879  104.5141  194.0085  117.6270  306.7143  500.6455   100  a 

所以,让我们使用修剪。我认为其他解决方案提供了一个很好的解决方案,但不是最好的解决方案。

首先,我们要最小化

其中S_n是前n个元素的累加和。

computeD <- function(p, q, S) {
  n <- length(S)
  S.star <- S[n] / 3
  if (all(p < q)) {
    (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
  } else {
    stop("You shouldn't be here!")
  }
}

我认为其他解决方案独立优化 p 和 q,这不会给出全局最小值(对于某些特定情况而言是预期的)。

optiCut <- function(v) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / 3
  # good starting values
  p_star <- which.min((S - S_star)^2)
  q_star <- which.min((S - 2*S_star)^2)
  print(min <- computeD(p_star, q_star, S))
  
  count <- 0
  for (q in 2:(n-1)) {
    S3 <- S[n] - S[q] - S_star
    if (S3*S3 < min) {
      count <- count + 1
      D <- computeD(seq_len(q - 1), q, S)
      ind = which.min(D);
      if (D[ind] < min) {
        # Update optimal values
        p_star = ind;
        q_star = q;
        min = D[ind];
      }
    }
  }
  c(p_star, q_star, computeD(p_star, q_star, S), count)
}

这与其他解决方案一样快,因为它根据条件 S3*S3 < min 修剪了很多迭代。但是,它给出了最优解,参见 optiCut(c(1, 2, 3, 3, 5, 10)).


对于 K >= 3 的解决方案,我基本上重新实现了带有嵌套 tibbles 的树,这很有趣!

optiCut_K <- function(v, K) {
  
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / K
  # good starting values
  p_vec_first <- sapply(seq_len(K - 1), function(i) which.min((S - i*S_star)^2))
  min_first <- sum((diff(c(0, S[c(p_vec_first, n)])) - S_star)^2)
  
  compute_children <- function(level, ind, val) {
    
    # leaf
    if (level == 1) {
      val <- val + (S[ind] - S_star)^2
      if (val > min_first) {
        return(NULL)
      } else {
        return(val)
      } 
    } 
    
    P_all <- val + (S[ind] - S[seq_len(ind - 1)] - S_star)^2
    inds <- which(P_all < min_first)
    if (length(inds) == 0) return(NULL)
    
    node <- tibble::tibble(
      level = level - 1,
      ind = inds,
      val = P_all[inds]
    )
    node$children <- purrr::pmap(node, compute_children)
    
    node <- dplyr::filter(node, !purrr::map_lgl(children, is.null))
    `if`(nrow(node) == 0, NULL, node)
  }
  
  compute_children(K, n, 0)
}

这为您提供了比贪婪的解决方案更好的所有解决方案:

v <- sort(sample(1:1000, 1e5, replace = TRUE))
test <- optiCut_K(v, 9)

您需要取消嵌套:

full_unnest <- function(tbl) {
  tmp <- try(tidyr::unnest(tbl), silent = TRUE)
  `if`(identical(class(tmp), "try-error"), tbl, full_unnest(tmp))
}
print(test <- full_unnest(test))

最后,为了获得最佳解决方案:

test[which.min(test$children), ]