Group vector on conditional sum
I want to group a vector based on the sum of its elements being less than or equal to n. Assume the following,
set.seed(1)
x <- sample(10, 20, replace = TRUE)
#[1] 3 4 6 10 3 9 10 7 7 1 3 2 7 4 8 5 8 10 4 8
#Where,
n = 15
The expected output is to group the values so that each group's sum is <= 15, i.e.
y <- c(1, 1, 1, 2, 2, 3, 4, 5 ,5, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 10)
As you can see, the group sums never exceed 15:
sapply(split(x, y), sum)
# 1 2 3 4 5 6 7 8 9 10
#13 13 9 10 15 12 12 13 14 8
Note: I will be running this on huge data sets (usually > 150-200 GB), so efficiency is a must.
What I have tried (and come close with, but not quite) is:
as.integer(cut(cumsum(x), breaks = seq(0, max(cumsum(x)) + 15, 15)))
#[1] 1 1 1 2 2 3 3 4 4 4 5 5 5 6 6 6 7 8 8 8
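To make the failure concrete (a quick check reusing x and the attempt above; y_try is just an illustrative name): the breaks are fixed multiples of 15 and never reset at a group boundary, so some of the resulting groups overshoot 15.
y_try <- as.integer(cut(cumsum(x), breaks = seq(0, max(cumsum(x)) + 15, 15)))
sapply(split(x, y_try), sum)
# 1  2  3  4  5  6  7  8
#13 13 19 15 12 17  8 22   # groups 3, 6 and 8 exceed 15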
The following works, but can probably be improved:
x <- c(3L, 4L, 6L, 10L, 3L, 9L, 10L, 7L, 7L, 1L, 3L, 2L, 7L, 4L, 8L, 5L, 8L, 10L, 4L, 8L)
y <- as.integer(c(1, 1, 1, 2, 2, 3, 4, 5 ,5, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 10))
n = 15
library(data.table)
DT = data.table(x,y)
DT[, xc := cumsum(x)]
# for each row taken as a group start, find the row where the next group starts
b = DT[.(shift(xc, fill=0) + n + 1), on=.(xc), roll=-Inf, which=TRUE]
# walk the chain of group starts, beginning at the first row
z = 1; res = logical(length(x))
while (!is.na(z) && z <= length(x)){
  res[z] <- TRUE
  z <- b[z]
}
DT[, g := cumsum(res)]
x y xc g
1: 3 1 3 1
2: 4 1 7 1
3: 6 1 13 1
4: 10 2 23 2
5: 3 2 26 2
6: 9 3 35 3
7: 10 4 45 4
8: 7 5 52 5
9: 7 5 59 5
10: 1 5 60 5
11: 3 6 63 6
12: 2 6 65 6
13: 7 6 72 6
14: 4 7 76 7
15: 8 7 84 7
16: 5 8 89 8
17: 8 8 97 8
18: 10 9 107 9
19: 4 9 111 9
20: 8 10 119 10
DT[, all(y == g)] # TRUE
How it works
The rolling join asks, "if this row is the start of a group, where will the next group start?" You can then iterate over the result, starting from the first position, to find all the groups.
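For intuition, here is roughly what that join computes, sketched in base R (an illustrative reconstruction, assuming positive x so that xc is strictly increasing; b_base is just an illustrative name):
xc <- cumsum(x)
targets <- c(0, head(xc, -1)) + n + 1          # same as shift(xc, fill = 0) + n + 1
# first row whose cumulative sum reaches the target, i.e. where the next group starts
b_base <- findInterval(targets, xc, left.open = TRUE) + 1L
b_base[b_base > length(xc)] <- NA              # no such row: the chain ends, like the join's NA
all(b_base == b | (is.na(b_base) & is.na(b)))  # TRUE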
The last line, DT[, g := cumsum(res)], could also be done as a rolling join (maybe faster?):
DT[, g := data.table(r = which(res))[, g := .I][.(.I), on=.(r), roll=TRUE, x.g ]]
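Alternatively, that last step needs no join at all, since each row's group number is just the count of group starts at or before it (a small sketch; g2 is only an illustrative name):
# count of group starts (which(res)) at or before each row index
DT[, g2 := findInterval(.I, which(res))]
DT[, all(g == g2)]  # TRUE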
Here is my Rcpp solution (close to the solution above, but a bit shorter/stripped down). Since you said speed is important, Rcpp is probably the way to go:
# create the data
set.seed(1)
x <- sample(10, 20, replace = TRUE)
y <- c(1, 1, 1, 2, 2, 3, 4, 5 ,5, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 10)
# create the Rcpp function
library(Rcpp)
cppFunction('
IntegerVector sotosGroup(NumericVector x, int cutoff) {
  IntegerVector groupVec (x.size());
  int group = 1;
  double runSum = 0;
  for (int i = 0; i < x.size(); i++) {
    runSum += x[i];
    // if adding x[i] pushed the sum over the cutoff,
    // open a new group and restart the running sum from x[i]
    if (runSum > cutoff) {
      group++;
      runSum = x[i];
    }
    groupVec[i] = group;
  }
  return groupVec;
}
')
# use the function as usual
y_cpp <- sotosGroup(x, 15)
sapply(split(x, y_cpp), sum)
#> 1 2 3 4 5 6 7 8 9 10
#> 13 13 9 10 15 12 12 13 14 8
all.equal(y, y_cpp)
#> [1] TRUE
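For reference, the same greedy logic in plain R (a sketch; sotosGroupR is only an illustrative name, and it will be far slower than the compiled version on long vectors):
sotosGroupR <- function(x, cutoff) {
  groupVec <- integer(length(x))
  group <- 1L
  runSum <- 0
  for (i in seq_along(x)) {
    runSum <- runSum + x[i]
    if (runSum > cutoff) {   # would exceed the cutoff: open a new group
      group <- group + 1L
      runSum <- x[i]
    }
    groupVec[i] <- group
  }
  groupVec
}
all(sotosGroupR(x, 15) == y_cpp)
#> [1] TRUE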
In case anyone needs to be convinced by the speed:
# Speed Benchmarks
library(data.table)
library(microbenchmark)
dt <- data.table(x)
frank <- function(DT, n = 15) {
  DT[, xc := cumsum(x)]
  b = DT[.(shift(xc, fill=0) + n + 1), on=.(xc), roll=-Inf, which=TRUE]
  z = 1; res = z
  while (!is.na(z))
    res <- c(res, z <- b[z])
  DT[, g := cumsum(.I %in% res)][]
}
microbenchmark(
frank(dt),
sotosGroup(x, 15),
times = 100
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> frank(dt) 1720.589 1831.320 2148.83096 1878.0725 1981.576 13728.830 100 b
#> sotosGroup(x, 15) 2.595 3.962 6.47038 7.5035 8.290 11.579 100 a
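The benchmark above uses a 20-element vector; a sanity check at a more realistic size could look like the sketch below (no timings shown, since they depend on hardware; x_big and dt_big are illustrative names, and the size is kept moderate here):
set.seed(1)
x_big <- sample(10, 1e5, replace = TRUE)
dt_big <- data.table(x = x_big)
microbenchmark(
  frank(dt_big),
  sotosGroup(x_big, 15),
  times = 10
)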