使用动态的 n 进行 shift(即每行超前/滞后的位置数各不相同)
Shift with dynamic n (number of position lead / lag by)
我有以下 df:
# Toy input: two users, with a per-row lag distance stored in `shift_by`.
df <- data.table(
  user = c("a", "a", "a", "b", "b"),
  spend = 1:5,
  shift_by = c(1, 1, 2, 1, 1)
)
df
user spend shift_by
1: a 1 1
2: a 2 1
3: a 3 2
4: b 4 1
5: b 5 1
这次我想创建超前/滞后列,但 data.table 的 shift 函数中的 n 参数需要是动态的,即逐行以 df$shift_by 作为输入。我的预期结果是:
# Target column written out by hand, to show what the result should look like.
df[, `:=`(spend_shifted = c(NA, 1, 1, NA, 4))]
df
user spend shift_by spend_shifted
1: a 1 1 NA
2: a 2 1 1
3: a 3 2 1
4: b 4 1 NA
5: b 5 1 4
然而,通过以下尝试,它给出了:
# Attempt that does NOT give the wanted result: as the printed table below
# shows, every value of spend_shifted comes out as NA.
# NOTE(review): shift() appears to treat a vector `n` as several whole-column
# offsets rather than one offset per row -- confirm against ?shift.
df[, spend_shifted := shift(x=spend, n=shift_by, type="lag"), user]; df
user spend shift_by spend_shifted
1: a 1 1 NA
2: a 2 1 NA
3: a 3 2 NA
4: b 4 1 NA
5: b 5 1 NA
是我能找到的最接近的例子。但是,由于速度原因,我需要一个分组依据并且正在寻求 data.table
解决方案。真心期待找到任何想法。
我相信这会奏效。您可以在之后删除新的索引列。
# Within-user position of the row to copy `spend` from; anything that would
# fall before the first row of the user is clamped to 0 ("no such row").
df[, newindex := pmax(rowid(user) - shift_by, 0)]
# Look the value up inside each user's group. Assigning a separately grouped
# result back to the filtered rows only lines up when each user's rows are
# contiguous; indexing per group keeps every value on its own row.
# Index 0 is turned into NA so those rows get NA.
df[, spend_shifted := spend[replace(newindex, newindex < 1, NA)], by = user]
# user spend shift_by newindex spend_shifted
# 1: a 1 1 0 NA
# 2: a 2 1 1 1
# 3: a 3 2 1 1
# 4: b 4 1 0 NA
# 5: b 5 1 1 4
使用 data.frames 的矩阵子集:
# Build one lagged copy of `spend` per distinct lag within the user, then pick
# for every row the copy matching its own `shift_by` through (row, column)
# matrix indexing.
df[,
   spend_shifted := {
     lags <- sort(unique(shift_by))
     lagged <- data.frame(shift(spend, n = lags))
     # match() maps each lag to the column that holds it; using the raw lag as
     # the column number is only right when the lags are exactly 1, 2, ..., k.
     lagged[cbind(seq_len(.N), match(shift_by, lags))]
   },
   by = user]
另一个不使用 shift 的解决方案(除了 Wimpel 的之外):
# For each user, compute the within-group row to read from and return the
# `spend` found there (NA when that row would lie before the group's start).
# Returns a new table; `df` itself is not modified.
df[, {
  src <- seq_len(.N) - shift_by
  src[src <= 0] <- NA
  .SD[src, spend]
}, by = user]
这是另一种方法,使用 data.table 的连接(join)。我借助两个辅助列进行连接:
# Helper 1: position of each row inside its own user group. `.I` would give
# positions in the whole table, which makes `row - shift_by` point at another
# user's row whenever a user's rows are not contiguous.
df[, row := seq_len(.N), by = .(user)]
# Helper 2: within-user position of the row whose `spend` should be copied.
df[, match_row := row - shift_by]
# Self-join: each row receives the `spend` of the same user's row whose `row`
# equals its `match_row`; rows without a match keep NA.
df[df, on = .(user, match_row = row), x := i.spend]
# Drop the helper columns again.
df[, c("row", "match_row") := NULL]
# user spend shift_by spend_shifted x
# 1: a 1 1 NA NA
# 2: a 2 1 1 1
# 3: a 3 2 1 1
# 4: b 4 1 NA NA
# 5: b 5 1 4 4
也许这会有所帮助
# Per user: read `spend` from `shift_by` rows earlier; positions that would fall
# before the user's first row become NA. The trailing `[]` prints the result.
# (The leading console prompt was removed so the line can be run as-is.)
df[, spend_shifted := {
  pos <- seq_len(.N)
  spend[replace(pos - shift_by, pos <= shift_by, NA)]
}, by = user][]
user spend shift_by spend_shifted
1: a 1 1 NA
2: a 2 1 1
3: a 3 2 1
4: b 4 1 NA
5: b 5 1 4
我进行了基准测试,因为可扩展性对我来说非常重要。
df 与原始数据相同,只是重复了 10,000,000 次,因此共有 50,000,000 行。
# Benchmark input: the 5-row toy pattern repeated `x` times (5 * x rows).
x <- 1e7
df <- data.table(
  user = rep(c("a", "a", "a", "b", "b"), times = x),
  spend = rep(1:5, times = x),
  shift_by = rep(c(1, 1, 2, 1, 1), times = x)
)
df
user spend shift_by
1: a 1 1
2: a 2 1
3: a 3 2
4: b 4 1
5: b 5 1
基准:
# Time every proposed approach on the large `df`. In this data the rows of one
# user are interleaved with the other user's rows, so each expression indexes
# within its own group; otherwise the methods would not compute the same column.
a <- microbenchmark(
  wimpel = {
    df[, newindex := pmax(rowid(user) - shift_by, 0)]
    df[, spend_shifted := spend[replace(newindex, newindex < 1, NA)], by = user]
  },
  r2evans = {
    df[, spend_shifted := spend[{
      o <- seq_len(.N) - shift_by
      o[o < 1] <- NA
      o
    }], by = user]
  },
  sindri_1 = {
    df[, spend_shifted := {
      lags <- sort(unique(shift_by))
      # match() picks the column holding each row's lag.
      data.frame(shift(spend, n = lags))[cbind(seq_len(.N), match(shift_by, lags))]
    }, by = user]
  },
  sindri_2 = {
    df[, {
      rows <- seq_len(.N) - shift_by
      # `<= 0` as in the posted answer: negative positions must become NA too.
      .SD[replace(rows, rows <= 0, NA), spend]
    }, by = user]
  },
  talat = {
    # Within-user row number (not the table-wide `.I`) so the join stays
    # inside each user.
    df[, row := seq_len(.N), by = .(user)]
    df[, match_row := row - shift_by]
    df[df, on = .(user, match_row = row), x := i.spend]
    df[, c("row", "match_row") := NULL]
  },
  thomas = {
    df[, spend_shifted := spend[replace(seq(.N) - shift_by, seq(.N) <= shift_by, NA)], user]
  },
  times = 20
)
autoplot(a)
@ThomasIsCoding 和@r2evans 的方法几乎相同。
# Mean run time per expression (microbenchmark reports nanoseconds), fastest
# first. The benchmark result is a plain data frame, so its `expr` and `time`
# columns are copied into a data.table before using data.table syntax.
data.table(expr = a$expr, time = a$time)[, .(mean = mean(time)), by = expr][order(mean)]
expr mean
1: thomas 1974759530
2: r2evans 2121604845
3: sindri_2 2530492745
4: wimpel 4337907900
5: sindri_1 4585692780
6: talat 7252938170
我还在解析提供的所有方法的逻辑。我非常感谢你们贡献的方法(其中有很多)。我将在适当的时候对答案进行投票。
我有以下 df:
# Toy input: two users, with a per-row lag distance stored in `shift_by`.
df <- data.table(
  user = c("a", "a", "a", "b", "b"),
  spend = 1:5,
  shift_by = c(1, 1, 2, 1, 1)
)
df
user spend shift_by
1: a 1 1
2: a 2 1
3: a 3 2
4: b 4 1
5: b 5 1
这次我想创建超前/滞后列,但 data.table 的 shift 函数中的 n 参数需要是动态的,即逐行以 df$shift_by 作为输入。我的预期结果是:
# Target column written out by hand, to show what the result should look like.
df[, `:=`(spend_shifted = c(NA, 1, 1, NA, 4))]
df
user spend shift_by spend_shifted
1: a 1 1 NA
2: a 2 1 1
3: a 3 2 1
4: b 4 1 NA
5: b 5 1 4
然而,通过以下尝试,它给出了:
# Attempt that does NOT give the wanted result: as the printed table below
# shows, every value of spend_shifted comes out as NA.
# NOTE(review): shift() appears to treat a vector `n` as several whole-column
# offsets rather than one offset per row -- confirm against ?shift.
df[, spend_shifted := shift(x=spend, n=shift_by, type="lag"), user]; df
user spend shift_by spend_shifted
1: a 1 1 NA
2: a 2 1 NA
3: a 3 2 NA
4: b 4 1 NA
5: b 5 1 NA
data.table
解决方案。真心期待找到任何想法。
我相信这会奏效。您可以在之后删除新的索引列。
# Within-user position of the row to copy `spend` from; anything that would
# fall before the first row of the user is clamped to 0 ("no such row").
df[, newindex := pmax(rowid(user) - shift_by, 0)]
# Look the value up inside each user's group. Assigning a separately grouped
# result back to the filtered rows only lines up when each user's rows are
# contiguous; indexing per group keeps every value on its own row.
# Index 0 is turned into NA so those rows get NA.
df[, spend_shifted := spend[replace(newindex, newindex < 1, NA)], by = user]
# user spend shift_by newindex spend_shifted
# 1: a 1 1 0 NA
# 2: a 2 1 1 1
# 3: a 3 2 1 1
# 4: b 4 1 0 NA
# 5: b 5 1 1 4
使用 data.frames 的矩阵子集:
# Build one lagged copy of `spend` per distinct lag within the user, then pick
# for every row the copy matching its own `shift_by` through (row, column)
# matrix indexing.
df[,
   spend_shifted := {
     lags <- sort(unique(shift_by))
     lagged <- data.frame(shift(spend, n = lags))
     # match() maps each lag to the column that holds it; using the raw lag as
     # the column number is only right when the lags are exactly 1, 2, ..., k.
     lagged[cbind(seq_len(.N), match(shift_by, lags))]
   },
   by = user]
另一个不使用 shift 的解决方案(除了 Wimpel 的之外):
# For each user, compute the within-group row to read from and return the
# `spend` found there (NA when that row would lie before the group's start).
# Returns a new table; `df` itself is not modified.
df[, {
  src <- seq_len(.N) - shift_by
  src[src <= 0] <- NA
  .SD[src, spend]
}, by = user]
这是另一种方法,使用 data.table 的连接(join)。我借助两个辅助列进行连接:
# Helper 1: position of each row inside its own user group. `.I` would give
# positions in the whole table, which makes `row - shift_by` point at another
# user's row whenever a user's rows are not contiguous.
df[, row := seq_len(.N), by = .(user)]
# Helper 2: within-user position of the row whose `spend` should be copied.
df[, match_row := row - shift_by]
# Self-join: each row receives the `spend` of the same user's row whose `row`
# equals its `match_row`; rows without a match keep NA.
df[df, on = .(user, match_row = row), x := i.spend]
# Drop the helper columns again.
df[, c("row", "match_row") := NULL]
# user spend shift_by spend_shifted x
# 1: a 1 1 NA NA
# 2: a 2 1 1 1
# 3: a 3 2 1 1
# 4: b 4 1 NA NA
# 5: b 5 1 4 4
也许这会有所帮助
# Per user: read `spend` from `shift_by` rows earlier; positions that would fall
# before the user's first row become NA. The trailing `[]` prints the result.
# (The leading console prompt was removed so the line can be run as-is.)
df[, spend_shifted := {
  pos <- seq_len(.N)
  spend[replace(pos - shift_by, pos <= shift_by, NA)]
}, by = user][]
user spend shift_by spend_shifted
1: a 1 1 NA
2: a 2 1 1
3: a 3 2 1
4: b 4 1 NA
5: b 5 1 4
我进行了基准测试,因为可扩展性对我来说非常重要。
df 与原始数据相同,只是重复了 10,000,000 次,因此共有 50,000,000 行。
# Benchmark input: the 5-row toy pattern repeated `x` times (5 * x rows).
x <- 1e7
df <- data.table(
  user = rep(c("a", "a", "a", "b", "b"), times = x),
  spend = rep(1:5, times = x),
  shift_by = rep(c(1, 1, 2, 1, 1), times = x)
)
df
user spend shift_by
1: a 1 1
2: a 2 1
3: a 3 2
4: b 4 1
5: b 5 1
基准:
# Time every proposed approach on the large `df`. In this data the rows of one
# user are interleaved with the other user's rows, so each expression indexes
# within its own group; otherwise the methods would not compute the same column.
a <- microbenchmark(
  wimpel = {
    df[, newindex := pmax(rowid(user) - shift_by, 0)]
    df[, spend_shifted := spend[replace(newindex, newindex < 1, NA)], by = user]
  },
  r2evans = {
    df[, spend_shifted := spend[{
      o <- seq_len(.N) - shift_by
      o[o < 1] <- NA
      o
    }], by = user]
  },
  sindri_1 = {
    df[, spend_shifted := {
      lags <- sort(unique(shift_by))
      # match() picks the column holding each row's lag.
      data.frame(shift(spend, n = lags))[cbind(seq_len(.N), match(shift_by, lags))]
    }, by = user]
  },
  sindri_2 = {
    df[, {
      rows <- seq_len(.N) - shift_by
      # `<= 0` as in the posted answer: negative positions must become NA too.
      .SD[replace(rows, rows <= 0, NA), spend]
    }, by = user]
  },
  talat = {
    # Within-user row number (not the table-wide `.I`) so the join stays
    # inside each user.
    df[, row := seq_len(.N), by = .(user)]
    df[, match_row := row - shift_by]
    df[df, on = .(user, match_row = row), x := i.spend]
    df[, c("row", "match_row") := NULL]
  },
  thomas = {
    df[, spend_shifted := spend[replace(seq(.N) - shift_by, seq(.N) <= shift_by, NA)], user]
  },
  times = 20
)
autoplot(a)
@ThomasIsCoding 和@r2evans 的方法几乎相同。
# Mean run time per expression (microbenchmark reports nanoseconds), fastest
# first. The benchmark result is a plain data frame, so its `expr` and `time`
# columns are copied into a data.table before using data.table syntax.
data.table(expr = a$expr, time = a$time)[, .(mean = mean(time)), by = expr][order(mean)]
expr mean
1: thomas 1974759530
2: r2evans 2121604845
3: sindri_2 2530492745
4: wimpel 4337907900
5: sindri_1 4585692780
6: talat 7252938170
我还在解析提供的所有方法的逻辑。我非常感谢你们贡献的方法(其中有很多)。我将在适当的时候对答案进行投票。