在 data.table 中创建多个前导变量
Create multiple lead variables in data.table
这个问题类似于"Creating a bunch of lagged variables in data.table at once"和"How to create a lag variable within each group?",但据我所知并不完全相同。
我想按 groups 分组创建多个前导(lead)变量,例如下面的 lead1、lead2 和 lead3。
示例数据
# Build the example data: group "a" has 10 rows and group "b" has 8 rows,
# `time` is the within-group index and `value` holds seeded random normals.
# library() is used instead of require() so that a missing data.table
# installation stops the script here rather than at the data.table() call.
library(data.table)
set.seed(1)
data <- data.table(
  time = c(1:10, 1:8),
  groups = rep(c("a", "b"), c(10, 8)),
  value = rnorm(18)
)
data
time groups value
1: 1 a -0.62645381
2: 2 a 0.18364332
3: 3 a -0.83562861
4: 4 a 1.59528080
5: 5 a 0.32950777
6: 6 a -0.82046838
7: 7 a 0.48742905
8: 8 a 0.73832471
9: 9 a 0.57578135
10: 10 a -0.30538839
11: 1 b 1.51178117
12: 2 b 0.38984324
13: 3 b -0.62124058
14: 4 b -2.21469989
15: 5 b 1.12493092
16: 6 b -0.04493361
17: 7 b -0.01619026
18: 8 b 0.94383621
结果 data.table 应该如下:
time groups value lead1 lead2 lead3
1 1 a -0.62645381 0.18364332 -0.83562861 1.59528080
2 2 a 0.18364332 -0.83562861 1.59528080 0.32950777
3 3 a -0.83562861 1.59528080 0.32950777 -0.82046838
4 4 a 1.59528080 0.32950777 -0.82046838 0.48742905
5 5 a 0.32950777 -0.82046838 0.48742905 0.73832471
6 6 a -0.82046838 0.48742905 0.73832471 0.57578135
7 7 a 0.48742905 0.73832471 0.57578135 -0.30538839
8 8 a 0.73832471 0.57578135 -0.30538839 NA
9 9 a 0.57578135 -0.30538839 NA NA
10 10 a -0.30538839 NA NA NA
11 1 b 1.51178117 0.38984324 -0.62124058 -2.21469989
12 2 b 0.38984324 -0.62124058 -2.21469989 1.12493092
13 3 b -0.62124058 -2.21469989 1.12493092 -0.04493361
14 4 b -2.21469989 1.12493092 -0.04493361 -0.01619026
15 5 b 1.12493092 -0.04493361 -0.01619026 0.94383621
16 6 b -0.04493361 -0.01619026 0.94383621 NA
17 7 b -0.01619026 0.94383621 NA NA
18 8 b 0.94383621 NA NA NA
请注意,我的实际数据集要大得多,我可能需要 3 个以上的前导变量。
我正在使用 data.table 1.9.4 版本,不确定何时才能更新到最新版本,因此如果解决方案能在该版本下运行就更好了。很抱歉增加了这个额外的限制。
提前致谢。
您可以使用 dplyr 中的 lead 函数,在一次 data.table 调用中完成,如下所示:
library(data.table)
library(dplyr)
# Add the three lead columns by reference, computed within each group.
data[
  ,
  c("lead1", "lead2", "lead3") := list(
    lead(value, 1),
    lead(value, 2),
    lead(value, 3)
  ),
  by = groups
]
输出
> data
time groups value lead1 lead2 lead3
1: 1 a -0.62645381 0.18364332 -0.83562861 1.59528080
2: 2 a 0.18364332 -0.83562861 1.59528080 0.32950777
3: 3 a -0.83562861 1.59528080 0.32950777 -0.82046838
4: 4 a 1.59528080 0.32950777 -0.82046838 0.48742905
5: 5 a 0.32950777 -0.82046838 0.48742905 0.73832471
6: 6 a -0.82046838 0.48742905 0.73832471 0.57578135
7: 7 a 0.48742905 0.73832471 0.57578135 -0.30538839
8: 8 a 0.73832471 0.57578135 -0.30538839 NA
9: 9 a 0.57578135 -0.30538839 NA NA
10: 10 a -0.30538839 NA NA NA
11: 1 b 1.51178117 0.38984324 -0.62124058 -2.21469989
12: 2 b 0.38984324 -0.62124058 -2.21469989 1.12493092
13: 3 b -0.62124058 -2.21469989 1.12493092 -0.04493361
14: 4 b -2.21469989 1.12493092 -0.04493361 -0.01619026
15: 5 b 1.12493092 -0.04493361 -0.01619026 0.94383621
16: 6 b -0.04493361 -0.01619026 0.94383621 NA
17: 7 b -0.01619026 0.94383621 NA NA
18: 8 b 0.94383621 NA NA NA
下面的函数会为 leads 参数中指定的每个前导步数各创建一个前导列;leads 只需是一个正整数向量。
library(data.table)
#' Build lead (look-ahead) columns for a vector.
#'
#' @param leads Vector of non-negative integer offsets. One output column is
#'   produced per element, in the same order.
#' @param values Vector to shift. Row i of the column for offset n holds
#'   values[i + n], or NA when i + n runs past the end of values.
#' @return A data.frame with length(leads) columns (named V1, V2, ...) and
#'   length(values) rows.
lead.n <- function(leads, values) {
  positions <- seq_along(values)
  cols <- lapply(leads, function(n) {
    # Indexing past the end yields NA, so every column keeps length(values)
    # rows even when n > length(values), and n = 0 returns values unchanged.
    values[positions + n]
  })
  # Keep the V1..Vk column names that as.data.frame() gives an unnamed matrix.
  names(cols) <- paste0("V", seq_along(cols))
  as.data.frame(cols, stringsAsFactors = FALSE)
}
# One lead column per offset in 1:3, computed separately within each group.
data[, paste0("lead", 1:3) := lead.n(1:3, value), by = groups]
time groups value lead1 lead2 lead3
1: 1 a -0.62645381 0.18364332 -0.83562861 1.59528080
2: 2 a 0.18364332 -0.83562861 1.59528080 0.32950777
3: 3 a -0.83562861 1.59528080 0.32950777 -0.82046838
4: 4 a 1.59528080 0.32950777 -0.82046838 0.48742905
5: 5 a 0.32950777 -0.82046838 0.48742905 0.73832471
6: 6 a -0.82046838 0.48742905 0.73832471 0.57578135
7: 7 a 0.48742905 0.73832471 0.57578135 -0.30538839
8: 8 a 0.73832471 0.57578135 -0.30538839 NA
9: 9 a 0.57578135 -0.30538839 NA NA
10: 10 a -0.30538839 NA NA NA
11: 1 b 1.51178117 0.38984324 -0.62124058 -2.21469989
12: 2 b 0.38984324 -0.62124058 -2.21469989 1.12493092
13: 3 b -0.62124058 -2.21469989 1.12493092 -0.04493361
14: 4 b -2.21469989 1.12493092 -0.04493361 -0.01619026
15: 5 b 1.12493092 -0.04493361 -0.01619026 0.94383621
16: 6 b -0.04493361 -0.01619026 0.94383621 NA
17: 7 b -0.01619026 0.94383621 NA NA
18: 8 b 0.94383621 NA NA NA
标准的 data.table 做法是使用内置的 shift 函数(链接的问题中已经提到过)。为此,您需要 CRAN 上的最新稳定版本,即 v1.9.6 及以上:
library(data.table) # V1.9.6+
# shift() takes a vector of offsets and returns one shifted vector per offset,
# so all three lead columns are assigned in a single grouped call.
data[
  ,
  paste0("lead", 1L:3L) := shift(value, n = 1L:3L, type = "lead"),
  by = groups
]
data
# time groups value lead1 lead2 lead3
# 1: 1 a -0.62645381 0.18364332 -0.83562861 1.59528080
# 2: 2 a 0.18364332 -0.83562861 1.59528080 0.32950777
# 3: 3 a -0.83562861 1.59528080 0.32950777 -0.82046838
# 4: 4 a 1.59528080 0.32950777 -0.82046838 0.48742905
# 5: 5 a 0.32950777 -0.82046838 0.48742905 0.73832471
# 6: 6 a -0.82046838 0.48742905 0.73832471 0.57578135
# 7: 7 a 0.48742905 0.73832471 0.57578135 -0.30538839
# 8: 8 a 0.73832471 0.57578135 -0.30538839 NA
# 9: 9 a 0.57578135 -0.30538839 NA NA
# 10: 10 a -0.30538839 NA NA NA
# 11: 1 b 1.51178117 0.38984324 -0.62124058 -2.21469989
# 12: 2 b 0.38984324 -0.62124058 -2.21469989 1.12493092
# 13: 3 b -0.62124058 -2.21469989 1.12493092 -0.04493361
# 14: 4 b -2.21469989 1.12493092 -0.04493361 -0.01619026
# 15: 5 b 1.12493092 -0.04493361 -0.01619026 0.94383621
# 16: 6 b -0.04493361 -0.01619026 0.94383621 NA
# 17: 7 b -0.01619026 0.94383621 NA NA
# 18: 8 b 0.94383621 NA NA NA
这个问题类似于"Creating a bunch of lagged variables in data.table at once"和"How to create a lag variable within each group?",但据我所知并不完全相同。
我想按 groups 分组创建多个前导(lead)变量,例如下面的 lead1、lead2 和 lead3。
示例数据
# Build the example data: group "a" has 10 rows and group "b" has 8 rows,
# `time` is the within-group index and `value` holds seeded random normals.
# library() is used instead of require() so that a missing data.table
# installation stops the script here rather than at the data.table() call.
library(data.table)
set.seed(1)
data <- data.table(
  time = c(1:10, 1:8),
  groups = rep(c("a", "b"), c(10, 8)),
  value = rnorm(18)
)
data
time groups value
1: 1 a -0.62645381
2: 2 a 0.18364332
3: 3 a -0.83562861
4: 4 a 1.59528080
5: 5 a 0.32950777
6: 6 a -0.82046838
7: 7 a 0.48742905
8: 8 a 0.73832471
9: 9 a 0.57578135
10: 10 a -0.30538839
11: 1 b 1.51178117
12: 2 b 0.38984324
13: 3 b -0.62124058
14: 4 b -2.21469989
15: 5 b 1.12493092
16: 6 b -0.04493361
17: 7 b -0.01619026
18: 8 b 0.94383621
结果 data.table 应该如下:
time groups value lead1 lead2 lead3
1 1 a -0.62645381 0.18364332 -0.83562861 1.59528080
2 2 a 0.18364332 -0.83562861 1.59528080 0.32950777
3 3 a -0.83562861 1.59528080 0.32950777 -0.82046838
4 4 a 1.59528080 0.32950777 -0.82046838 0.48742905
5 5 a 0.32950777 -0.82046838 0.48742905 0.73832471
6 6 a -0.82046838 0.48742905 0.73832471 0.57578135
7 7 a 0.48742905 0.73832471 0.57578135 -0.30538839
8 8 a 0.73832471 0.57578135 -0.30538839 NA
9 9 a 0.57578135 -0.30538839 NA NA
10 10 a -0.30538839 NA NA NA
11 1 b 1.51178117 0.38984324 -0.62124058 -2.21469989
12 2 b 0.38984324 -0.62124058 -2.21469989 1.12493092
13 3 b -0.62124058 -2.21469989 1.12493092 -0.04493361
14 4 b -2.21469989 1.12493092 -0.04493361 -0.01619026
15 5 b 1.12493092 -0.04493361 -0.01619026 0.94383621
16 6 b -0.04493361 -0.01619026 0.94383621 NA
17 7 b -0.01619026 0.94383621 NA NA
18 8 b 0.94383621 NA NA NA
请注意,我的实际数据集要大得多,我可能需要 3 个以上的前导变量。
我正在使用 data.table 1.9.4 版本,不确定何时才能更新到最新版本,因此如果解决方案能在该版本下运行就更好了。很抱歉增加了这个额外的限制。
提前致谢。
您可以使用 dplyr 中的 lead 函数,在一次 data.table 调用中完成,如下所示:
library(data.table)
library(dplyr)
# Add the three lead columns by reference, computed within each group.
data[
  ,
  c("lead1", "lead2", "lead3") := list(
    lead(value, 1),
    lead(value, 2),
    lead(value, 3)
  ),
  by = groups
]
输出
> data
time groups value lead1 lead2 lead3
1: 1 a -0.62645381 0.18364332 -0.83562861 1.59528080
2: 2 a 0.18364332 -0.83562861 1.59528080 0.32950777
3: 3 a -0.83562861 1.59528080 0.32950777 -0.82046838
4: 4 a 1.59528080 0.32950777 -0.82046838 0.48742905
5: 5 a 0.32950777 -0.82046838 0.48742905 0.73832471
6: 6 a -0.82046838 0.48742905 0.73832471 0.57578135
7: 7 a 0.48742905 0.73832471 0.57578135 -0.30538839
8: 8 a 0.73832471 0.57578135 -0.30538839 NA
9: 9 a 0.57578135 -0.30538839 NA NA
10: 10 a -0.30538839 NA NA NA
11: 1 b 1.51178117 0.38984324 -0.62124058 -2.21469989
12: 2 b 0.38984324 -0.62124058 -2.21469989 1.12493092
13: 3 b -0.62124058 -2.21469989 1.12493092 -0.04493361
14: 4 b -2.21469989 1.12493092 -0.04493361 -0.01619026
15: 5 b 1.12493092 -0.04493361 -0.01619026 0.94383621
16: 6 b -0.04493361 -0.01619026 0.94383621 NA
17: 7 b -0.01619026 0.94383621 NA NA
18: 8 b 0.94383621 NA NA NA
下面的函数会为 leads 参数中指定的每个前导步数各创建一个前导列;leads 只需是一个正整数向量。
library(data.table)
#' Build lead (look-ahead) columns for a vector.
#'
#' @param leads Vector of non-negative integer offsets. One output column is
#'   produced per element, in the same order.
#' @param values Vector to shift. Row i of the column for offset n holds
#'   values[i + n], or NA when i + n runs past the end of values.
#' @return A data.frame with length(leads) columns (named V1, V2, ...) and
#'   length(values) rows.
lead.n <- function(leads, values) {
  positions <- seq_along(values)
  cols <- lapply(leads, function(n) {
    # Indexing past the end yields NA, so every column keeps length(values)
    # rows even when n > length(values), and n = 0 returns values unchanged.
    values[positions + n]
  })
  # Keep the V1..Vk column names that as.data.frame() gives an unnamed matrix.
  names(cols) <- paste0("V", seq_along(cols))
  as.data.frame(cols, stringsAsFactors = FALSE)
}
# One lead column per offset in 1:3, computed separately within each group.
data[, paste0("lead", 1:3) := lead.n(1:3, value), by = groups]
time groups value lead1 lead2 lead3
1: 1 a -0.62645381 0.18364332 -0.83562861 1.59528080
2: 2 a 0.18364332 -0.83562861 1.59528080 0.32950777
3: 3 a -0.83562861 1.59528080 0.32950777 -0.82046838
4: 4 a 1.59528080 0.32950777 -0.82046838 0.48742905
5: 5 a 0.32950777 -0.82046838 0.48742905 0.73832471
6: 6 a -0.82046838 0.48742905 0.73832471 0.57578135
7: 7 a 0.48742905 0.73832471 0.57578135 -0.30538839
8: 8 a 0.73832471 0.57578135 -0.30538839 NA
9: 9 a 0.57578135 -0.30538839 NA NA
10: 10 a -0.30538839 NA NA NA
11: 1 b 1.51178117 0.38984324 -0.62124058 -2.21469989
12: 2 b 0.38984324 -0.62124058 -2.21469989 1.12493092
13: 3 b -0.62124058 -2.21469989 1.12493092 -0.04493361
14: 4 b -2.21469989 1.12493092 -0.04493361 -0.01619026
15: 5 b 1.12493092 -0.04493361 -0.01619026 0.94383621
16: 6 b -0.04493361 -0.01619026 0.94383621 NA
17: 7 b -0.01619026 0.94383621 NA NA
18: 8 b 0.94383621 NA NA NA
标准的 data.table 做法是使用内置的 shift 函数(链接的问题中已经提到过)。为此,您需要 CRAN 上的最新稳定版本,即 v1.9.6 及以上:
library(data.table) # V1.9.6+
# shift() takes a vector of offsets and returns one shifted vector per offset,
# so all three lead columns are assigned in a single grouped call.
data[
  ,
  paste0("lead", 1L:3L) := shift(value, n = 1L:3L, type = "lead"),
  by = groups
]
data
# time groups value lead1 lead2 lead3
# 1: 1 a -0.62645381 0.18364332 -0.83562861 1.59528080
# 2: 2 a 0.18364332 -0.83562861 1.59528080 0.32950777
# 3: 3 a -0.83562861 1.59528080 0.32950777 -0.82046838
# 4: 4 a 1.59528080 0.32950777 -0.82046838 0.48742905
# 5: 5 a 0.32950777 -0.82046838 0.48742905 0.73832471
# 6: 6 a -0.82046838 0.48742905 0.73832471 0.57578135
# 7: 7 a 0.48742905 0.73832471 0.57578135 -0.30538839
# 8: 8 a 0.73832471 0.57578135 -0.30538839 NA
# 9: 9 a 0.57578135 -0.30538839 NA NA
# 10: 10 a -0.30538839 NA NA NA
# 11: 1 b 1.51178117 0.38984324 -0.62124058 -2.21469989
# 12: 2 b 0.38984324 -0.62124058 -2.21469989 1.12493092
# 13: 3 b -0.62124058 -2.21469989 1.12493092 -0.04493361
# 14: 4 b -2.21469989 1.12493092 -0.04493361 -0.01619026
# 15: 5 b 1.12493092 -0.04493361 -0.01619026 0.94383621
# 16: 6 b -0.04493361 -0.01619026 0.94383621 NA
# 17: 7 b -0.01619026 0.94383621 NA NA
# 18: 8 b 0.94383621 NA NA NA