按组训练插入符号中的时间序列模型
Train time series models in caret by group
我有如下的数据集
set.seed(503)
foo <- data.table(group = rep(LETTERS[1:6], 150),
y = rnorm(n = 6 * 150, mean = 5, sd = 2),
x1 = rnorm(n = 6 * 150, mean = 5, sd = 10),
x2 = rnorm(n = 6 * 150, mean = 25, sd = 10),
x3 = rnorm(n = 6 * 150, mean = 50, sd = 10),
x4 = rnorm(n = 6 * 150, mean = 0.5, sd = 10),
x5 = sample(c(1, 0), size = 6 * 150, replace = T))
foo[, period := 1:.N, by = group]
问题:我想使用变量 x1, ..., x5
为每个 group
提前预测 y
一步
我想 运行 caret
中的几个模型来决定我将使用哪个。
截至目前,我正在使用时间片运行将其循环
window.length <- 115
timecontrol <- trainControl(method = 'timeslice',
initialWindow = window.length,
horizon = 1,
selectionFunction = "best",
fixedWindow = TRUE,
savePredictions = 'final')
model_list <- list()
for(g in unique(foo$group)){
for(model in c("xgbTree", "earth", "cubist")){
dat <- foo[group == g][, c('group', 'period') := NULL]
model_list[[g]][[model]] <- train(y ~ . - 1,
data = dat,
method = model,
trControl = timecontrol)
}
}
但是,我想同时 运行 所有组,使用虚拟变量来识别每个组,比如
dat <- cbind(foo, model.matrix(~ group- 1, foo))
y x1 x2 x3 x4 x5 period groupA groupB groupC groupD groupE groupF
1: 5.710250 11.9615460 22.62916 31.04790 -4.821331e-04 1 1 1 0 0 0 0 0
2: 3.442213 8.6558983 32.41881 45.70801 3.255423e-01 1 1 0 1 0 0 0 0
3: 3.485286 7.7295448 21.99022 56.42133 8.668391e+00 1 1 0 0 1 0 0 0
4: 9.659601 0.9166456 30.34609 55.72661 -7.666063e+00 1 1 0 0 0 1 0 0
5: 5.567950 3.0306864 22.07813 52.21099 5.377153e-01 1 1 0 0 0 0 1 0
但仍然 运行使用 timeslice
以正确的时间顺序排列时间序列。
有没有办法在 trainControl
中声明 time
变量,所以我的 one step ahead
预测使用,在这种情况下,每轮多观察 6 个,并删除前 6 个意见?
我可以通过对数据进行排序并弄乱 horizon
参数来做到这一点(给定 n
组,按时间变量排序并放入 horizon = n
),但这必须如果组数改变,则改变。 initial.window
必须是 time * n_groups
timecontrol <- trainControl(method = 'timeslice',
initialWindow = window.length * length(unique(foo$group)),
horizon = length(unique(foo$group)),
selectionFunction = "best",
fixedWindow = TRUE,
savePredictions = 'final')
还有其他方法吗?
我会使用 tidyr::nest()
嵌套组,然后使用 purrr::map()
遍历数据。这种方法更加灵活,因为它可以适应不同的组大小、不同数量的组以及传递给 caret::train()
的变量模型或其他参数。此外,您可以轻松地 运行 使用 furrr
.
并行处理所有内容
加载包并创建数据
我使用 tibble
而不是 data.table
。我也减少了数据的大小。
library(caret)
library(tidyverse)
set.seed(503)
foo <- tibble(
group = rep(LETTERS[1:6], 10),
y = rnorm(n = 6 * 10, mean = 5, sd = 2),
x1 = rnorm(n = 6 * 10, mean = 5, sd = 10),
x2 = rnorm(n = 6 * 10, mean = 25, sd = 10),
x3 = rnorm(n = 6 * 10, mean = 50, sd = 10),
x4 = rnorm(n = 6 * 10, mean = 0.5, sd = 10),
x5 = sample(c(1, 0), size = 6 * 10, replace = T)
) %>%
group_by(group) %>%
mutate(period = row_number()) %>%
ungroup()
减小 initialWindow
尺寸
window.length <- 9
timecontrol <- trainControl(
method = 'timeslice',
initialWindow = window.length,
horizon = 1,
selectionFunction = "best",
fixedWindow = TRUE,
savePredictions = 'final'
)
创建一个函数,该函数将 return 拟合模型对象列表
# To fit each model in model_list to data and return model fits as a list.
fit_models <- function(data, model_list, timecontrol) {
map(model_list,
~ train(
y ~ . - 1,
data = data,
method = .x,
trControl = timecontrol
)) %>%
set_names(model_list)
}
适合模特
model_list <- c("xgbTree", "earth", "cubist")
mods <- foo %>%
nest(-group)
mods <- mods %>%
mutate(fits = map(
data,
~ fit_models(
data = .x,
model_list = model_list,
timecontrol = timecontrol
)
))
如果您想查看特定组/模型的结果,您可以执行以下操作:
mods[which(mods$group == "A"), ]$fits[[1]]$xgbTree
使用furrr
进行并行处理
只需用 plan(multiprocess)
初始化 worker 并将 map
更改为 future_map
。请注意,如果您的计算机的处理核心少于 6 个,您可能希望将工作人员数量更改为少于 6 个。
library(furrr)
plan(multiprocess, workers = 6)
mods <- foo %>%
nest(-group)
mods <- mods %>%
mutate(fits = future_map(
data,
~ fit_models(
data = .x,
model_list = model_list,
timecontrol = timecontrol
)
))
我想你要找的答案其实很简单。您可以使用 trainControl()
的 skip
参数在每个 train/test 集之后跳过所需的观察次数。这样每个组周期你只预测一次,同一周期永远不会在训练组和测试组之间分裂,不存在信息泄漏。
使用您提供的示例,如果您设置 skip = 6
和 horizon = 6
(组数),以及 initialWindow = 115
,那么第一个测试集将包括周期的所有组116,下一个测试集将包括周期 117 的所有组,依此类推。
library(caret)
library(tidyverse)
set.seed(503)
foo <- tibble(group = rep(LETTERS[1:6], 150),
y = rnorm(n = 6 * 150, mean = 5, sd = 2),
x1 = rnorm(n = 6 * 150, mean = 5, sd = 10),
x2 = rnorm(n = 6 * 150, mean = 25, sd = 10),
x3 = rnorm(n = 6 * 150, mean = 50, sd = 10),
x4 = rnorm(n = 6 * 150, mean = 0.5, sd = 10),
x5 = sample(c(1, 0), size = 6 * 150, replace = T)) %>%
group_by(group) %>%
mutate(period = row_number()) %>%
ungroup()
dat <- cbind(foo, model.matrix(~ group- 1, foo)) %>%
select(-group)
window.length <- 115
timecontrol <- trainControl(
method = 'timeslice',
initialWindow = window.length * length(unique(foo$group)),
horizon = length(unique(foo$group)),
skip = length(unique(foo$group)),
selectionFunction = "best",
fixedWindow = TRUE,
savePredictions = 'final'
)
model_names <- c("xgbTree", "earth", "cubist")
fits <- map(model_names,
~ train(
y ~ . - 1,
data = dat,
method = .x,
trControl = timecontrol
)) %>%
set_names(model_names)
我有如下的数据集
set.seed(503)
foo <- data.table(group = rep(LETTERS[1:6], 150),
y = rnorm(n = 6 * 150, mean = 5, sd = 2),
x1 = rnorm(n = 6 * 150, mean = 5, sd = 10),
x2 = rnorm(n = 6 * 150, mean = 25, sd = 10),
x3 = rnorm(n = 6 * 150, mean = 50, sd = 10),
x4 = rnorm(n = 6 * 150, mean = 0.5, sd = 10),
x5 = sample(c(1, 0), size = 6 * 150, replace = T))
foo[, period := 1:.N, by = group]
问题:我想使用变量 x1, ..., x5
group
提前预测 y
一步
我想 运行 caret
中的几个模型来决定我将使用哪个。
截至目前,我正在使用时间片运行将其循环
window.length <- 115
timecontrol <- trainControl(method = 'timeslice',
initialWindow = window.length,
horizon = 1,
selectionFunction = "best",
fixedWindow = TRUE,
savePredictions = 'final')
model_list <- list()
for(g in unique(foo$group)){
for(model in c("xgbTree", "earth", "cubist")){
dat <- foo[group == g][, c('group', 'period') := NULL]
model_list[[g]][[model]] <- train(y ~ . - 1,
data = dat,
method = model,
trControl = timecontrol)
}
}
但是,我想同时 运行 所有组,使用虚拟变量来识别每个组,比如
dat <- cbind(foo, model.matrix(~ group- 1, foo))
y x1 x2 x3 x4 x5 period groupA groupB groupC groupD groupE groupF
1: 5.710250 11.9615460 22.62916 31.04790 -4.821331e-04 1 1 1 0 0 0 0 0
2: 3.442213 8.6558983 32.41881 45.70801 3.255423e-01 1 1 0 1 0 0 0 0
3: 3.485286 7.7295448 21.99022 56.42133 8.668391e+00 1 1 0 0 1 0 0 0
4: 9.659601 0.9166456 30.34609 55.72661 -7.666063e+00 1 1 0 0 0 1 0 0
5: 5.567950 3.0306864 22.07813 52.21099 5.377153e-01 1 1 0 0 0 0 1 0
但仍然 运行使用 timeslice
以正确的时间顺序排列时间序列。
有没有办法在 trainControl
中声明 time
变量,所以我的 one step ahead
预测使用,在这种情况下,每轮多观察 6 个,并删除前 6 个意见?
我可以通过对数据进行排序并弄乱 horizon
参数来做到这一点(给定 n
组,按时间变量排序并放入 horizon = n
),但这必须如果组数改变,则改变。 initial.window
必须是 time * n_groups
timecontrol <- trainControl(method = 'timeslice',
initialWindow = window.length * length(unique(foo$group)),
horizon = length(unique(foo$group)),
selectionFunction = "best",
fixedWindow = TRUE,
savePredictions = 'final')
还有其他方法吗?
我会使用 tidyr::nest()
嵌套组,然后使用 purrr::map()
遍历数据。这种方法更加灵活,因为它可以适应不同的组大小、不同数量的组以及传递给 caret::train()
的变量模型或其他参数。此外,您可以轻松地 运行 使用 furrr
.
加载包并创建数据
我使用 tibble
而不是 data.table
。我也减少了数据的大小。
library(caret)
library(tidyverse)
set.seed(503)
foo <- tibble(
group = rep(LETTERS[1:6], 10),
y = rnorm(n = 6 * 10, mean = 5, sd = 2),
x1 = rnorm(n = 6 * 10, mean = 5, sd = 10),
x2 = rnorm(n = 6 * 10, mean = 25, sd = 10),
x3 = rnorm(n = 6 * 10, mean = 50, sd = 10),
x4 = rnorm(n = 6 * 10, mean = 0.5, sd = 10),
x5 = sample(c(1, 0), size = 6 * 10, replace = T)
) %>%
group_by(group) %>%
mutate(period = row_number()) %>%
ungroup()
减小 initialWindow
尺寸
window.length <- 9
timecontrol <- trainControl(
method = 'timeslice',
initialWindow = window.length,
horizon = 1,
selectionFunction = "best",
fixedWindow = TRUE,
savePredictions = 'final'
)
创建一个函数,该函数将 return 拟合模型对象列表
# To fit each model in model_list to data and return model fits as a list.
fit_models <- function(data, model_list, timecontrol) {
map(model_list,
~ train(
y ~ . - 1,
data = data,
method = .x,
trControl = timecontrol
)) %>%
set_names(model_list)
}
适合模特
model_list <- c("xgbTree", "earth", "cubist")
mods <- foo %>%
nest(-group)
mods <- mods %>%
mutate(fits = map(
data,
~ fit_models(
data = .x,
model_list = model_list,
timecontrol = timecontrol
)
))
如果您想查看特定组/模型的结果,您可以执行以下操作:
mods[which(mods$group == "A"), ]$fits[[1]]$xgbTree
使用furrr
进行并行处理
只需用 plan(multiprocess)
初始化 worker 并将 map
更改为 future_map
。请注意,如果您的计算机的处理核心少于 6 个,您可能希望将工作人员数量更改为少于 6 个。
library(furrr)
plan(multiprocess, workers = 6)
mods <- foo %>%
nest(-group)
mods <- mods %>%
mutate(fits = future_map(
data,
~ fit_models(
data = .x,
model_list = model_list,
timecontrol = timecontrol
)
))
我想你要找的答案其实很简单。您可以使用 trainControl()
的 skip
参数在每个 train/test 集之后跳过所需的观察次数。这样每个组周期你只预测一次,同一周期永远不会在训练组和测试组之间分裂,不存在信息泄漏。
使用您提供的示例,如果您设置 skip = 6
和 horizon = 6
(组数),以及 initialWindow = 115
,那么第一个测试集将包括周期的所有组116,下一个测试集将包括周期 117 的所有组,依此类推。
library(caret)
library(tidyverse)
set.seed(503)
foo <- tibble(group = rep(LETTERS[1:6], 150),
y = rnorm(n = 6 * 150, mean = 5, sd = 2),
x1 = rnorm(n = 6 * 150, mean = 5, sd = 10),
x2 = rnorm(n = 6 * 150, mean = 25, sd = 10),
x3 = rnorm(n = 6 * 150, mean = 50, sd = 10),
x4 = rnorm(n = 6 * 150, mean = 0.5, sd = 10),
x5 = sample(c(1, 0), size = 6 * 150, replace = T)) %>%
group_by(group) %>%
mutate(period = row_number()) %>%
ungroup()
dat <- cbind(foo, model.matrix(~ group- 1, foo)) %>%
select(-group)
window.length <- 115
timecontrol <- trainControl(
method = 'timeslice',
initialWindow = window.length * length(unique(foo$group)),
horizon = length(unique(foo$group)),
skip = length(unique(foo$group)),
selectionFunction = "best",
fixedWindow = TRUE,
savePredictions = 'final'
)
model_names <- c("xgbTree", "earth", "cubist")
fits <- map(model_names,
~ train(
y ~ . - 1,
data = dat,
method = .x,
trControl = timecontrol
)) %>%
set_names(model_names)