通过 `do` 平滑每组
Smooth every group via `do`
我有一些数据,下面是其中的一个示例。我的目标是对每一年应用 gam
,并获得另一个值,即来自 gam 模型的预测值。
fertility <- structure(list(AGE = c(15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L,
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L,
36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L
), Year = c(1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930,
1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930,
1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1931,
1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931,
1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931,
1931, 1931, 1931, 1931, 1931, 1931, 1931), fertility = c(5.170284269,
14.18135114, 27.69795144, 44.61216712, 59.08896308, 89.66036496,
105.4563852, 120.1754041, 137.4074262, 148.7159407, 161.5645606,
157.200515, 143.6340251, 127.8855125, 117.7343628, 159.2909484,
126.6158821, 109.0681613, 86.98223678, 70.64470361, 111.0070633,
86.15051988, 68.9204159, 55.92722274, 42.93402958, 56.84376018,
39.35337243, 26.72142573, 18.46207596, 9.231037978, 4.769704534,
13.08261815, 25.55198857, 41.15573626, 54.51090896, 81.99522459,
96.44082973, 109.9015072, 125.6603492, 136.0020892, 148.679958,
144.6639404, 132.1793638, 117.6867783, 108.345172, 144.2820726,
114.68575, 98.79142865, 78.7865069, 63.9883456, 100.217918, 77.77726461,
62.22181169, 50.49147014, 38.76112859, 52.48807067, 36.33789508,
24.67387938, 17.04740757, 8.523703784)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -60L), .Names = c("AGE",
"Year", "fertility"))
所以,非 dplyr,"dumb" 的方法是
count <- 0
for (i in 1930:1931){
count <- count + 1
temp <- filter(fertility, Year == i)
mod <- mgcv::gam(fertility ~ s(AGE), data=temp)
pred[length(15:44) * (count - 1) + 1:30] <- predict(mod, newdata = data.frame(AGE = 15:44))
}
fertility1 <- mutate(fertility, pred = pred)
但我想要 dplyr
中的方法。我的想法是使用 do
为每一列创建一个模型,然后使用 predict
获取值。第一步我可以做,但我很难在 dplyr
:
中实现第二部分
library(mgcv)
library(dplyr)
fertility %>%
#filter(!is.na(fertility)) %>% # not sure if this is necessary
group_by(Year) %>%
dplyr::do(model = mgcv::gam(fertility ~ s(AGE), data = .)) %>%
left_join(fertility, .) %>%
mutate(smoothed = predict(model, newdata = AGE))
我收到错误消息
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "list"
这大概意味着 dplyr
不记得 model
是一个模型,而不仅仅是一个列表元素。
非 dplyr,"smart" 的方法是
do.call(rbind,
lapply(split(fertility, fertility$Year), function(df) {
df$pred <- predict(gam(fertility ~ s(AGE), data=df))
df
}))
参见 ?do.call
、?lapply
和 ?split
。
或者,如果您不喜欢嵌套函数调用:
fertility %>%
split(fertility$Year) %>%
lapply(function(df) {
df$pred <- predict(gam(fertility ~ s(AGE), data=df))
df
}) %>%
do.call(rbind, .)
相同结果使用:
predt=by(fertility[,-2],fertility[,2],function(z){
mod=mgcv::gam(fertility ~ s(AGE), data=z)
pred = predict(mod, newdata = data.frame(AGE = z$AGE))
pred
})
fertility$pred = unlist(predt)
在 do
结果中保留原始 data.frame, 按照@Henrik 的建议:
df %>%
group_by(Year) %>%
do(data.frame(.,pred = predict(gam(fertility ~ s(AGE), data=.))))
将data.table
添加到链中。
require(data.table)
df %>%
data.table %>%
group_by(Year) %>%
mutate(pred = predict(gam(fertility ~ s(AGE))))
没有 data.table
行的 mutate
失败可能与最近 gam
范围界定 mentioned briefly by @GavinSimpson in chat.
的变化有关
聪明的方法是使用在mgcv中已经存在多年的因子-平滑交互作用,要么通过by
s()
中的术语或通过较新的 bs = "fs"
基础类型。以下是您的数据示例:
library("mgcv")
## Make Year a factor
fertility <- transform(fertility, Year = factor(Year))
## Fit model using by terms - include factor as fixed effect too!
mod <- gam(fertility ~ Year + s(AGE, by = Year), data = fertility)
## Plot to see what form this model takes
plot(mod, pages = 1)
## Some prediction data
ages <- with(fertility, seq(min(AGE), max(AGE)))
## Need to replicate this once per Year
pdat <- with(fertility,
data.frame(AGE = rep(ages, nlevels(Year)),
Year = rep(levels(Year), each = length(ages))))
## Add the fitted values to the prediction data
pdat <- transform(pdat, fitted = predict(mod, newdata = pdat))
head(pdat)
> head(pdat)
AGE Year fitted
1 15 1930 -0.8496705
2 16 1930 15.9568574
3 17 1930 33.0754019
4 18 1930 50.7419122
5 19 1930 68.9116594
6 20 1930 87.1306489
但是,如果您只想预测 AGES
:
的观察值,则可以只询问拟合值
fertility <- transform(fertility, fitted = predict(mod))
head(fertility)
> head(fertility)
AGE Year fertility fitted
1 15 1930 5.170284 -0.8496705
2 16 1930 14.181351 15.9568574
3 17 1930 27.697951 33.0754019
4 18 1930 44.612167 50.7419122
5 19 1930 59.088963 68.9116594
6 20 1930 89.660365 87.1306489
具体的factor-smooth basis type bs = "fs"
and ?smooth.terms
and ?factor.smooth.interaction
你也可以看看详情;基本上,如果您有很多级别,但您希望每个级别的平滑器具有相同的平滑参数值,这些都是有效的。
这里的主要优点是您可以使用所有数据并拟合一个模型,然后您可以通过多种方式查询如果您拟合 m 单独的模型,例如能够调查每年平滑器的差异。
我有一些数据,下面是其中的一个示例。我的目标是对每一年应用 gam
,并获得另一个值,即来自 gam 模型的预测值。
fertility <- structure(list(AGE = c(15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L,
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L,
36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L
), Year = c(1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930,
1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930,
1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1931,
1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931,
1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931, 1931,
1931, 1931, 1931, 1931, 1931, 1931, 1931), fertility = c(5.170284269,
14.18135114, 27.69795144, 44.61216712, 59.08896308, 89.66036496,
105.4563852, 120.1754041, 137.4074262, 148.7159407, 161.5645606,
157.200515, 143.6340251, 127.8855125, 117.7343628, 159.2909484,
126.6158821, 109.0681613, 86.98223678, 70.64470361, 111.0070633,
86.15051988, 68.9204159, 55.92722274, 42.93402958, 56.84376018,
39.35337243, 26.72142573, 18.46207596, 9.231037978, 4.769704534,
13.08261815, 25.55198857, 41.15573626, 54.51090896, 81.99522459,
96.44082973, 109.9015072, 125.6603492, 136.0020892, 148.679958,
144.6639404, 132.1793638, 117.6867783, 108.345172, 144.2820726,
114.68575, 98.79142865, 78.7865069, 63.9883456, 100.217918, 77.77726461,
62.22181169, 50.49147014, 38.76112859, 52.48807067, 36.33789508,
24.67387938, 17.04740757, 8.523703784)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -60L), .Names = c("AGE",
"Year", "fertility"))
所以,非 dplyr,"dumb" 的方法是
count <- 0
for (i in 1930:1931){
count <- count + 1
temp <- filter(fertility, Year == i)
mod <- mgcv::gam(fertility ~ s(AGE), data=temp)
pred[length(15:44) * (count - 1) + 1:30] <- predict(mod, newdata = data.frame(AGE = 15:44))
}
fertility1 <- mutate(fertility, pred = pred)
但我想要 dplyr
中的方法。我的想法是使用 do
为每一列创建一个模型,然后使用 predict
获取值。第一步我可以做,但我很难在 dplyr
:
library(mgcv)
library(dplyr)
fertility %>%
#filter(!is.na(fertility)) %>% # not sure if this is necessary
group_by(Year) %>%
dplyr::do(model = mgcv::gam(fertility ~ s(AGE), data = .)) %>%
left_join(fertility, .) %>%
mutate(smoothed = predict(model, newdata = AGE))
我收到错误消息
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "list"
这大概意味着 dplyr
不记得 model
是一个模型,而不仅仅是一个列表元素。
非 dplyr,"smart" 的方法是
do.call(rbind,
lapply(split(fertility, fertility$Year), function(df) {
df$pred <- predict(gam(fertility ~ s(AGE), data=df))
df
}))
参见 ?do.call
、?lapply
和 ?split
。
或者,如果您不喜欢嵌套函数调用:
fertility %>%
split(fertility$Year) %>%
lapply(function(df) {
df$pred <- predict(gam(fertility ~ s(AGE), data=df))
df
}) %>%
do.call(rbind, .)
相同结果使用:
predt=by(fertility[,-2],fertility[,2],function(z){
mod=mgcv::gam(fertility ~ s(AGE), data=z)
pred = predict(mod, newdata = data.frame(AGE = z$AGE))
pred
})
fertility$pred = unlist(predt)
在 do
结果中保留原始 data.frame, 按照@Henrik 的建议:
df %>%
group_by(Year) %>%
do(data.frame(.,pred = predict(gam(fertility ~ s(AGE), data=.))))
将data.table
添加到链中。
require(data.table)
df %>%
data.table %>%
group_by(Year) %>%
mutate(pred = predict(gam(fertility ~ s(AGE))))
没有 data.table
行的 mutate
失败可能与最近 gam
范围界定 mentioned briefly by @GavinSimpson in chat.
聪明的方法是使用在mgcv中已经存在多年的因子-平滑交互作用,要么通过by
s()
中的术语或通过较新的 bs = "fs"
基础类型。以下是您的数据示例:
library("mgcv")
## Make Year a factor
fertility <- transform(fertility, Year = factor(Year))
## Fit model using by terms - include factor as fixed effect too!
mod <- gam(fertility ~ Year + s(AGE, by = Year), data = fertility)
## Plot to see what form this model takes
plot(mod, pages = 1)
## Some prediction data
ages <- with(fertility, seq(min(AGE), max(AGE)))
## Need to replicate this once per Year
pdat <- with(fertility,
data.frame(AGE = rep(ages, nlevels(Year)),
Year = rep(levels(Year), each = length(ages))))
## Add the fitted values to the prediction data
pdat <- transform(pdat, fitted = predict(mod, newdata = pdat))
head(pdat)
> head(pdat)
AGE Year fitted
1 15 1930 -0.8496705
2 16 1930 15.9568574
3 17 1930 33.0754019
4 18 1930 50.7419122
5 19 1930 68.9116594
6 20 1930 87.1306489
但是,如果您只想预测 AGES
:
fertility <- transform(fertility, fitted = predict(mod))
head(fertility)
> head(fertility)
AGE Year fertility fitted
1 15 1930 5.170284 -0.8496705
2 16 1930 14.181351 15.9568574
3 17 1930 27.697951 33.0754019
4 18 1930 44.612167 50.7419122
5 19 1930 59.088963 68.9116594
6 20 1930 89.660365 87.1306489
具体的factor-smooth basis type bs = "fs"
and ?smooth.terms
and ?factor.smooth.interaction
你也可以看看详情;基本上,如果您有很多级别,但您希望每个级别的平滑器具有相同的平滑参数值,这些都是有效的。
这里的主要优点是您可以使用所有数据并拟合一个模型,然后您可以通过多种方式查询如果您拟合 m 单独的模型,例如能够调查每年平滑器的差异。