使用 purrr 代替 sapply 计算移动平均线
Calculating moving average with purrr instead sapply
我正在使用 sapply 来计算移动平均值,以及如何使 purrr 解决方案类型安全但失败了。
第一次尝试是
pmap_dbl 但结果错误 - 比较 ra <> purrr_ra1.
第二次尝试是
map_dbl 但产生了错误。
“mutate_impl(.data, dots) 中的错误:
列 purr_ra2
的长度必须为 31(行数)或一,而不是 6"
诸如 zoo 和 RcppRoll 之类的带有滚动/窗口操作的软件包正在考虑 "left"、"right"、"center" 与 window 的对齐方式,但我的情况并非如此情况。
有人可以帮忙吗?
library(tidyverse)
df <- tribble(
~Day, ~val, ~bw, ~fw,
'01-01-2020', 0, 8, 4,
'02-01-2020', 73.5, 8, 4,
'03-01-2020', 540, 8, 4,
'04-01-2020', 0, 8, 4,
'05-01-2020', 57, 8, 4,
'06-01-2020', 20, 8, 4,
'07-01-2020', 690, 8, 4,
'08-01-2020', 40, 8, 4,
'09-01-2020', 38, 8, 4,
'10-01-2020', 60, 8, 4,
'11-01-2020', 0, 8, 4,
'12-01-2020', 40, 8, 4,
'13-01-2020', 40, 8, 4,
'14-01-2020', 225, 8, 4,
'15-01-2020', 77, 8, 4,
'16-01-2020', 0, 8, 4,
'17-01-2020', 153, 8, 4,
'18-01-2020', 950, 8, 4,
'19-01-2020', 124, 8, 4,
'20-01-2020', 80, 8, 4,
'21-01-2020', 0, 8, 4,
'22-01-2020', 80, 8, 4,
'23-01-2020', 766.5, 8, 4,
'24-01-2020', 334, 8, 4,
'25-01-2020', 660, 8, 4,
'26-01-2020', 120, 8, 4,
'27-01-2020', 545, 8, 4,
'28-01-2020', 145, 8, 4,
'29-01-2020', 38.5, 8, 4,
'30-01-2020', 20, 8, 4,
'31-01-2020', 760, 8, 4)
df <- df %>% mutate(Day = as.Date(Day,"%d-%m-%Y"),
fw = as.integer(fw),
bw = as.integer(bw))
df <- df %>% mutate(ra = sapply(seq_along(df$Day), function(x) mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])))
df <- df %>% mutate(purrr_ra1 = pmap_dbl(., function(x,val, Day, fw, bw, ...) mean(val[Day <= Day[x] + fw[x] & Day > Day[x] - bw[x]])))
# df <- df %>% mutate(purrr_ra2 = map_dbl(., function(x) mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])))
实际上 rollapply
是适用的——宽度参数可以是一个偏移向量列表——每行一个,或者如果所有偏移都相同,如问题中那样,则为单分量列表包含一个被回收的偏移向量。
答案1使用单个偏移量向量,适用于题例中每行偏移量相同的情况[=15=]
答案 2 比此处需要的更具普遍性,但如果偏移量因行而异,则将很有用。
答案 3 与第一个答案一样,要求所有行的偏移量都相同。它表明 rollapply 可以在没有 width=list(...)
功能的情况下使用,方法是在输入的两边填充适当数量的 NA。
library(zoo)
# baseline for comparison - from question
ans0 <- sapply(seq_along(df$Day), function(x) {
mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])
})
# 1
ans1 <- rollapply(df$val, list(seq(-7, 4)), mean, partial = TRUE)
# 2
w <- Map(seq, -df$bw + 1, df$fw)
ans2 <- rollapply(df$val, w, mean, partial = TRUE)
# 3
ans3 <- rollapply(c(rep(NA, 7), df$val, rep(NA, 4)), 12, mean, na.rm = TRUE)
identical(ans0, ans1)
## [1] TRUE
identical(ans0, ans2)
## [1] TRUE
identical(ans0, ans3)
## [1] TRUE
注: df
假定为:
df <- structure(list(Day = structure(c(18262, 18263, 18264, 18265,
18266, 18267, 18268, 18269, 18270, 18271, 18272, 18273, 18274,
18275, 18276, 18277, 18278, 18279, 18280, 18281, 18282, 18283,
18284, 18285, 18286, 18287, 18288, 18289, 18290, 18291, 18292
), class = "Date"), val = c(0, 73.5, 540, 0, 57, 20, 690, 40,
38, 60, 0, 40, 40, 225, 77, 0, 153, 950, 124, 80, 0, 80, 766.5,
334, 660, 120, 545, 145, 38.5, 20, 760), bw = c(8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L), fw = c(4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L)), .Names = c("Day",
"val", "bw", "fw"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-31L))
对于这个特定的问题,我们可以利用常量偏移量并使用 tidyquant
获得适当的滞后作为列,然后按行取平均值。
library(tidyquant)
df$ra2 <- df %>%
tq_transmute(val, lag.xts, k = -4:7) %>%
select(-Day) %>%
rowMeans(na.rm = TRUE)
假设 df
如原始问题中所述。对于灵活的偏移量,我喜欢@g-grothendieck
的方法
说实话 - 我在 3 个月前尝试了 zoo 和 rollapply,但 window 失败了。
这是历史,也是标杆。
你的解决方案是
最好的!
library(tidyverse)
library(zoo)
library(microbenchmark)
# df as in the initial coding
# history
# 1. version with lapply
calc_ra = function(x, df) {
begin_date = x - df$bw[df$Day == x]
end_date = x + df$fw[df$Day == x]
res <- df %>% filter(Day > begin_date &
Day <= end_date) %>%
summarize(mv = mean(val))
return(res)
}
ra_lapply <- function(df) {
df <- data.frame(df, ra_lapply = unlist(lapply(df$Day, function(x)
calc_ra(x, df))))
}
# 2. version with zoo 3 month ago
ra_rollapply1 <- function(df){
df <- df %>% mutate(w1 = as.double.difftime(bw - 1 + fw))
df <- df %>% mutate(ra_rollapply1 = rollapply(val, w1, mean, partial = TRUE))
}
# 3. version with sapply
ra_sapply <- function(df){
df <-
df %>% mutate(ra_sapply = sapply(seq_along(df$Day), function(x)
mean(df$val[df$Day <= df$Day[x] + df$fw[x] &
df$Day > df$Day[x] - df$bw[x]])))
}
# 4. version from yesterday
ra_map_dbl <- function(df){
df <- df %>% mutate(ra_map_dbl = map_dbl(seq_along(df$Day), function(x) mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])))
}
# 5. version with zoo from yesterday
ra_rollapply2 <- function(df){
w <- Map(seq, -df$bw + 1, df$fw)
df <- df %>% mutate(ra_rollapply2 = rollapply(val, w, mean, partial = TRUE))
}
df1 <- ra_lapply(df)
df2 <- ra_rollapply1(df1)
df3 <- ra_sapply(df1)
df4 <- ra_map_dbl(df1)
df5 <- ra_rollapply2(df1)
identical(df2$ra_lapply, df2$ra_rollapply1)
[1] FALSE
identical(df3$ra_lapply, df3$ra_sapply)
[1] TRUE
identical(df4$ra_lapply, df4$ra_map_dbl)
[1] TRUE
identical(df5$ra_lapply, df5$ra_rollapply2)
[1] TRUE
res <- microbenchmark(
ra_lapply(df),
ra_rollapply1(df),
ra_sapply(df),
ra_map_dbl(df),
ra_rollapply2(df),
times=1000L)
print(res)
Unit: milliseconds
expr min lq mean median uq max neval
ra_lapply(df) 104.205800 111.077701 119.316653 113.290395 116.749113 287.685832 1000
ra_rollapply1(df) 4.318322 4.606702 5.140784 4.744533 5.017736 17.593661 1000
ra_sapply(df) 15.383019 16.301282 17.992554 16.738366 18.629451 83.400164 1000
ra_map_dbl(df) 15.418707 16.352354 17.965034 16.823075 18.628220 106.660109 1000
ra_rollapply2(df) 2.629061 2.825758 3.229295 2.926465 3.099371 9.891077 1000
我正在使用 sapply 来计算移动平均值,以及如何使 purrr 解决方案类型安全但失败了。
第一次尝试是 pmap_dbl 但结果错误 - 比较 ra <> purrr_ra1.
第二次尝试是 map_dbl 但产生了错误。
“mutate_impl(.data, dots) 中的错误:
列 purr_ra2
的长度必须为 31(行数)或一,而不是 6"
诸如 zoo 和 RcppRoll 之类的带有滚动/窗口操作的软件包正在考虑 "left"、"right"、"center" 与 window 的对齐方式,但我的情况并非如此情况。
有人可以帮忙吗?
library(tidyverse)
df <- tribble(
~Day, ~val, ~bw, ~fw,
'01-01-2020', 0, 8, 4,
'02-01-2020', 73.5, 8, 4,
'03-01-2020', 540, 8, 4,
'04-01-2020', 0, 8, 4,
'05-01-2020', 57, 8, 4,
'06-01-2020', 20, 8, 4,
'07-01-2020', 690, 8, 4,
'08-01-2020', 40, 8, 4,
'09-01-2020', 38, 8, 4,
'10-01-2020', 60, 8, 4,
'11-01-2020', 0, 8, 4,
'12-01-2020', 40, 8, 4,
'13-01-2020', 40, 8, 4,
'14-01-2020', 225, 8, 4,
'15-01-2020', 77, 8, 4,
'16-01-2020', 0, 8, 4,
'17-01-2020', 153, 8, 4,
'18-01-2020', 950, 8, 4,
'19-01-2020', 124, 8, 4,
'20-01-2020', 80, 8, 4,
'21-01-2020', 0, 8, 4,
'22-01-2020', 80, 8, 4,
'23-01-2020', 766.5, 8, 4,
'24-01-2020', 334, 8, 4,
'25-01-2020', 660, 8, 4,
'26-01-2020', 120, 8, 4,
'27-01-2020', 545, 8, 4,
'28-01-2020', 145, 8, 4,
'29-01-2020', 38.5, 8, 4,
'30-01-2020', 20, 8, 4,
'31-01-2020', 760, 8, 4)
df <- df %>% mutate(Day = as.Date(Day,"%d-%m-%Y"),
fw = as.integer(fw),
bw = as.integer(bw))
df <- df %>% mutate(ra = sapply(seq_along(df$Day), function(x) mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])))
df <- df %>% mutate(purrr_ra1 = pmap_dbl(., function(x,val, Day, fw, bw, ...) mean(val[Day <= Day[x] + fw[x] & Day > Day[x] - bw[x]])))
# df <- df %>% mutate(purrr_ra2 = map_dbl(., function(x) mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])))
实际上 rollapply
是适用的——宽度参数可以是一个偏移向量列表——每行一个,或者如果所有偏移都相同,如问题中那样,则为单分量列表包含一个被回收的偏移向量。
答案1使用单个偏移量向量,适用于题例中每行偏移量相同的情况[=15=]
答案 2 比此处需要的更具普遍性,但如果偏移量因行而异,则将很有用。
答案 3 与第一个答案一样,要求所有行的偏移量都相同。它表明 rollapply 可以在没有 width=list(...)
功能的情况下使用,方法是在输入的两边填充适当数量的 NA。
library(zoo)
# baseline for comparison - from question
ans0 <- sapply(seq_along(df$Day), function(x) {
mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])
})
# 1
ans1 <- rollapply(df$val, list(seq(-7, 4)), mean, partial = TRUE)
# 2
w <- Map(seq, -df$bw + 1, df$fw)
ans2 <- rollapply(df$val, w, mean, partial = TRUE)
# 3
ans3 <- rollapply(c(rep(NA, 7), df$val, rep(NA, 4)), 12, mean, na.rm = TRUE)
identical(ans0, ans1)
## [1] TRUE
identical(ans0, ans2)
## [1] TRUE
identical(ans0, ans3)
## [1] TRUE
注: df
假定为:
df <- structure(list(Day = structure(c(18262, 18263, 18264, 18265,
18266, 18267, 18268, 18269, 18270, 18271, 18272, 18273, 18274,
18275, 18276, 18277, 18278, 18279, 18280, 18281, 18282, 18283,
18284, 18285, 18286, 18287, 18288, 18289, 18290, 18291, 18292
), class = "Date"), val = c(0, 73.5, 540, 0, 57, 20, 690, 40,
38, 60, 0, 40, 40, 225, 77, 0, 153, 950, 124, 80, 0, 80, 766.5,
334, 660, 120, 545, 145, 38.5, 20, 760), bw = c(8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L), fw = c(4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L)), .Names = c("Day",
"val", "bw", "fw"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-31L))
对于这个特定的问题,我们可以利用常量偏移量并使用 tidyquant
获得适当的滞后作为列,然后按行取平均值。
library(tidyquant)
df$ra2 <- df %>%
tq_transmute(val, lag.xts, k = -4:7) %>%
select(-Day) %>%
rowMeans(na.rm = TRUE)
假设 df
如原始问题中所述。对于灵活的偏移量,我喜欢@g-grothendieck
说实话 - 我在 3 个月前尝试了 zoo 和 rollapply,但 window 失败了。 这是历史,也是标杆。
你的解决方案是 最好的!
library(tidyverse)
library(zoo)
library(microbenchmark)
# df as in the initial coding
# history
# 1. version with lapply
calc_ra = function(x, df) {
begin_date = x - df$bw[df$Day == x]
end_date = x + df$fw[df$Day == x]
res <- df %>% filter(Day > begin_date &
Day <= end_date) %>%
summarize(mv = mean(val))
return(res)
}
ra_lapply <- function(df) {
df <- data.frame(df, ra_lapply = unlist(lapply(df$Day, function(x)
calc_ra(x, df))))
}
# 2. version with zoo 3 month ago
ra_rollapply1 <- function(df){
df <- df %>% mutate(w1 = as.double.difftime(bw - 1 + fw))
df <- df %>% mutate(ra_rollapply1 = rollapply(val, w1, mean, partial = TRUE))
}
# 3. version with sapply
ra_sapply <- function(df){
df <-
df %>% mutate(ra_sapply = sapply(seq_along(df$Day), function(x)
mean(df$val[df$Day <= df$Day[x] + df$fw[x] &
df$Day > df$Day[x] - df$bw[x]])))
}
# 4. version from yesterday
ra_map_dbl <- function(df){
df <- df %>% mutate(ra_map_dbl = map_dbl(seq_along(df$Day), function(x) mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])))
}
# 5. version with zoo from yesterday
ra_rollapply2 <- function(df){
w <- Map(seq, -df$bw + 1, df$fw)
df <- df %>% mutate(ra_rollapply2 = rollapply(val, w, mean, partial = TRUE))
}
df1 <- ra_lapply(df)
df2 <- ra_rollapply1(df1)
df3 <- ra_sapply(df1)
df4 <- ra_map_dbl(df1)
df5 <- ra_rollapply2(df1)
identical(df2$ra_lapply, df2$ra_rollapply1)
[1] FALSE
identical(df3$ra_lapply, df3$ra_sapply)
[1] TRUE
identical(df4$ra_lapply, df4$ra_map_dbl)
[1] TRUE
identical(df5$ra_lapply, df5$ra_rollapply2)
[1] TRUE
res <- microbenchmark(
ra_lapply(df),
ra_rollapply1(df),
ra_sapply(df),
ra_map_dbl(df),
ra_rollapply2(df),
times=1000L)
print(res)
Unit: milliseconds
expr min lq mean median uq max neval
ra_lapply(df) 104.205800 111.077701 119.316653 113.290395 116.749113 287.685832 1000
ra_rollapply1(df) 4.318322 4.606702 5.140784 4.744533 5.017736 17.593661 1000
ra_sapply(df) 15.383019 16.301282 17.992554 16.738366 18.629451 83.400164 1000
ra_map_dbl(df) 15.418707 16.352354 17.965034 16.823075 18.628220 106.660109 1000
ra_rollapply2(df) 2.629061 2.825758 3.229295 2.926465 3.099371 9.891077 1000