快速矩阵子集计算
Fast matrix subset computation
我有一个大约一百万行的数据集,按 id、开始(一些 id 有多个起点)和年份排序,我想计算这两个数据的 5 年平均值(从 start-5 开始)每个 id 中的变量(var1 和 var2)。
例如,var1 中的 5 年平均值为 243.2=(47+99+1000+60+10)/5 和 46=(133+13+88-50)/4(4 年由于数据范围限制的平均值)分别为 id==1 和 id==2。是否有以下代码的快速替代方案?
示例数据:
id start year var1 var2
1 2005 2000 500 333
1 2005 2001 10 444
1 2005 2002 60 555
1 2005 2003 1000 99
1 2005 2004 99 15
1 2005 2005 47 0
1 2005 2006 180 NA
2 2003 2000 -50 NA
2 2003 2001 88 17
2 2003 2002 13 77
2 2003 2003 133 55
2 2003 2004 86 30
2 2003 2005 10 100
代码:
# Find startpoint per id
idx <- which(year==start)
# Compute
sapply(idx, function(x){
with( dat, c(id[x],
start[x],
mean( var1[id==id[x] & (year>=max(2000,year[x]-4) & year<=year[x])], na.rm=T ),
mean( var2[id==id[x] & (year>=max(2000,year[x]-4) & year<=year[x])], na.rm=T )) )
})
根据以下已接受的解决方案调整版本:
data <- setDT(data)[, .(var1_avg5 = mean(var1[year > start-5 & year <= start], na.rm = T),
var2_avg5 = mean(var2[year > start-5 & year <= start], na.rm = T),
start,
year),
by=id]
考虑通过tapply
传递的zoo
中的rollmean
(著名的时间序列包):
library(zoo)
...
df$var1_five_yr_avg <- with(df, unlist(tapply(var1, id, function(x) rollmeanr(x, k=5, fill=NA))))
df$var2_five_yr_avg <- with(df, unlist(tapply(var2, id, function(x) rollmeanr(x, k=5, fill=NA))))
df
# id start year var1 var2 var1_five_yr_avg var2_five_yr_avg
# 1 1 2005 2000 500 333 NA NA
# 2 1 2005 2001 10 444 NA NA
# 3 1 2005 2002 60 555 NA NA
# 4 1 2005 2003 1000 99 NA NA
# 5 1 2005 2004 99 15 333.8 289.2
# 6 1 2005 2005 47 0 243.2 222.6
# 7 1 2005 2006 180 NA 277.2 NA
# 8 2 2003 2000 -50 NA NA NA
# 9 2 2003 2001 88 17 NA NA
# 10 2 2003 2002 13 77 NA NA
# 11 2 2003 2003 133 55 NA NA
# 12 2 2003 2004 86 30 54.0 NA
# 13 2 2003 2005 10 100 66.0 55.8
但是,您表示更动态地需要 运行 多种滚动方式,具体取决于数据可用性。因此,考虑 运行 具有 ifelse
逻辑的多重滚动方式。
proc_rollmeans <- function(x) {
five_yr <- rollmeanr(x, k=5, fill=NA)
four_yr <- rollmeanr(x, k=4, fill=NA)
three_yr <- rollmeanr(x, k=3, fill=NA)
two_yr <- rollmeanr(x, k=2, fill=NA)
one_yr <- x
ifelse(!is.na(five_yr), five_yr,
ifelse(!is.na(four_yr), four_yr,
ifelse(!is.na(three_yr), three_yr,
ifelse(!is.na(two_yr), two_yr, one_yr)
)
)
)
}
df$var1_five_yr_avg <- with(df, unlist(tapply(var1, id, proc_rollmeans)))
df$var2_five_yr_avg <- with(df, unlist(tapply(var2, id, proc_rollmeans)))
df
# id start year var1 var2 var1_five_yr_avg var2_five_yr_avg
# 1 1 2005 2000 500 333 500.0 333.00000
# 2 1 2005 2001 10 444 255.0 388.50000
# 3 1 2005 2002 60 555 190.0 444.00000
# 4 1 2005 2003 1000 99 392.5 357.75000
# 5 1 2005 2004 99 15 333.8 289.20000
# 6 1 2005 2005 47 0 243.2 222.60000
# 7 1 2005 2006 180 NA 277.2 NA
# 8 2 2003 2000 -50 NA -50.0 NA
# 9 2 2003 2001 88 17 19.0 17.00000
# 10 2 2003 2002 13 77 17.0 47.00000
# 11 2 2003 2003 133 55 46.0 49.66667
# 12 2 2003 2004 86 30 54.0 44.75000
# 13 2 2003 2005 10 100 66.0 55.80000
这是你想要的吗?
library(data.table)
# data simulation
n = 7e6
data = data.table(
id = sample(seq(1,n / 7), n, replace = TRUE),
year = sample(seq(2000, 2010), n, replace = TRUE),
var1 = rnorm(n),
var2 = rexp(n)
)
data[, start := max(year) - sample(c(1,2), 1), id]
# calculation
t1 = Sys.time()
data = data[year > start - 5 & year <= start]
data[, .(var1 = mean(var1, na.rm = T),
var2 = mean(var2, na.rm = T)), id]
t2 = Sys.time()
print(t2 - t1)
Time difference of 0.511766 secs
我有一个大约一百万行的数据集,按 id、开始(一些 id 有多个起点)和年份排序,我想计算这两个数据的 5 年平均值(从 start-5 开始)每个 id 中的变量(var1 和 var2)。
例如,var1 中的 5 年平均值为 243.2=(47+99+1000+60+10)/5 和 46=(133+13+88-50)/4(4 年由于数据范围限制的平均值)分别为 id==1 和 id==2。是否有以下代码的快速替代方案?
示例数据:
id start year var1 var2
1 2005 2000 500 333
1 2005 2001 10 444
1 2005 2002 60 555
1 2005 2003 1000 99
1 2005 2004 99 15
1 2005 2005 47 0
1 2005 2006 180 NA
2 2003 2000 -50 NA
2 2003 2001 88 17
2 2003 2002 13 77
2 2003 2003 133 55
2 2003 2004 86 30
2 2003 2005 10 100
代码:
# Find startpoint per id
idx <- which(year==start)
# Compute
sapply(idx, function(x){
with( dat, c(id[x],
start[x],
mean( var1[id==id[x] & (year>=max(2000,year[x]-4) & year<=year[x])], na.rm=T ),
mean( var2[id==id[x] & (year>=max(2000,year[x]-4) & year<=year[x])], na.rm=T )) )
})
根据以下已接受的解决方案调整版本:
data <- setDT(data)[, .(var1_avg5 = mean(var1[year > start-5 & year <= start], na.rm = T),
var2_avg5 = mean(var2[year > start-5 & year <= start], na.rm = T),
start,
year),
by=id]
考虑通过tapply
传递的zoo
中的rollmean
(著名的时间序列包):
library(zoo)
...
df$var1_five_yr_avg <- with(df, unlist(tapply(var1, id, function(x) rollmeanr(x, k=5, fill=NA))))
df$var2_five_yr_avg <- with(df, unlist(tapply(var2, id, function(x) rollmeanr(x, k=5, fill=NA))))
df
# id start year var1 var2 var1_five_yr_avg var2_five_yr_avg
# 1 1 2005 2000 500 333 NA NA
# 2 1 2005 2001 10 444 NA NA
# 3 1 2005 2002 60 555 NA NA
# 4 1 2005 2003 1000 99 NA NA
# 5 1 2005 2004 99 15 333.8 289.2
# 6 1 2005 2005 47 0 243.2 222.6
# 7 1 2005 2006 180 NA 277.2 NA
# 8 2 2003 2000 -50 NA NA NA
# 9 2 2003 2001 88 17 NA NA
# 10 2 2003 2002 13 77 NA NA
# 11 2 2003 2003 133 55 NA NA
# 12 2 2003 2004 86 30 54.0 NA
# 13 2 2003 2005 10 100 66.0 55.8
但是,您表示更动态地需要 运行 多种滚动方式,具体取决于数据可用性。因此,考虑 运行 具有 ifelse
逻辑的多重滚动方式。
proc_rollmeans <- function(x) {
five_yr <- rollmeanr(x, k=5, fill=NA)
four_yr <- rollmeanr(x, k=4, fill=NA)
three_yr <- rollmeanr(x, k=3, fill=NA)
two_yr <- rollmeanr(x, k=2, fill=NA)
one_yr <- x
ifelse(!is.na(five_yr), five_yr,
ifelse(!is.na(four_yr), four_yr,
ifelse(!is.na(three_yr), three_yr,
ifelse(!is.na(two_yr), two_yr, one_yr)
)
)
)
}
df$var1_five_yr_avg <- with(df, unlist(tapply(var1, id, proc_rollmeans)))
df$var2_five_yr_avg <- with(df, unlist(tapply(var2, id, proc_rollmeans)))
df
# id start year var1 var2 var1_five_yr_avg var2_five_yr_avg
# 1 1 2005 2000 500 333 500.0 333.00000
# 2 1 2005 2001 10 444 255.0 388.50000
# 3 1 2005 2002 60 555 190.0 444.00000
# 4 1 2005 2003 1000 99 392.5 357.75000
# 5 1 2005 2004 99 15 333.8 289.20000
# 6 1 2005 2005 47 0 243.2 222.60000
# 7 1 2005 2006 180 NA 277.2 NA
# 8 2 2003 2000 -50 NA -50.0 NA
# 9 2 2003 2001 88 17 19.0 17.00000
# 10 2 2003 2002 13 77 17.0 47.00000
# 11 2 2003 2003 133 55 46.0 49.66667
# 12 2 2003 2004 86 30 54.0 44.75000
# 13 2 2003 2005 10 100 66.0 55.80000
这是你想要的吗?
library(data.table)
# data simulation
n = 7e6
data = data.table(
id = sample(seq(1,n / 7), n, replace = TRUE),
year = sample(seq(2000, 2010), n, replace = TRUE),
var1 = rnorm(n),
var2 = rexp(n)
)
data[, start := max(year) - sample(c(1,2), 1), id]
# calculation
t1 = Sys.time()
data = data[year > start - 5 & year <= start]
data[, .(var1 = mean(var1, na.rm = T),
var2 = mean(var2, na.rm = T)), id]
t2 = Sys.time()
print(t2 - t1)
Time difference of 0.511766 secs