R:如何在集团层面对日内数据进行重采样?

R: how to resample intraday data at the group level?

考虑以下数据框

# Example data: seven intraday observations across two groups,
# including one missing value in group B.
time <- c(
  "2016-04-13 23:07:45", "2016-04-13 23:07:50", "2016-04-13 23:08:45",
  "2016-04-13 23:08:45", "2016-04-13 23:08:45", "2016-04-13 23:07:50",
  "2016-04-13 23:07:51"
)
group <- c("A", "A", "A", "B", "B", "B", "B")
value <- c(5, 10, 2, 2, NA, 1, 4)
df <- data.frame(time, group, value)

> df
                 time group value
1 2016-04-13 23:07:45     A     5
2 2016-04-13 23:07:50     A    10
3 2016-04-13 23:08:45     A     2
4 2016-04-13 23:08:45     B     2
5 2016-04-13 23:08:45     B    NA
6 2016-04-13 23:07:50     B     1
7 2016-04-13 23:07:51     B     4

我想按 5 秒(5 seconds)粒度、按组(group level)对这个数据框重新采样,并计算每个(时间区间, 组)组合的 value 总和。

区间应该是左闭右开。例如,输出的第一行应该是

2016-04-13 23:07:45 A 5 因为第一个 5 秒间隔是 [2016-04-13 23:07:45, 2016-04-13 23:07:50[

我如何在 dplyr 或 data.table 中做到这一点?我需要为时间戳导入 lubridate 吗?

我想到的最好的主意 data.table:

library(data.table)
setDT(df)
# For every row (by = 1:nrow(df)) scan the whole table: `lv` keeps rows in
# the same group, `dt` is the signed time difference in seconds, and we sum
# values whose difference falls in [0, 5) — left-closed, right-open — with
# NA removed.  NOTE: this is O(n^2); fine for a toy example only.
# The print(dt) is debug output showing each row's difftime vector.
df[, result:={lv=df$group==group; dt=difftime( df$time, time, units="sec"); print(dt); sum(df$value[lv & dt >= 0 & dt < 5],na.rm=TRUE)},by=1:nrow(df)]

输出:

                  time group value result
1: 2016-04-13 23:07:45     A     5      5
2: 2016-04-13 23:07:50     A    10     10
3: 2016-04-13 23:08:45     A     2      2
4: 2016-04-13 23:08:45     B     2      2
5: 2016-04-13 23:08:45     B    NA      2
6: 2016-04-13 23:07:50     B     1      5
7: 2016-04-13 23:07:51     B     4      4

j部分详情:

# Breakdown of the j expression (matches the actual code above — the earlier
# version wrapped difftime in abs(), which would make `dt >= 0` always true
# and break the left-closed interval check, and omitted na.rm = TRUE):
lv=df$group==group # Logical vector: rows belonging to the current row's group
dt=difftime( df$time, time, units="sec") # Signed time difference in seconds between every row and the current row
sum(df$value[lv & dt >= 0 & dt < 5],na.rm=TRUE) # Sum values in the same group whose difference is in [0, 5) seconds, 0 included, 5 excluded, ignoring NA

result:={} 允许我们将结果创建为函数调用。 by=1:nrow(df) 使其逐行工作。

并过滤结果以仅获取起点:

> df[,.SD[!duplicated(result)],by=group]
   group                time value result
1:     A 2016-04-13 23:07:45     5      5
2:     A 2016-04-13 23:07:50    10     10
3:     A 2016-04-13 23:08:45     2      2
4:     B 2016-04-13 23:08:45     2      2
5:     B 2016-04-13 23:07:50     1      5
6:     B 2016-04-13 23:07:51     4      4

最新版本 (1.9.8+) data.table:

library(data.table)

# convert to data.table, parse the character timestamps, and precompute each
# row's interval end (time + 5 seconds) for the non-equi join below
setDT(df)
df[, time := as.POSIXct(time)][, time.5s := time + 5]

# Non-equi self-join (data.table >= 1.9.8): for each row i of the outer df,
# match rows with the same group whose time lies in [i.time, i.time + 5s),
# then sum the matched values per outer row (by = .EACHI), ignoring NA.
# $V1 extracts the unnamed sum column from the join result.
df[, newval := df[df, on = .(group, time < time.5s, time >= time),
                  sum(value, na.rm = T), by = .EACHI]$V1]
df
#                  time group value             time.5s newval
#1: 2016-04-13 23:07:45     A     5 2016-04-13 23:07:50      5
#2: 2016-04-13 23:07:50     A    10 2016-04-13 23:07:55     10
#3: 2016-04-13 23:08:45     A     2 2016-04-13 23:08:50      2
#4: 2016-04-13 23:08:45     B     2 2016-04-13 23:08:50      2
#5: 2016-04-13 23:08:45     B    NA 2016-04-13 23:08:50      2
#6: 2016-04-13 23:07:50     B     1 2016-04-13 23:07:55      5
#7: 2016-04-13 23:07:51     B     4 2016-04-13 23:07:56      4

如果您愿意为每个组使用单独的数据对象,您可以使用 xts(每个组一个 xts 对象)而不是 data.table 来解决您的问题。xts 的 period.apply 将自动处理左闭右开的区间(这对于将金融报价数据聚合到 bar 频率非常有帮助——连续的 bars/intervals 不会被重复计算):

# Sample data for the xts approach.  Note the second timestamp differs from
# the original question's data (23:07:55 instead of 23:07:50).
time <- c(
  "2016-04-13 23:07:45", "2016-04-13 23:07:55", "2016-04-13 23:08:45",
  "2016-04-13 23:08:45", "2016-04-13 23:08:45", "2016-04-13 23:07:50",
  "2016-04-13 23:07:51"
)
group <- c("A", "A", "A", "B", "B", "B", "B")

value <- c(5, 10, 2, 2, NA, 1, 4)
df <- data.frame(time, group, value)

library(quantmod)
library(lubridate)
# Parse the character timestamps into POSIXct.
# (Was `df$time = ...`; use `<-` for assignment per R style conventions.)
df$time <- ymd_hms(df$time)

# In this example, model group B object: (You can easily generalise this with a loop or lapply over multiple groups)
df_grp <- df[df$group == "B", ]
# Build an xts time series of the group's values indexed by timestamp.
x.df_grp <- xts(df_grp$value, order.by = df_grp$time) 
# endpoints() returns the index of the last observation inside each
# 5-second bucket, defining the interval boundaries for period.apply.
ep <- endpoints(x.df_grp, on = "seconds", k = 5)
# You can replace sum by any useful function.  Pass in extra arguments to period.apply that correspond to FUN, here na.rm = T, to avoid having sum returning NA in your group B row:
x.df_grp_5sec <- period.apply(x.df_grp, ep, FUN = sum, na.rm = TRUE)
# Align timestamps to end of each 5 sec interval by default (helps avoid lookforward bias when merging time series data on different time frequencies):
x.df_grp_5sec <- align.time(x.df_grp_5sec, 5)
# Now shift back by 5 seconds to record timestamps at the start of each
# 5 sec interval instead:
.index(x.df_grp_5sec) <- .index(x.df_grp_5sec) - 5

#result:
> x.df_grp_5sec
                    [,1]
2016-04-13 23:07:50    5
2016-04-13 23:08:45    2

这个怎么样:

library(dplyr)
# Resample a data frame to 5-second buckets per group and sum `value`.
#
# @param myDf data.frame with columns `time` (character datetime),
#   `group`, and `value`.
# @return A grouped tibble with one row per (group, 5-second bucket) and
#   the per-bucket sum of `value` (NAs removed).
#
# Fix: the original called ymd_hms()/floor_date() without loading
# lubridate (only dplyr is attached above); namespace them explicitly.
Group5 <- function(myDf) {
    myDf$time <- lubridate::ymd_hms(myDf$time)
    # floor_date gives the left-closed bucket start, so each interval is
    # [t, t + 5) as required.
    myDf$timeGroup <- lubridate::floor_date(myDf$time, unit = "5 seconds")
    myDf %>%
        group_by(group, timeGroup) %>%
        summarise(sum(value, na.rm = TRUE))
}

Group5(df)
Source: local data frame [5 x 3]
Groups: group [?]

   group           timeGroup `sum(value, na.rm = TRUE)`
  <fctr>              <dttm>                      <dbl>
1      A 2016-04-13 23:07:45                          5
2      A 2016-04-13 23:07:50                         10
3      A 2016-04-13 23:08:45                          2
4      B 2016-04-13 23:07:50                          5
5      B 2016-04-13 23:08:45                          2

它利用 lubridate 中的 floor_date 和 ymd_hms 将每个日期时间放入正确的组时间中。

这是一个更奇特的例子:

# Larger stress-test data set: 100,000 rows across 20 groups.
# NOTE: statement order matters for reproducibility — each sample() call
# consumes the RNG stream seeded by set.seed(500).
set.seed(500)
# Random timestamps within +/- 1000 seconds of the anchor time.
time <- ymd_hms('2016-04-13 23:07:45') + sample(-10^3:10^3, 10^5, replace=TRUE)
# 20 groups (A..T) of 5000 rows each.
group <- rep(LETTERS[1:20], each = 5000)
# Start all-NA, then fill 95,000 randomly chosen positions with 1..100.
value <- rep(NA, 10^5)
value[sample(10^5, 95000)] <- sample(100, 95000, replace=TRUE)
df2 <- data.frame(time,group,value)

head(df2)
                 time group value
1 2016-04-13 23:18:53     A    53
2 2016-04-13 23:15:15     A    NA
3 2016-04-13 23:23:36     A    40
4 2016-04-13 23:06:40     A    23
5 2016-04-13 23:18:10     A    74
6 2016-04-13 22:57:56     A    65

调用它我们有:

Group5(df2)
Source: local data frame [8,020 x 3]
Groups: group [?]

    group           timeGroup `sum(value, na.rm = TRUE)`
   <fctr>              <dttm>                      <int>
1       A 2016-04-13 22:51:05                        379
2       A 2016-04-13 22:51:10                        646
3       A 2016-04-13 22:51:15                        391
4       A 2016-04-13 22:51:20                       1118
5       A 2016-04-13 22:51:25                        745
6       A 2016-04-13 22:51:30                        546
7       A 2016-04-13 22:51:35                        884
8       A 2016-04-13 22:51:40                        711
9       A 2016-04-13 22:51:45                        526
10      A 2016-04-13 22:51:50                        484
# ... with 8,010 more rows