Summary DataFrame:在特定时间戳按不同类别及全天整体聚合数据

Summary DataFrame: Aggregating data at specific timestamps for different categories and the day overall

我有一个每天更新的大型数据框,其中包含以下内容:

structure(list(date = structure(c(19038, 19038, 19038, 19038, 
19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038, 
19038, 19038), class = "Date"), type = c("USD", "USD", "USD", 
"USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD", 
"USD", "USD", "USD"), quantity = c(0.035, 0.011, 0.02, 0.02, 
0.019, 0.036, 0.001, 0.003, 0.004, 0.065, 0.002, 0.001, 0.002, 
0.002, 0.013), cumvol = c(0.035, 0.046, 0.066, 0.086, 0.105, 
0.141, 0.142, 0.145, 0.149, 0.214, 0.216, 0.217, 0.219, 0.221, 
0.234), time = structure(c(24422, 24430, 24453, 24463, 24468, 
24476, 24485, 24583, 24790, 24862, 24950, 24953, 25047, 25085, 
25085), units = "secs", class = c("hms", "difftime"))), row.names = c(NA, 
-15L), class = c("data.table", "data.frame"))

这是用于尝试实现结果的代码(由 Dan Adams 提供帮助)

 times_of_interest <- c(7, 9, 11)  # hours of the day to keep in the summary

# For every date, type and hour of interest, total the quantity (`cat_total`),
# then attach the per-date sum of those totals (`date_total`).
d %>%
  # Round-trip every column through character, then restore each column's type.
  mutate(across(everything(), as.character)) %>%
  mutate(across(quantity:cumvol, as.numeric)) %>%
  mutate(type = factor(type)) %>%
  mutate(date = ymd(date)) %>%
  mutate(time = as_hms(time)) %>%
  # Randomly shifts each row's date by 0 or 1 day (presumably to simulate
  # multi-day data); results vary between runs unless set.seed() is called
  # first. `TRUE` is spelled out because `T` is an ordinary, reassignable name.
  mutate(date = date + sample(0:1, n(), replace = TRUE)) %>%
  mutate(time_hr = hour(time)) %>%
  # Hours that never occur in the data simply produce no rows.
  filter(time_hr %in% times_of_interest) %>%
  group_by(date, type, time_hr) %>%
  summarize(cat_total = sum(quantity), .groups = "drop") %>%
  group_by(date) %>%
  mutate(date_total = sum(cat_total)) %>%
  ungroup()

输出结果如下:

structure(list(date = structure(c(19034, 19034, 19034, 19034, 
19035, 19035, 19035, 19035, 19035, 19035, 19035, 19035, 19037, 
19037, 19037, 19037, 19037, 19037, 19037, 19037, 19037, 19038, 
19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038, 19039, 
19039, 19039), class = "Date"), type = structure(c(3L, 5L, 5L, 
5L, 2L, 2L, 3L, 3L, 3L, 5L, 5L, 5L, 2L, 2L, 3L, 3L, 3L, 1L, 5L, 
5L, 5L, 2L, 2L, 2L, 3L, 3L, 3L, 5L, 5L, 5L, 2L, 3L, 5L), .Label = c("DNB", 
"UKY", "UKS", "T/N cl DBV", "USD"), class = "factor"), time_hr = c(11L, 
7L, 9L, 11L, 9L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 9L, 11L, 7L, 
9L, 11L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 7L, 9L, 
11L, 7L, 7L, 7L), cat_total = c(0.739, 8.714, 3.281, 3.263, 0.815, 
1.025, 2.071, 0.661, 0.328, 5.633, 0.776, 2.126, 0.465, 0.15, 
2.046, 1.203, 0.137, 0.058, 2.341, 4.215, 1.705, 0.01, 0.335, 
0.15, 4.323, 1.157, 0.031, 8.607, 3.624, 2.603, 0.865, 1.599, 
5.721), date_total = c(9893.441, 9893.441, 9893.441, 9893.441, 
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 
9893.441)), row.names = c(NA, -33L), class = c("tbl_df", "tbl", 
"data.frame"))

我想修改这个汇总表,使其给出每天在某些特定时间点(例如 12:00、15:00 和 16:00)按类型(以及总体)的总量。

不幸的是,不知为何,该数据框的输出始终只包含两个时间(times_of_interest 向量中的前两个)。

str 输出为:

Classes ‘data.table’ and 'data.frame':  15 obs. of  5 variables:
 $ date    : Date, format: "2022-02-15" "2022-02-15" "2022-02-15" "2022-02-15" ...
 $ type    : chr  "USD" "USD" "USD" "USD" ...
 $ quantity: num  0.035 0.011 0.02 0.02 0.019 0.036 0.001 0.003 0.004 0.065 ...
 $ cumvol  : num  0.035 0.046 0.066 0.086 0.105 0.141 0.142 0.145 0.149 0.214 ...
 $ time    : 'hms' num  06:47:02 06:47:10 06:47:33 06:47:43 ...
  ..- attr(*, "units")= chr "secs"
 - attr(*, ".internal.selfref")=<externalptr> 

这是一个使用 {tidyverse} 包的解决方案。最简单的做法是把您关心的时间戳列表放在一个单独的向量中。我假设您只关心小时;如果需要更细的粒度(例如指定 HH:MM 甚至 HH:MM:SS),可以相应修改代码。之后只需依次调用 group_by() 和 summarize(),再用 mutate() 添加一个总计列。

一个问题是,您共享的示例数据似乎不包含 times_of_interest 中的任何时间,因此汇总结果为空。同样,您共享的只是单个日期的数据。在这些情况下,最终的汇总结果可能不符合您的预期。因此,我首先只查看唯一的 date/time 组合,以确保我们对输出的样子有一个合理的预期。您也可以使用 table() 来完成这一步。

除了这些问题,下面的代码似乎仍然会根据您共享的数据产生预期的结果。

library(tidyverse)
library(data.table)
library(lubridate)
library(hms)

# Sample data: 15 rows, all on one date (Date 19038 = 2022-02-15) and all of
# type "USD"; `time` is an hms/difftime column holding seconds since midnight.
d <- structure(
  list(
    date = structure(rep(19038, 15), class = "Date"),
    type = rep("USD", 15),
    quantity = c(
      0.035, 0.011, 0.02, 0.02, 0.019, 0.036, 0.001, 0.003,
      0.004, 0.065, 0.002, 0.001, 0.002, 0.002, 0.013
    ),
    cumvol = c(
      0.035, 0.046, 0.066, 0.086, 0.105, 0.141, 0.142, 0.145,
      0.149, 0.214, 0.216, 0.217, 0.219, 0.221, 0.234
    ),
    time = structure(
      c(
        24422, 24430, 24453, 24463, 24468, 24476, 24485, 24583,
        24790, 24862, 24950, 24953, 25047, 25085, 25085
      ),
      units = "secs",
      class = c("hms", "difftime")
    )
  ),
  row.names = c(NA, -15L),
  class = c("data.table", "data.frame")
)

# List the distinct (date, hour-of-day) pairs present in `d`
d %>%
  distinct(date, time_hr = hour(time))
#>          date time_hr
#> 1: 2022-02-15       6
## only one unique date and time_hr in sample data so summary should have 1 line

# Hours of the day to summarise. An hour that never occurs in `d` contributes
# no rows, so pick hours that are actually present in the data.
times_of_interest <- c(6)

# Total quantity per date/type/hour, plus the per-date sum of those totals.
d %>%
  filter(hour(time) %in% times_of_interest) %>%
  group_by(date, type, time_hr = hour(time)) %>%
  summarise(cat_total = sum(quantity), .groups = "drop") %>%
  group_by(date) %>%
  mutate(date_total = sum(cat_total)) %>%
  ungroup()
#> # A tibble: 1 x 5
#>   date       type  time_hr cat_total date_total
#>   <date>     <chr>   <int>     <dbl>      <dbl>
#> 1 2022-02-15 USD         6     0.234      0.234

由 reprex 包 (v2.0.1) 于 2022-02-15 创建