Summary DataFrame:聚合不同类别和一天中特定时间戳的数据
Summary DataFrame: Aggregating data at specific timestamps for different categories and the day overall
我有一个每天更新的大型数据框,其中包含以下内容:
structure(list(date = structure(c(19038, 19038, 19038, 19038,
19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038,
19038, 19038), class = "Date"), type = c("USD", "USD", "USD",
"USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",
"USD", "USD", "USD"), quantity = c(0.035, 0.011, 0.02, 0.02,
0.019, 0.036, 0.001, 0.003, 0.004, 0.065, 0.002, 0.001, 0.002,
0.002, 0.013), cumvol = c(0.035, 0.046, 0.066, 0.086, 0.105,
0.141, 0.142, 0.145, 0.149, 0.214, 0.216, 0.217, 0.219, 0.221,
0.234), time = structure(c(24422, 24430, 24453, 24463, 24468,
24476, 24485, 24583, 24790, 24862, 24950, 24953, 25047, 25085,
25085), units = "secs", class = c("hms", "difftime"))), row.names = c(NA,
-15L), class = c("data.table", "data.frame"))
这是用于尝试实现结果的代码(由 Dan Adams 提供帮助)
# Hours (as returned by hour()) that should be kept in the summary.
times_of_interest <- c(7, 9, 11)

d %>%
  # Convert every column to character, then restore the intended types.
  mutate(across(everything(), as.character)) %>%
  mutate(across(quantity:cumvol, as.numeric)) %>%
  mutate(type = factor(type)) %>%
  mutate(date = ymd(date)) %>%
  mutate(time = as_hms(time)) %>%
  # Add 0 or 1 day to each row at random (unseeded, so results vary between
  # runs). `replace = TRUE` is spelled out because `T` is an ordinary variable
  # that can be reassigned, unlike the reserved word `TRUE`.
  mutate(date = date + sample(0:1, nrow(.), replace = TRUE)) %>%
  mutate(time_hr = hour(time)) %>%
  # Keep only the rows that fall in one of the requested hours.
  filter(time_hr %in% times_of_interest) %>%
  # Total quantity per date, type, and hour.
  group_by(date, type, time_hr) %>%
  summarize(cat_total = sum(quantity), .groups = "drop") %>%
  # Attach the per-date total of the category totals to every row.
  group_by(date) %>%
  mutate(date_total = sum(cat_total)) %>%
  ungroup()
输出结果如下:
structure(list(date = structure(c(19034, 19034, 19034, 19034,
19035, 19035, 19035, 19035, 19035, 19035, 19035, 19035, 19037,
19037, 19037, 19037, 19037, 19037, 19037, 19037, 19037, 19038,
19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038, 19039,
19039, 19039), class = "Date"), type = structure(c(3L, 5L, 5L,
5L, 2L, 2L, 3L, 3L, 3L, 5L, 5L, 5L, 2L, 2L, 3L, 3L, 3L, 1L, 5L,
5L, 5L, 2L, 2L, 2L, 3L, 3L, 3L, 5L, 5L, 5L, 2L, 3L, 5L), .Label = c("DNB",
"UKY", "UKS", "T/N cl DBV", "USD"), class = "factor"), time_hr = c(11L,
7L, 9L, 11L, 9L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 9L, 11L, 7L,
9L, 11L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 7L, 9L,
11L, 7L, 7L, 7L), cat_total = c(0.739, 8.714, 3.281, 3.263, 0.815,
1.025, 2.071, 0.661, 0.328, 5.633, 0.776, 2.126, 0.465, 0.15,
2.046, 1.203, 0.137, 0.058, 2.341, 4.215, 1.705, 0.01, 0.335,
0.15, 4.323, 1.157, 0.031, 8.607, 3.624, 2.603, 0.865, 1.599,
5.721), date_total = c(9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441)), row.names = c(NA, -33L), class = c("tbl_df", "tbl",
"data.frame"))
我想做的是修改这个汇总表,使其给出某些特定时间点上各类型(以及总体)的总量;例如每天的 12:00、15:00 和 16:00。
不幸的是,不知为何,这个数据框的输出始终只包含两个时间点(times_of_interest 向量中的前两个)。
str 输出为:
Classes ‘data.table’ and 'data.frame': 15 obs. of 5 variables:
$ date : Date, format: "2022-02-15" "2022-02-15" "2022-02-15" "2022-02-15" ...
$ type : chr "USD" "USD" "USD" "USD" ...
$ quantity: num 0.035 0.011 0.02 0.02 0.019 0.036 0.001 0.003 0.004 0.065 ...
$ cumvol : num 0.035 0.046 0.066 0.086 0.105 0.141 0.142 0.145 0.149 0.214 ...
$ time : 'hms' num 06:47:02 06:47:10 06:47:33 06:47:43 ...
..- attr(*, "units")= chr "secs"
- attr(*, ".internal.selfref")=<externalptr>
这是一个使用 {tidyverse} 包的解决方案。最简单的做法是在一个单独的向量中指定您关心的时间戳列表。我假设您只关心小时,但如果您想要更精细的粒度(例如指定 HH:MM 甚至 HH:MM:SS),可以相应修改代码。剩下的就是先 `group_by()`、`summarize()`,然后用 `mutate()` 添加一个总计列。
一个问题是,您共享的示例数据似乎不包含 times_of_interest 中的任何时间,因此汇总结果为空。同样,您共享的只是单个日期的数据。在这些情况下,最终的汇总可能不符合您的预期。因此,我首先查看了数据中唯一的日期/时间组合,以确保我们对输出的样子有合理的预期。您也可以使用 `table()` 来完成这一检查。
除了这些问题,下面的代码似乎仍然会根据您共享的数据产生预期的结果。
library(tidyverse)
library(data.table)
library(lubridate)
library(hms)
# Sample data from the question: 15 rows, all of type "USD" on a single date,
# with `time` stored as an hms value in seconds.
d <- structure(
  list(
    date = structure(rep(19038, 15), class = "Date"),
    type = rep("USD", 15),
    quantity = c(
      0.035, 0.011, 0.02, 0.02, 0.019, 0.036, 0.001, 0.003,
      0.004, 0.065, 0.002, 0.001, 0.002, 0.002, 0.013
    ),
    cumvol = c(
      0.035, 0.046, 0.066, 0.086, 0.105, 0.141, 0.142, 0.145,
      0.149, 0.214, 0.216, 0.217, 0.219, 0.221, 0.234
    ),
    time = structure(
      c(
        24422, 24430, 24453, 24463, 24468, 24476, 24485, 24583,
        24790, 24862, 24950, 24953, 25047, 25085, 25085
      ),
      units = "secs",
      class = c("hms", "difftime")
    )
  ),
  row.names = c(NA, -15L),
  class = c("data.table", "data.frame")
)
# List the distinct (date, hour) pairs present in the sample data.
d %>%
  distinct(date, time_hr = hour(time))
#> date time_hr
#> 1: 2022-02-15 6
## The sample data holds a single date/hour pair, so the summary should
## contain exactly one row.
# Hours of the day to summarise. An hour that never occurs in `d` contributes
# no rows to the result.
times_of_interest <- c(6)

d %>%
  # Keep only the rows whose hour is one of the requested ones.
  filter(hour(time) %in% times_of_interest) %>%
  # Total quantity per date, type, and hour.
  group_by(date, type, time_hr = hour(time)) %>%
  summarise(cat_total = sum(quantity), .groups = "drop") %>%
  # Attach the per-date total of the category totals to every row.
  group_by(date) %>%
  mutate(date_total = sum(cat_total)) %>%
  ungroup()
#> # A tibble: 1 x 5
#> date type time_hr cat_total date_total
#> <date> <chr> <int> <dbl> <dbl>
#> 1 2022-02-15 USD 6 0.234 0.234
由 reprex 包 (v2.0.1) 于 2022-02-15 创建
我有一个每天更新的大型数据框,其中包含以下内容:
structure(list(date = structure(c(19038, 19038, 19038, 19038,
19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038,
19038, 19038), class = "Date"), type = c("USD", "USD", "USD",
"USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",
"USD", "USD", "USD"), quantity = c(0.035, 0.011, 0.02, 0.02,
0.019, 0.036, 0.001, 0.003, 0.004, 0.065, 0.002, 0.001, 0.002,
0.002, 0.013), cumvol = c(0.035, 0.046, 0.066, 0.086, 0.105,
0.141, 0.142, 0.145, 0.149, 0.214, 0.216, 0.217, 0.219, 0.221,
0.234), time = structure(c(24422, 24430, 24453, 24463, 24468,
24476, 24485, 24583, 24790, 24862, 24950, 24953, 25047, 25085,
25085), units = "secs", class = c("hms", "difftime"))), row.names = c(NA,
-15L), class = c("data.table", "data.frame"))
这是用于尝试实现结果的代码(由 Dan Adams 提供帮助)
# Hours (as returned by hour()) that should be kept in the summary.
times_of_interest <- c(7, 9, 11)

d %>%
  # Convert every column to character, then restore the intended types.
  mutate(across(everything(), as.character)) %>%
  mutate(across(quantity:cumvol, as.numeric)) %>%
  mutate(type = factor(type)) %>%
  mutate(date = ymd(date)) %>%
  mutate(time = as_hms(time)) %>%
  # Add 0 or 1 day to each row at random (unseeded, so results vary between
  # runs). `replace = TRUE` is spelled out because `T` is an ordinary variable
  # that can be reassigned, unlike the reserved word `TRUE`.
  mutate(date = date + sample(0:1, nrow(.), replace = TRUE)) %>%
  mutate(time_hr = hour(time)) %>%
  # Keep only the rows that fall in one of the requested hours.
  filter(time_hr %in% times_of_interest) %>%
  # Total quantity per date, type, and hour.
  group_by(date, type, time_hr) %>%
  summarize(cat_total = sum(quantity), .groups = "drop") %>%
  # Attach the per-date total of the category totals to every row.
  group_by(date) %>%
  mutate(date_total = sum(cat_total)) %>%
  ungroup()
输出结果如下:
structure(list(date = structure(c(19034, 19034, 19034, 19034,
19035, 19035, 19035, 19035, 19035, 19035, 19035, 19035, 19037,
19037, 19037, 19037, 19037, 19037, 19037, 19037, 19037, 19038,
19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038, 19039,
19039, 19039), class = "Date"), type = structure(c(3L, 5L, 5L,
5L, 2L, 2L, 3L, 3L, 3L, 5L, 5L, 5L, 2L, 2L, 3L, 3L, 3L, 1L, 5L,
5L, 5L, 2L, 2L, 2L, 3L, 3L, 3L, 5L, 5L, 5L, 2L, 3L, 5L), .Label = c("DNB",
"UKY", "UKS", "T/N cl DBV", "USD"), class = "factor"), time_hr = c(11L,
7L, 9L, 11L, 9L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 9L, 11L, 7L,
9L, 11L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 7L, 9L,
11L, 7L, 7L, 7L), cat_total = c(0.739, 8.714, 3.281, 3.263, 0.815,
1.025, 2.071, 0.661, 0.328, 5.633, 0.776, 2.126, 0.465, 0.15,
2.046, 1.203, 0.137, 0.058, 2.341, 4.215, 1.705, 0.01, 0.335,
0.15, 4.323, 1.157, 0.031, 8.607, 3.624, 2.603, 0.865, 1.599,
5.721), date_total = c(9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441)), row.names = c(NA, -33L), class = c("tbl_df", "tbl",
"data.frame"))
我想做的是修改这个汇总表,使其给出某些特定时间点上各类型(以及总体)的总量;例如每天的 12:00、15:00 和 16:00。
不幸的是,不知为何,这个数据框的输出始终只包含两个时间点(times_of_interest 向量中的前两个)。
str 输出为:
Classes ‘data.table’ and 'data.frame': 15 obs. of 5 variables:
$ date : Date, format: "2022-02-15" "2022-02-15" "2022-02-15" "2022-02-15" ...
$ type : chr "USD" "USD" "USD" "USD" ...
$ quantity: num 0.035 0.011 0.02 0.02 0.019 0.036 0.001 0.003 0.004 0.065 ...
$ cumvol : num 0.035 0.046 0.066 0.086 0.105 0.141 0.142 0.145 0.149 0.214 ...
$ time : 'hms' num 06:47:02 06:47:10 06:47:33 06:47:43 ...
..- attr(*, "units")= chr "secs"
- attr(*, ".internal.selfref")=<externalptr>
这是一个使用 {tidyverse} 包的解决方案。最简单的做法是在一个单独的向量中指定您关心的时间戳列表。我假设您只关心小时,但如果您想要更精细的粒度(例如指定 HH:MM 甚至 HH:MM:SS),可以相应修改代码。剩下的就是先 `group_by()`、`summarize()`,然后用 `mutate()` 添加一个总计列。
一个问题是,您共享的示例数据似乎不包含 times_of_interest 中的任何时间,因此汇总结果为空。同样,您共享的只是单个日期的数据。在这些情况下,最终的汇总可能不符合您的预期。因此,我首先查看了数据中唯一的日期/时间组合,以确保我们对输出的样子有合理的预期。您也可以使用 `table()` 来完成这一检查。
除了这些问题,下面的代码似乎仍然会根据您共享的数据产生预期的结果。
library(tidyverse)
library(data.table)
library(lubridate)
library(hms)
# Sample data from the question: 15 rows, all of type "USD" on a single date,
# with `time` stored as an hms value in seconds.
d <- structure(
  list(
    date = structure(rep(19038, 15), class = "Date"),
    type = rep("USD", 15),
    quantity = c(
      0.035, 0.011, 0.02, 0.02, 0.019, 0.036, 0.001, 0.003,
      0.004, 0.065, 0.002, 0.001, 0.002, 0.002, 0.013
    ),
    cumvol = c(
      0.035, 0.046, 0.066, 0.086, 0.105, 0.141, 0.142, 0.145,
      0.149, 0.214, 0.216, 0.217, 0.219, 0.221, 0.234
    ),
    time = structure(
      c(
        24422, 24430, 24453, 24463, 24468, 24476, 24485, 24583,
        24790, 24862, 24950, 24953, 25047, 25085, 25085
      ),
      units = "secs",
      class = c("hms", "difftime")
    )
  ),
  row.names = c(NA, -15L),
  class = c("data.table", "data.frame")
)
# List the distinct (date, hour) pairs present in the sample data.
d %>%
  distinct(date, time_hr = hour(time))
#> date time_hr
#> 1: 2022-02-15 6
## The sample data holds a single date/hour pair, so the summary should
## contain exactly one row.
# Hours of the day to summarise. An hour that never occurs in `d` contributes
# no rows to the result.
times_of_interest <- c(6)

d %>%
  # Keep only the rows whose hour is one of the requested ones.
  filter(hour(time) %in% times_of_interest) %>%
  # Total quantity per date, type, and hour.
  group_by(date, type, time_hr = hour(time)) %>%
  summarise(cat_total = sum(quantity), .groups = "drop") %>%
  # Attach the per-date total of the category totals to every row.
  group_by(date) %>%
  mutate(date_total = sum(cat_total)) %>%
  ungroup()
#> # A tibble: 1 x 5
#> date type time_hr cat_total date_total
#> <date> <chr> <int> <dbl> <dbl>
#> 1 2022-02-15 USD 6 0.234 0.234
由 reprex 包 (v2.0.1) 于 2022-02-15 创建