使用进出时间记录每15分钟统计客户
count customers in every 15 minutes using entry exit time record
您好,我有商店顾客进出时间。我想从早上 6 点到晚上 10 点每隔 15 分钟统计一次店内顾客的数量。
谢谢
library(lubridate)
library(tidyverse)
entry_time <- ymd_hms("2021/11/9 6:00:00") + + runif(n=20, min=0, max = 14 * 3600)
exit_time <- entry_time + runif(n=20, min=0, max = 5 * 3600)
# tibble
tbl <- tibble(entry_time, exit_time) %>%
arrange(entry_time) %>%
mutate(customerID = fct_rev(factor(1:20)))
# for visualization ---------------------------------------------------------
tbl_longer <- tbl %>%
pivot_longer(cols = ends_with("time"),
names_to = "time",
values_to = "value")
ggplot(tbl_longer,
aes(x = customerID,
y = value)) +
geom_line(size = 3) +
coord_flip() +
theme_bw() +
theme(aspect.ratio = 0.7)
这里有一个 tidyverse 方法可以做到这一点不是基于范围的连接,但它有一些限制:
- 它与已知访问相符,因此如果您想要特定的时间范围(即,从早上 6 点开始),那么您需要将这些值从外部引入
0
- 它使用
lubridate::floor_date
并将所有 entry/exit 时间截断为之前的 15 分钟间隔;我认为这足以满足您的需求,但请进行一些检查以确保所有客户都已入账
library(dplyr)
library(purrr) # map2
library(lubridate) # floor_date
tbl %>%
mutate(times = purrr::map2(entry_time, exit_time, ~ seq(floor_date(.x, unit="15 mins"), floor_date(.y, unit="15 mins"), by="15 mins"))) %>%
select(times) %>%
unnest(times) %>%
group_by(times) %>%
tally()
# # A tibble: 64 x 2
# times n
# <dttm> <int>
# 1 2021-11-09 07:30:00 1
# 2 2021-11-09 07:45:00 2
# 3 2021-11-09 08:00:00 2
# 4 2021-11-09 08:15:00 2
# 5 2021-11-09 08:30:00 2
# 6 2021-11-09 08:45:00 1
# 7 2021-11-09 09:00:00 1
# 8 2021-11-09 09:15:00 1
# 9 2021-11-09 09:30:00 2
# 10 2021-11-09 09:45:00 2
# # ... with 54 more rows
这也可以通过基于范围的联接来完成。 dplyr
不直接支持,在 dplyr
中进行这些连接的方法往往效率低下;我在这里推荐 sqldf
,尽管 data.table
和 fuzzyjoin
解决方案也存在(参见 Merging two data frames by time range in R)。
可重现的数据:
library(dplyr)
library(lubridate)
set.seed(42)
entry_time <- ymd_hms("2021/11/9 6:00:00") + runif(n=20, min=0, max = 14 * 3600)
exit_time <- entry_time + runif(n=20, min=0, max = 5 * 3600)
tbl <- tibble(entry_time, exit_time) %>%
arrange(entry_time) %>%
mutate(customerID = as.character(1:20))
tbl
# # A tibble: 20 x 3
# entry_time exit_time customerID
# <dttm> <dttm> <chr>
# 1 2021-11-09 07:38:41 2021-11-09 08:40:59 1
# 2 2021-11-09 07:53:07 2021-11-09 12:24:50 2
# 3 2021-11-09 09:34:33 2021-11-09 13:00:06 3
# 4 2021-11-09 10:00:21 2021-11-09 14:57:01 4
# 5 2021-11-09 12:24:30 2021-11-09 16:05:46 5
# 6 2021-11-09 12:28:19 2021-11-09 12:29:30 6
# 7 2021-11-09 12:38:59 2021-11-09 17:10:58 7
# 8 2021-11-09 13:16:02 2021-11-09 15:50:18 8
# 9 2021-11-09 13:50:40 2021-11-09 16:54:12 9
# 10 2021-11-09 14:59:03 2021-11-09 15:23:47 10
# 11 2021-11-09 15:11:52 2021-11-09 17:25:57 11
# 12 2021-11-09 15:52:15 2021-11-09 20:03:03 12
# 13 2021-11-09 16:04:03 2021-11-09 20:07:22 13
# 14 2021-11-09 16:18:44 2021-11-09 18:15:47 14
# 15 2021-11-09 17:37:34 2021-11-09 22:21:34 15
# 16 2021-11-09 18:48:26 2021-11-09 23:19:38 16
# 17 2021-11-09 19:05:07 2021-11-09 21:01:33 17
# 18 2021-11-09 19:07:08 2021-11-09 19:48:45 18
# 19 2021-11-09 19:09:36 2021-11-09 23:19:29 19
# 20 2021-11-09 19:41:42 2021-11-09 19:43:54 20
我们将从开始时间和结束时间开始输出帧(我刚刚经过最后一位顾客,您可以使用文字代码将其控制为“晚上 10 点”)。
times <- seq(ymd_hms("2021/11/9 6:00:00"), max(tbl$exit_time) + 30*60, by="15 mins")
times <- tibble(start_time = times[-length(times)], end_time = times[-1])
times
# # A tibble: 71 x 2
# start_time end_time
# <dttm> <dttm>
# 1 2021-11-09 06:00:00 2021-11-09 06:15:00
# 2 2021-11-09 06:15:00 2021-11-09 06:30:00
# 3 2021-11-09 06:30:00 2021-11-09 06:45:00
# 4 2021-11-09 06:45:00 2021-11-09 07:00:00
# 5 2021-11-09 07:00:00 2021-11-09 07:15:00
# 6 2021-11-09 07:15:00 2021-11-09 07:30:00
# 7 2021-11-09 07:30:00 2021-11-09 07:45:00
# 8 2021-11-09 07:45:00 2021-11-09 08:00:00
# 9 2021-11-09 08:00:00 2021-11-09 08:15:00
# 10 2021-11-09 08:15:00 2021-11-09 08:30:00
# # ... with 61 more rows
这是解决方案及其(截断的)输出:
sqldf::sqldf("
select t.start_time, t.end_time, count(*) as n_customers
from times t
left join tbl on (
tbl.entry_time between t.start_time and t.end_time
or tbl.exit_time between t.start_time and t.end_time
or (tbl.entry_time < t.start_time and tbl.exit_time > t.end_time)
)
where tbl.customerid is not null
group by t.start_time, t.end_time") %>%
mutate(across(c(start_time, end_time), ~ `attr<-`(., "tzone", "UTC"))) %>%
head(.)
# start_time end_time n_customers
# 1 2021-11-09 07:30:00 2021-11-09 07:45:00 1
# 2 2021-11-09 07:45:00 2021-11-09 08:00:00 2
# 3 2021-11-09 08:00:00 2021-11-09 08:15:00 2
# 4 2021-11-09 08:15:00 2021-11-09 08:30:00 2
# 5 2021-11-09 08:30:00 2021-11-09 08:45:00 2
# 6 2021-11-09 08:45:00 2021-11-09 09:00:00 1
# ...
(诚然,这里修改了TZ,但重置起来并不难。)
您好,我有商店顾客进出时间。我想从早上 6 点到晚上 10 点每隔 15 分钟统计一次店内顾客的数量。
谢谢
library(lubridate)
library(tidyverse)
entry_time <- ymd_hms("2021/11/9 6:00:00") + + runif(n=20, min=0, max = 14 * 3600)
exit_time <- entry_time + runif(n=20, min=0, max = 5 * 3600)
# tibble
tbl <- tibble(entry_time, exit_time) %>%
arrange(entry_time) %>%
mutate(customerID = fct_rev(factor(1:20)))
# for visualization ---------------------------------------------------------
tbl_longer <- tbl %>%
pivot_longer(cols = ends_with("time"),
names_to = "time",
values_to = "value")
ggplot(tbl_longer,
aes(x = customerID,
y = value)) +
geom_line(size = 3) +
coord_flip() +
theme_bw() +
theme(aspect.ratio = 0.7)
这里有一个 tidyverse 方法可以做到这一点不是基于范围的连接,但它有一些限制:
- 它与已知访问相符,因此如果您想要特定的时间范围(即,从早上 6 点开始),那么您需要将这些值从外部引入
0
- 它使用
lubridate::floor_date
并将所有 entry/exit 时间截断为之前的 15 分钟间隔;我认为这足以满足您的需求,但请进行一些检查以确保所有客户都已入账
library(dplyr)
library(purrr) # map2
library(lubridate) # floor_date
tbl %>%
mutate(times = purrr::map2(entry_time, exit_time, ~ seq(floor_date(.x, unit="15 mins"), floor_date(.y, unit="15 mins"), by="15 mins"))) %>%
select(times) %>%
unnest(times) %>%
group_by(times) %>%
tally()
# # A tibble: 64 x 2
# times n
# <dttm> <int>
# 1 2021-11-09 07:30:00 1
# 2 2021-11-09 07:45:00 2
# 3 2021-11-09 08:00:00 2
# 4 2021-11-09 08:15:00 2
# 5 2021-11-09 08:30:00 2
# 6 2021-11-09 08:45:00 1
# 7 2021-11-09 09:00:00 1
# 8 2021-11-09 09:15:00 1
# 9 2021-11-09 09:30:00 2
# 10 2021-11-09 09:45:00 2
# # ... with 54 more rows
这也可以通过基于范围的联接来完成。 dplyr
不直接支持,在 dplyr
中进行这些连接的方法往往效率低下;我在这里推荐 sqldf
,尽管 data.table
和 fuzzyjoin
解决方案也存在(参见 Merging two data frames by time range in R)。
可重现的数据:
library(dplyr)
library(lubridate)
set.seed(42)
entry_time <- ymd_hms("2021/11/9 6:00:00") + runif(n=20, min=0, max = 14 * 3600)
exit_time <- entry_time + runif(n=20, min=0, max = 5 * 3600)
tbl <- tibble(entry_time, exit_time) %>%
arrange(entry_time) %>%
mutate(customerID = as.character(1:20))
tbl
# # A tibble: 20 x 3
# entry_time exit_time customerID
# <dttm> <dttm> <chr>
# 1 2021-11-09 07:38:41 2021-11-09 08:40:59 1
# 2 2021-11-09 07:53:07 2021-11-09 12:24:50 2
# 3 2021-11-09 09:34:33 2021-11-09 13:00:06 3
# 4 2021-11-09 10:00:21 2021-11-09 14:57:01 4
# 5 2021-11-09 12:24:30 2021-11-09 16:05:46 5
# 6 2021-11-09 12:28:19 2021-11-09 12:29:30 6
# 7 2021-11-09 12:38:59 2021-11-09 17:10:58 7
# 8 2021-11-09 13:16:02 2021-11-09 15:50:18 8
# 9 2021-11-09 13:50:40 2021-11-09 16:54:12 9
# 10 2021-11-09 14:59:03 2021-11-09 15:23:47 10
# 11 2021-11-09 15:11:52 2021-11-09 17:25:57 11
# 12 2021-11-09 15:52:15 2021-11-09 20:03:03 12
# 13 2021-11-09 16:04:03 2021-11-09 20:07:22 13
# 14 2021-11-09 16:18:44 2021-11-09 18:15:47 14
# 15 2021-11-09 17:37:34 2021-11-09 22:21:34 15
# 16 2021-11-09 18:48:26 2021-11-09 23:19:38 16
# 17 2021-11-09 19:05:07 2021-11-09 21:01:33 17
# 18 2021-11-09 19:07:08 2021-11-09 19:48:45 18
# 19 2021-11-09 19:09:36 2021-11-09 23:19:29 19
# 20 2021-11-09 19:41:42 2021-11-09 19:43:54 20
我们将从开始时间和结束时间开始输出帧(我刚刚经过最后一位顾客,您可以使用文字代码将其控制为“晚上 10 点”)。
times <- seq(ymd_hms("2021/11/9 6:00:00"), max(tbl$exit_time) + 30*60, by="15 mins")
times <- tibble(start_time = times[-length(times)], end_time = times[-1])
times
# # A tibble: 71 x 2
# start_time end_time
# <dttm> <dttm>
# 1 2021-11-09 06:00:00 2021-11-09 06:15:00
# 2 2021-11-09 06:15:00 2021-11-09 06:30:00
# 3 2021-11-09 06:30:00 2021-11-09 06:45:00
# 4 2021-11-09 06:45:00 2021-11-09 07:00:00
# 5 2021-11-09 07:00:00 2021-11-09 07:15:00
# 6 2021-11-09 07:15:00 2021-11-09 07:30:00
# 7 2021-11-09 07:30:00 2021-11-09 07:45:00
# 8 2021-11-09 07:45:00 2021-11-09 08:00:00
# 9 2021-11-09 08:00:00 2021-11-09 08:15:00
# 10 2021-11-09 08:15:00 2021-11-09 08:30:00
# # ... with 61 more rows
这是解决方案及其(截断的)输出:
sqldf::sqldf("
select t.start_time, t.end_time, count(*) as n_customers
from times t
left join tbl on (
tbl.entry_time between t.start_time and t.end_time
or tbl.exit_time between t.start_time and t.end_time
or (tbl.entry_time < t.start_time and tbl.exit_time > t.end_time)
)
where tbl.customerid is not null
group by t.start_time, t.end_time") %>%
mutate(across(c(start_time, end_time), ~ `attr<-`(., "tzone", "UTC"))) %>%
head(.)
# start_time end_time n_customers
# 1 2021-11-09 07:30:00 2021-11-09 07:45:00 1
# 2 2021-11-09 07:45:00 2021-11-09 08:00:00 2
# 3 2021-11-09 08:00:00 2021-11-09 08:15:00 2
# 4 2021-11-09 08:15:00 2021-11-09 08:30:00 2
# 5 2021-11-09 08:30:00 2021-11-09 08:45:00 2
# 6 2021-11-09 08:45:00 2021-11-09 09:00:00 1
# ...
(诚然,这里修改了TZ,但重置起来并不难。)