使用进出时间记录每15分钟统计客户

count customers in every 15 minutes using entry exit time record

您好,我有商店顾客进出时间。我想从早上 6 点到晚上 10 点每隔 15 分钟统计一次店内顾客的数量。

谢谢

library(lubridate)
library(tidyverse)


entry_time <- ymd_hms("2021/11/9 6:00:00") +  + runif(n=20, min=0, max = 14 * 3600)
exit_time <- entry_time + runif(n=20, min=0, max = 5 * 3600)

# tibble
tbl <- tibble(entry_time, exit_time) %>%
  arrange(entry_time) %>%
  mutate(customerID = fct_rev(factor(1:20)))


# for visualization ---------------------------------------------------------

tbl_longer <- tbl %>%
  pivot_longer(cols = ends_with("time"),
               names_to = "time", 
               values_to = "value")

ggplot(tbl_longer,
       aes(x = customerID, 
           y = value)) + 
  geom_line(size = 3) + 
  coord_flip() + 
  theme_bw() + 
  theme(aspect.ratio = 0.7)

这里有一个 tidyverse 方法可以做到这一点不是基于范围的连接,但它有一些限制:

  • 它与已知访问相符,因此如果您想要特定的时间范围(即,从早上 6 点开始),那么您需要将这些值从外部引入 0
  • 它使用 lubridate::floor_date 并将所有 entry/exit 时间截断为之前的 15 分钟间隔;我认为这足以满足您的需求,但请进行一些检查以确保所有客户都已入账
library(dplyr)
library(purrr)     # map2
library(lubridate) # floor_date
tbl %>%
  mutate(times = purrr::map2(entry_time, exit_time, ~ seq(floor_date(.x, unit="15 mins"), floor_date(.y, unit="15 mins"), by="15 mins"))) %>%
  select(times) %>%
  unnest(times) %>%
  group_by(times) %>%
  tally()
# # A tibble: 64 x 2
#    times                   n
#    <dttm>              <int>
#  1 2021-11-09 07:30:00     1
#  2 2021-11-09 07:45:00     2
#  3 2021-11-09 08:00:00     2
#  4 2021-11-09 08:15:00     2
#  5 2021-11-09 08:30:00     2
#  6 2021-11-09 08:45:00     1
#  7 2021-11-09 09:00:00     1
#  8 2021-11-09 09:15:00     1
#  9 2021-11-09 09:30:00     2
# 10 2021-11-09 09:45:00     2
# # ... with 54 more rows

这也可以通过基于范围的联接来完成。 dplyr 不直接支持,在 dplyr 中进行这些连接的方法往往效率低下;我在这里推荐 sqldf,尽管 data.tablefuzzyjoin 解决方案也存在(参见 Merging two data frames by time range in R)。

可重现的数据:

library(dplyr)
library(lubridate)
set.seed(42)
entry_time <- ymd_hms("2021/11/9 6:00:00") + runif(n=20, min=0, max = 14 * 3600)
exit_time <- entry_time + runif(n=20, min=0, max = 5 * 3600)
tbl <- tibble(entry_time, exit_time) %>%
  arrange(entry_time) %>%
  mutate(customerID = as.character(1:20))

tbl
# # A tibble: 20 x 3
#    entry_time          exit_time           customerID
#    <dttm>              <dttm>              <chr>     
#  1 2021-11-09 07:38:41 2021-11-09 08:40:59 1         
#  2 2021-11-09 07:53:07 2021-11-09 12:24:50 2         
#  3 2021-11-09 09:34:33 2021-11-09 13:00:06 3         
#  4 2021-11-09 10:00:21 2021-11-09 14:57:01 4         
#  5 2021-11-09 12:24:30 2021-11-09 16:05:46 5         
#  6 2021-11-09 12:28:19 2021-11-09 12:29:30 6         
#  7 2021-11-09 12:38:59 2021-11-09 17:10:58 7         
#  8 2021-11-09 13:16:02 2021-11-09 15:50:18 8         
#  9 2021-11-09 13:50:40 2021-11-09 16:54:12 9         
# 10 2021-11-09 14:59:03 2021-11-09 15:23:47 10        
# 11 2021-11-09 15:11:52 2021-11-09 17:25:57 11        
# 12 2021-11-09 15:52:15 2021-11-09 20:03:03 12        
# 13 2021-11-09 16:04:03 2021-11-09 20:07:22 13        
# 14 2021-11-09 16:18:44 2021-11-09 18:15:47 14        
# 15 2021-11-09 17:37:34 2021-11-09 22:21:34 15        
# 16 2021-11-09 18:48:26 2021-11-09 23:19:38 16        
# 17 2021-11-09 19:05:07 2021-11-09 21:01:33 17        
# 18 2021-11-09 19:07:08 2021-11-09 19:48:45 18        
# 19 2021-11-09 19:09:36 2021-11-09 23:19:29 19        
# 20 2021-11-09 19:41:42 2021-11-09 19:43:54 20        

我们将从开始时间和结束时间开始输出帧(我刚刚经过最后一位顾客,您可以使用文字代码将其控制为“晚上 10 点”)。

times <- seq(ymd_hms("2021/11/9 6:00:00"), max(tbl$exit_time) + 30*60, by="15 mins")
times <- tibble(start_time = times[-length(times)], end_time = times[-1])
times
# # A tibble: 71 x 2
#    start_time          end_time           
#    <dttm>              <dttm>             
#  1 2021-11-09 06:00:00 2021-11-09 06:15:00
#  2 2021-11-09 06:15:00 2021-11-09 06:30:00
#  3 2021-11-09 06:30:00 2021-11-09 06:45:00
#  4 2021-11-09 06:45:00 2021-11-09 07:00:00
#  5 2021-11-09 07:00:00 2021-11-09 07:15:00
#  6 2021-11-09 07:15:00 2021-11-09 07:30:00
#  7 2021-11-09 07:30:00 2021-11-09 07:45:00
#  8 2021-11-09 07:45:00 2021-11-09 08:00:00
#  9 2021-11-09 08:00:00 2021-11-09 08:15:00
# 10 2021-11-09 08:15:00 2021-11-09 08:30:00
# # ... with 61 more rows

这是解决方案及其(截断的)输出:

sqldf::sqldf("
  select t.start_time, t.end_time, count(*) as n_customers 
  from times t 
    left join tbl on (
      tbl.entry_time between t.start_time and t.end_time 
        or tbl.exit_time between t.start_time and t.end_time 
        or (tbl.entry_time < t.start_time and tbl.exit_time > t.end_time)
      ) 
  where tbl.customerid is not null 
  group by t.start_time, t.end_time") %>%
  mutate(across(c(start_time, end_time), ~ `attr<-`(., "tzone", "UTC"))) %>%
  head(.)
#            start_time            end_time n_customers
# 1 2021-11-09 07:30:00 2021-11-09 07:45:00           1
# 2 2021-11-09 07:45:00 2021-11-09 08:00:00           2
# 3 2021-11-09 08:00:00 2021-11-09 08:15:00           2
# 4 2021-11-09 08:15:00 2021-11-09 08:30:00           2
# 5 2021-11-09 08:30:00 2021-11-09 08:45:00           2
# 6 2021-11-09 08:45:00 2021-11-09 09:00:00           1
# ...

(诚然,这里修改了TZ,但重置起来并不难。)