根据 R 中的多个条件(时间差和因子)按组折叠行

Collapse rows by group based on multiple conditions (time difference and factor) in R

我希望按组折叠数据行:折叠的依据是时间戳之间指定的时间差(即 60 分钟),和/或直到数据中满足某个特定条件为止。这是我正在使用的模拟数据框:

# Sample call-log data: one row per call attempt, ordered by user and time.

# Caller identifier for each row.
UserId <- c("2203af12ce3e", "2203af12ce3e", "2203af12ce3e", "2203af12ce3e",
            "3b9c32d4c700", "3b9c32d4c700", "3b9c32d4c700", "3b9c32d4c700",
            "3b9c32d4c700", "68b25fd3ca78", "68b25fd3ca78", "68b25fd3ca78",
            "68b25fd3ca78", "68b25fd3ca78", "fbbd0e13e61b", "fbbd0e13e61b",
            "fbbd0e13e61b", "808dcbe0cad2", "6f2020841f9e", "faf849c35400",
            "02002044e512", "02002044e512", "02002044e512", "02002044e512",
            "ff10b8560791", "ff10b8560791", "ff10b8560791", "ff10b8560791",
            "ff10b8560791", "ff10b8560791", "ff10b8560791", "ff10b8560791",
            "ff10b8560791", "ff10b8560791", "ff10b8560791", "ff10b8560791",
            "ff10b8560791", "ff10b8560791", "ff10b8560791", "ff10b8560791")

# First timestamp of each row (seconds since the epoch, stored as POSIXct, UTC).
OrigTime <- structure(
  c(1546313039, 1546313070, 1546313116, 1546344887, 1546366783,
    1546371206, 1546378029, 1546380713, 1546381727, 1546317095, 1546317335,
    1546319551, 1546347453, 1546355351, 1546381815, 1546381844, 1546381873,
    1546355462, 1546370527, 1546354015, 1546310854, 1546311154, 1546311218,
    1546311772, 1546308212, 1546308230, 1546308248, 1546308268, 1546308298,
    1546308317, 1546330603, 1546330620, 1546330637, 1546330650, 1546330676,
    1546334052, 1546334129, 1546334523, 1546334867, 1546334902),
  class = c("POSIXct", "POSIXt"), tzone = "UTC"
)

# Last timestamp of each row (same representation as OrigTime).
LastTime <- structure(
  c(1546313039, 1546313070, 1546313448, 1546344887, 1546366783,
    1546371206, 1546378029, 1546380713, 1546381727, 1546317095, 1546318123,
    1546319551, 1546347453, 1546355351, 1546381815, 1546381844, 1546381873,
    1546355462, 1546370527, 1546354015, 1546311063, 1546311154, 1546311746,
    1546313128, 1546308212, 1546308230, 1546308248, 1546308268, 1546308298,
    1546310346, 1546330603, 1546330620, 1546330637, 1546330650, 1546330676,
    1546334052, 1546334129, 1546334523, 1546334867, 1546334902),
  class = c("POSIXct", "POSIXt"), tzone = "UTC"
)

# Number of calls represented by each row.
calls <- c(1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           4, 1, 3, 4, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

# Outcome of each row.
Status <- c("Engaged", "Engaged", "Abandoned", "Abandoned", "Answered",
            "Answered", "Answered", "Answered", "Answered", "Engaged", "Engaged",
            "Engaged", "Answered", "Answered", "Engaged", "Engaged", "Answered",
            "Answered", "Answered", "Answered", "Engaged", "Engaged", "Engaged",
            "Abandoned", "Engaged", "Engaged", "Engaged", "Engaged", "Engaged",
            "Answered", "Engaged", "Engaged", "Engaged", "Engaged", "Answered",
            "Answered", "Answered", "Answered", "Answered", "Answered")

# "Y"/"N" flag recorded alongside Status.
Successful <- c("N", "N", "N", "N", "Y", "Y", "Y", "Y", "Y", "N", "N", "N",
                "Y", "Y", "N", "N", "Y", "Y", "Y", "Y", "N", "N", "N", "N", "N",
                "N", "N", "N", "N", "Y", "N", "N", "N", "N", "Y", "Y", "Y", "Y",
                "Y", "Y")

# Build the data frame directly from the vectors. Going through cbind() would
# first create a character matrix, which turns `calls` into text and strips the
# POSIXct class and UTC time zone from the timestamp columns.
df <- data.frame(
  UserId, OrigTime, LastTime, calls, Status, Successful,
  stringsAsFactors = FALSE
)


下面是我正在努力实现的一些示例:-

示例 1

下面是 df 中 UserId = "2203af12ce3e" 的行:

之前

# Rows of df belonging to the first example user.
df %>% filter(UserId == "2203af12ce3e")
             OrigTime            LastTime calls    Status Successful       UserId
1 2019-01-01 03:23:59 2019-01-01 03:23:59     1   Engaged          N 2203af12ce3e
2 2019-01-01 03:24:30 2019-01-01 03:24:30     1   Engaged          N 2203af12ce3e
3 2019-01-01 03:25:16 2019-01-01 03:30:48     3 Abandoned          N 2203af12ce3e
4 2019-01-01 12:14:47 2019-01-01 12:14:47     1 Abandoned          N 2203af12ce3e

我想把后一行的 OrigTime 与前一行的 LastTime 相差在 60 分钟以内的行折叠起来,同时对 calls 求和并显示最后一个 Status。所以在这种情况下,我会将第 1-3 行折叠成一行,对 calls 求和,最后的 Status 为 "Abandoned",如下所示:

结果

             OrigTime            LastTime calls    Status Successful       UserId
1 2019-01-01 03:23:59 2019-01-01 03:30:48     5 Abandoned          N 2203af12ce3e   
2 2019-01-01 12:14:47 2019-01-01 12:14:47     1 Abandoned          N 2203af12ce3e

请注意,“之前”部分中第 1 行的 OrigTime 和第 3 行的 LastTime 现在位于同一行,第 1-3 行的 calls 被加总,并取第 3 行的 Status 作为最后一个 Status,因为第 1-3 行都发生在 60 分钟的时间窗口内。第 4 行保持原样,因为它的 OrigTime 比上一个 LastTime 晚了 60 分钟以上。

我想再举一个例子来说明我想做什么。

示例 2

如果在 "Answered" 状态出现之前有许多 "Abandoned" 和 "Engaged" 状态,并且它们都发生在 60 分钟的时间窗口内,那么我希望把这些行折叠在一起,并以 "Answered" 作为最后的状态。这是示例 2:

之前

# Rows of df belonging to the second example user.
df %>% filter(UserId == "ff10b8560791")
              OrigTime            LastTime calls   Status Successful       UserId
1  2019-01-01 02:03:32 2019-01-01 02:03:32     1  Engaged          N ff10b8560791
2  2019-01-01 02:03:50 2019-01-01 02:03:50     1  Engaged          N ff10b8560791
3  2019-01-01 02:04:08 2019-01-01 02:04:08     1  Engaged          N ff10b8560791
4  2019-01-01 02:04:28 2019-01-01 02:04:28     1  Engaged          N ff10b8560791
5  2019-01-01 02:04:58 2019-01-01 02:04:58     1  Engaged          N ff10b8560791
6  2019-01-01 02:05:17 2019-01-01 02:39:06     3 Answered          Y ff10b8560791
7  2019-01-01 08:16:43 2019-01-01 08:16:43     1  Engaged          N ff10b8560791
8  2019-01-01 08:17:00 2019-01-01 08:17:00     1  Engaged          N ff10b8560791
9  2019-01-01 08:17:17 2019-01-01 08:17:17     1  Engaged          N ff10b8560791
10 2019-01-01 08:17:30 2019-01-01 08:17:30     1  Engaged          N ff10b8560791
11 2019-01-01 08:17:56 2019-01-01 08:17:56     1 Answered          Y ff10b8560791
12 2019-01-01 09:14:12 2019-01-01 09:14:12     1 Answered          Y ff10b8560791
13 2019-01-01 09:15:29 2019-01-01 09:15:29     1 Answered          Y ff10b8560791
14 2019-01-01 09:22:03 2019-01-01 09:22:03     1 Answered          Y ff10b8560791
15 2019-01-01 09:27:47 2019-01-01 09:27:47     1 Answered          Y ff10b8560791
16 2019-01-01 09:28:22 2019-01-01 09:28:22     1 Answered          Y ff10b8560791

与前面的示例一样,我想折叠 OrigTime 与前一行 LastTime 相差在 60 分钟以内的行,对 calls 列求和,并给出这些行的最后一个 Status。但是,这里有一个额外的条件:如果某一行的 Status 为 "Answered",并且它之前的行都发生在该 "Answered" 行之前的 60 分钟窗口内,那么我想在这一行停止,把之前的这些行汇总进来,最终 Status 为 "Answered"。这是此示例的结果:

结果

              OrigTime            LastTime calls   Status   Successful       UserId
1  2019-01-01 02:03:32 2019-01-01 02:39:06   8    Answered          Y   ff10b8560791
2  2019-01-01 08:16:43 2019-01-01 08:17:56   5    Answered          Y   ff10b8560791
3  2019-01-01 09:14:12 2019-01-01 09:14:12   1    Answered          Y   ff10b8560791
4  2019-01-01 09:15:29 2019-01-01 09:15:29   1    Answered          Y   ff10b8560791
5  2019-01-01 09:22:03 2019-01-01 09:22:03   1    Answered          Y   ff10b8560791
6  2019-01-01 09:27:47 2019-01-01 09:27:47   1    Answered          Y   ff10b8560791
7  2019-01-01 09:28:22 2019-01-01 09:28:22   1    Answered          Y   ff10b8560791

所以在这种情况下,“之前”部分中的第 1-6 行被折叠,因为它们都发生在 60 分钟内,并且在 Status 为 "Answered" 时停止;第 7-11 行也是同样的情况;其余 Status 为 "Answered" 的行则保持不变。我希望我提供的结果部分能够清楚地说明我想做的事情。任何帮助将不胜感激 :)

我们依据以下每一项来划分新的分组:

  1. UserId
  2. 如果status = 'Answered'
  3. 60 分钟间隔

对于创建的每个组,我们选取:

  1. 第一个OrigTime
  2. 最后一个LastTime
  3. calls 的总和(sum)
  4. 最后 Status
  5. 最后 Successful.
library(dplyr)

# Collapse the call log into one row per (UserId, grp1, grp2) group.
#
# grp1: number of "Answered" rows seen before the current row, counted over the
#       whole (not yet grouped) table, so the row after each "Answered" row
#       gets a new value.
# grp2: whole hours between the row's LastTime and the user's first OrigTime.
# NOTE(review): grp2 is a fixed one-hour bucket anchored at each user's first
# OrigTime, not a rolling 60-minute gap from the previous row's LastTime, so
# two rows a few minutes apart can still land in different buckets. Confirm
# this matches the intended rule.
result <- df %>%
  mutate(grp1 = lag(cumsum(Status == "Answered"), default = 0)) %>%
  group_by(UserId) %>%
  mutate(
    grp2 = floor(as.numeric(
      difftime(LastTime, first(OrigTime), units = "hours")
    ))
  ) %>%
  group_by(UserId, grp1, grp2) %>%
  summarise(
    OrigTime = first(OrigTime),     # earliest start in the group
    LastTime = last(LastTime),      # latest end in the group
    calls = sum(calls),             # total calls in the group
    Status = last(Status),          # status of the group's final row
    Successful = last(Successful)   # flag of the group's final row
  ) %>%
  ungroup()

检查结果:

result %>%
  filter(UserId == "2203af12ce3e")
# UserId        grp1  grp2 OrigTime            LastTime            calls Status    Successful
#  <chr>        <dbl> <dbl> <dttm>              <dttm>              <dbl> <chr>     <chr>     
#1 2203af12ce3e     0     0 2019-01-01 11:23:59 2019-01-01 11:30:48     5 Abandoned N         
#2 2203af12ce3e     0     8 2019-01-01 20:14:47 2019-01-01 20:14:47     1 Abandoned N

result %>%
  filter(UserId == "ff10b8560791")
# A tibble: 7 x 8
#  UserId        grp1  grp2 OrigTime            LastTime            calls Status   Successful
#  <chr>        <dbl> <dbl> <dttm>              <dttm>              <dbl> <chr>    <chr>     
#1 ff10b8560791    11     0 2019-01-01 10:03:32 2019-01-01 10:39:06     8 Answered Y         
#2 ff10b8560791    12     6 2019-01-01 16:16:43 2019-01-01 16:17:56     5 Answered Y         
#3 ff10b8560791    13     7 2019-01-01 17:14:12 2019-01-01 17:14:12     1 Answered Y         
#4 ff10b8560791    14     7 2019-01-01 17:15:29 2019-01-01 17:15:29     1 Answered Y         
#5 ff10b8560791    15     7 2019-01-01 17:22:03 2019-01-01 17:22:03     1 Answered Y         
#6 ff10b8560791    16     7 2019-01-01 17:27:47 2019-01-01 17:27:47     1 Answered Y         
#7 ff10b8560791    17     7 2019-01-01 17:28:22 2019-01-01 17:28:22     1 Answered Y

由于我们所在的时区不同,时间值不同。

要构造数据框,请不要使用 cbind(因为它会把数据转换为矩阵),而应直接使用 data.frame:

# Build the data frame straight from the column vectors (no cbind() step).
df <- data.frame(UserId, OrigTime, LastTime, calls, Status, Successful)