如何使用 R 中的 Dataframes 根据日期时间计算随时间变化的事件并按条件分组

Question

我遇到了一个难题：需要对两个不同的数据集进行数据整理（data wrangling），并据此构建一张表。

数据集A：有服装店的购买信息，变量为：客户姓名、购买日期、代理商和在时间段t内购买的产品。

# Purchase records from the question: one row per purchase, holding the
# customer, the product, the selling agent and the purchase time as text.
# NOTE(review): every timestamp spells the year as "20021" - presumably a typo
# for 2021; confirm before parsing these strings as date-times.
df1 <- tibble::tibble(
  NAME = c(
    "Karen", "Jean", "Jean", "Jean", "Keith", "Alba", "Alba", "Alex"
  ),
  PRODUCT = c(
    "M_14", "M_78", "M_71", "M_64", "M_57", "M_50", "M_43", "M_36"
  ),
  AGENT = c(
    "X_1", "X_3", "X_4", "X_4", "X_4", "X_1", "X_3", "X_2"
  ),
  DATE_PURCHASE = c(
    "8-25-20021 18:21:28",
    "8-26-20021 18:11:06",
    "8-26-20021 18:21:01",
    "8-27-20021 20:21:59",
    "8-27-20021 20:21:02",
    "8-28-20021 20:21:03",
    "8-29-20021 20:21:04",
    "8-25-20021 20:21:05"
  )
)

数据集B：包含在时间t内拨打公司CX SERVICE专线的客户信息，存储客户姓名、呼叫日期、呼叫类型等变量。

# Support-line call records from the question: one row per call, holding the
# customer, the call type and the call time as text.
# NOTE(review): every timestamp spells the year as "20021" - presumably a typo
# for 2021; confirm before parsing these strings as date-times.
df2 <- tibble::tibble(
  NAME = c("Karen", "Jean", "Jean", "Jean", "Keith", "Alba", "Alex"),
  TYPE = c(
    "COMPLAIN", "CX_SERVICE", "COMPLAIN", "CX_SERVICE",
    "CX_SERVICE", "COMPLAIN", "CX_SERVICE"
  ),
  DATE_OF_CALL = c(
    "8-26-20021 18:21:28",
    "8-27-20021 18:11:06",
    "8-28-20021 18:21:01",
    "8-29-20021 20:21:59",
    "8-29-20021 20:21:02",
    "8-30-20021 20:21:03",
    "8-25-20021 21:21:05"
  )
)

我需要基于上述数据构建一个新的数据集：创建一个名为“x attempt”的新变量，用来表示这是该客户第一次、第二次、第三次……拨打专线；同时为每个客户返回其最后一次来电之前购买的最后一件产品，以及该次来电的类型。我知道这听起来可能令人困惑，所以下面给出所需 table 的示例：

| NAME | x attempt | product | TYPE       | DATE_CALL           | DATE_PURCHASE       |
|------|-----------|---------|------------|---------------------|---------------------|
| Jean | 3         | M_64    | CX_SERVICE | 8-29-20021 20:21:59 | 8-27-20021 20:21:59 |

这个结果是正确的，因为根据记录...这将是 Jean 的第三次通话，最后一次通话的类型是 CX_SERVICE，时间是 8-29-20021 20:21:59，她最后一次购买的产品是 M_64，时间是 8-27-20021 20:21:59。

Answer 1

我相信你可能弄错了年份，所以我删除了年份 (2021) 中多余的零。我看到你正在使用 tibbles，因此我会提供一个 tidyverse 解决这个问题的方法。

这段代码的思路是：先分别处理两个 tibble，然后通过它们共同的键 NAME 将结果连接起来。这样应该就可以了：

library(dplyr)

# Purchase records with the year written as 2021: one row per purchase,
# holding customer, product, agent and the purchase time as
# "month-day-year hour:minute:second" text.
df1 <- tibble::tibble(
  NAME = c(
    "Karen", "Jean", "Jean", "Jean", "Keith", "Alba", "Alba", "Alex"
  ),
  PRODUCT = c(
    "M_14", "M_78", "M_71", "M_64", "M_57", "M_50", "M_43", "M_36"
  ),
  AGENT = c(
    "X_1", "X_3", "X_4", "X_4", "X_4", "X_1", "X_3", "X_2"
  ),
  DATE_PURCHASE = c(
    "8-25-2021 18:21:28",
    "8-26-2021 18:11:06",
    "8-26-2021 18:21:01",
    "8-27-2021 20:21:59",
    "8-27-2021 20:21:02",
    "8-28-2021 20:21:03",
    "8-29-2021 20:21:04",
    "8-25-2021 20:21:05"
  )
)

# Support-line call records with the year written as 2021: one row per call,
# holding customer, call type and the call time as
# "month-day-year hour:minute:second" text.
df2 <- tibble::tibble(
  NAME = c("Karen", "Jean", "Jean", "Jean", "Keith", "Alba", "Alex"),
  TYPE = c(
    "COMPLAIN", "CX_SERVICE", "COMPLAIN", "CX_SERVICE",
    "CX_SERVICE", "COMPLAIN", "CX_SERVICE"
  ),
  DATE_OF_CALL = c(
    "8-26-2021 18:21:28",
    "8-27-2021 18:11:06",
    "8-28-2021 18:21:01",
    "8-29-2021 20:21:59",
    "8-29-2021 20:21:02",
    "8-30-2021 20:21:03",
    "8-25-2021 21:21:05"
  )
)

# Latest purchase per customer. The text timestamps are parsed to POSIXct
# first so that max()/which.max() compare real date-times, not strings.
# NOTE(review): this is the latest purchase overall; nothing in this step
# checks that it happened before the customer's last call.
(df1_mod <- df1 %>%
  mutate(
    DATE_PURCHASE = as.POSIXct(DATE_PURCHASE, format = "%m-%d-%Y %H:%M:%S")
  ) %>%
  group_by(NAME) %>%
  summarise(
    # which.max() yields a single position even when two purchases share the
    # latest timestamp, so a tie cannot produce duplicate rows for a customer.
    # It has to run before DATE_PURCHASE is collapsed to its maximum below.
    product = PRODUCT[which.max(DATE_PURCHASE)],
    DATE_PURCHASE = max(DATE_PURCHASE), # most recent date_purchase
    .groups = "drop"
  ))
#> # A tibble: 5 x 3
#>   NAME  product DATE_PURCHASE      
#>   <chr> <chr>   <dttm>             
#> 1 Alba  M_43    2021-08-29 20:21:04
#> 2 Alex  M_36    2021-08-25 20:21:05
#> 3 Jean  M_64    2021-08-27 20:21:59
#> 4 Karen M_14    2021-08-25 18:21:28
#> 5 Keith M_57    2021-08-27 20:21:02

# Call summary per customer: how many calls were made, plus the type and time
# of the most recent one. The text timestamps are parsed to POSIXct first so
# that max()/which.max() compare real date-times, not strings.
(df2_mod <- df2 %>%
  mutate(
    DATE_OF_CALL = as.POSIXct(DATE_OF_CALL, format = "%m-%d-%Y %H:%M:%S")
  ) %>%
  group_by(NAME) %>%
  summarise(
    `x attempt` = n(), # number of calls = number of rows in the group
    # which.max() yields a single position even when two calls share the
    # latest timestamp, so a tie cannot produce duplicate rows for a customer.
    # It has to run before DATE_OF_CALL is collapsed to its maximum below.
    TYPE = TYPE[which.max(DATE_OF_CALL)],
    DATE_OF_CALL = max(DATE_OF_CALL), # most recent date_of_call
    .groups = "drop"
  ))
#> # A tibble: 5 x 4
#>   NAME  `x attempt` TYPE       DATE_OF_CALL       
#>   <chr>       <int> <chr>      <dttm>             
#> 1 Alba            1 COMPLAIN   2021-08-30 20:21:03
#> 2 Alex            1 CX_SERVICE 2021-08-25 21:21:05
#> 3 Jean            3 CX_SERVICE 2021-08-29 20:21:59
#> 4 Karen           1 COMPLAIN   2021-08-26 18:21:28
#> 5 Keith           1 CX_SERVICE 2021-08-29 20:21:02
  
# Attach each customer's call summary to their latest purchase, matching on
# NAME. Every row of df1_mod is kept; call columns are NA where no call exists.
df1_mod %>%
  left_join(df2_mod, by = "NAME")
#> # A tibble: 5 x 6
#>   NAME  product DATE_PURCHASE       `x attempt` TYPE       DATE_OF_CALL       
#>   <chr> <chr>   <dttm>                    <int> <chr>      <dttm>             
#> 1 Alba  M_43    2021-08-29 20:21:04           1 COMPLAIN   2021-08-30 20:21:03
#> 2 Alex  M_36    2021-08-25 20:21:05           1 CX_SERVICE 2021-08-25 21:21:05
#> 3 Jean  M_64    2021-08-27 20:21:59           3 CX_SERVICE 2021-08-29 20:21:59
#> 4 Karen M_14    2021-08-25 18:21:28           1 COMPLAIN   2021-08-26 18:21:28
#> 5 Keith M_57    2021-08-27 20:21:02           1 CX_SERVICE 2021-08-29 20:21:02
Created on 2021-04-10 by the reprex package (v0.3.0)

Answer 2

谢谢。我对这个需求的理解是：每次通话输出一行；而上面的回答是每个客户返回一行。我的回答要冗长得多，但它会针对每一次通话，返回此前最近一次通话和最近一次购买的信息。这可能不是你想要的，但我还是把它写出来了 :-)


    library(tidyverse)
    library(lubridate)
    
    # Simulated purchase log: n_purchases rows, each with a random customer
    # ("Name_A".."Name_Z"), product, agent and a UTC timestamp between 2000 and
    # 2019. Days stop at 28 so the date is valid in every month.
    #
    # Every column is sampled at the same length so nothing is silently
    # recycled, and the timestamp is assembled from numeric fields kept inside
    # valid clock ranges (hour 0-23, minute/second 0-59) so no row ends up NA.
    n_purchases <- 1000
    df1_purchases <- data.frame(
      cust_name = paste0(
        "Name_", sample(LETTERS, n_purchases, replace = TRUE)
      ),
      product = paste0(
        "Product_", sample(1:500, n_purchases, replace = TRUE)
      ),
      agent = paste0(
        "Agent_Name_", sample(LETTERS[1:5], n_purchases, replace = TRUE)
      ),
      purchase_datetime = ISOdatetime(
        year = sample(2000:2019, n_purchases, replace = TRUE),
        month = sample(1:12, n_purchases, replace = TRUE),
        day = sample(1:28, n_purchases, replace = TRUE),
        hour = sample(0:23, n_purchases, replace = TRUE),
        min = sample(0:59, n_purchases, replace = TRUE),
        sec = sample(0:59, n_purchases, replace = TRUE),
        tz = "UTC"
      )
    )
    
    # Simulated call log: n_calls rows, each with a random customer
    # ("Name_A".."Name_Z"), a call type and a UTC timestamp between 2000 and
    # 2019. Days stop at 28 so the date is valid in every month.
    #
    # Every column is sampled at the same length; a single longer vector would
    # make data.frame() recycle the others and inflate the row count. The
    # timestamp is assembled from numeric fields kept inside valid clock ranges
    # (hour 0-23, minute/second 0-59) so no row ends up NA.
    n_calls <- 1000
    df2_calls <- data.frame(
      cust_name = paste0(
        "Name_", sample(LETTERS, n_calls, replace = TRUE)
      ),
      type = paste0(
        "calltype_", sample(c("Complaint", "Service"), n_calls, replace = TRUE)
      ),
      call_datetime = ISOdatetime(
        year = sample(2000:2019, n_calls, replace = TRUE),
        month = sample(1:12, n_calls, replace = TRUE),
        day = sample(1:28, n_calls, replace = TRUE),
        hour = sample(0:23, n_calls, replace = TRUE),
        min = sample(0:59, n_calls, replace = TRUE),
        sec = sample(0:59, n_calls, replace = TRUE),
        tz = "UTC"
      )
    )
    
    # Empty result container: a data frame with no rows and no columns.
    output_df <- data.frame()
    
    # Describe what one customer did before a given call.
    #
    # Arguments:
    #   df2_calls           data frame of calls; uses columns cust_name and
    #                       call_datetime
    #   df1_purchases       data frame of purchases; uses columns cust_name,
    #                       product and purchase_datetime
    #   cust                name of the customer who made the call
    #   this_call_datetime  date-time of the call being described
    #
    # Returns a one-row data frame with columns:
    #   cust_name               the customer
    #   prev_calls              number of this customer's calls strictly before
    #                           this_call_datetime
    #   prev_purchase_item      product of this customer's latest purchase
    #                           strictly before the call (NA if none)
    #   prev_call_datetime      time of this customer's latest earlier call
    #                           (NA if none)
    #   prev_purchase_datetime  time of that latest earlier purchase (NA if
    #                           none)
    #   this_call_datetime      the call time passed in
    # The type of the previous call is not part of the returned row.
    f <- function(df2_calls, df1_purchases, cust, this_call_datetime) {
      # Earlier calls by THIS customer only. which() discards comparisons that
      # evaluate to NA, so rows with a missing timestamp never count.
      is_prior_call <- df2_calls$cust_name == cust &
        df2_calls$call_datetime < this_call_datetime
      prior_call_times <- df2_calls$call_datetime[which(is_prior_call)]

      prev_calls <- length(prior_call_times)
      prev_call_date_time <- if (prev_calls > 0) {
        max(prior_call_times)
      } else {
        # Indexing with NA gives a missing value that keeps the column's class,
        # so rows with and without history combine without type changes.
        df2_calls$call_datetime[NA_integer_]
      }

      # Earlier purchases by this customer, filtered the same way.
      is_prior_purchase <- df1_purchases$cust_name == cust &
        df1_purchases$purchase_datetime < this_call_datetime
      prior_purchases <- df1_purchases[which(is_prior_purchase), , drop = FALSE]

      if (nrow(prior_purchases) > 0) {
        # The position from which.max() is applied to the same filtered rows
        # it was computed on.
        latest <- which.max(prior_purchases$purchase_datetime)
        prev_purchase_item <- prior_purchases$product[latest]
        prev_purchase_datetime <- prior_purchases$purchase_datetime[latest]
      } else {
        prev_purchase_item <- df1_purchases$product[NA_integer_]
        prev_purchase_datetime <- df1_purchases$purchase_datetime[NA_integer_]
      }

      data.frame(
        cust_name = cust,
        prev_calls = prev_calls,
        prev_purchase_item = prev_purchase_item,
        prev_call_datetime = prev_call_date_time,
        prev_purchase_datetime = prev_purchase_datetime,
        this_call_datetime = this_call_datetime
      )
    }
    
    number_of_call_rows <- nrow(df2_calls)

    # Build one summary row per call, then combine them in a single step.
    # seq_len() gives an empty sequence when there are no calls (1:0 would
    # iterate over 1 and 0), and binding once avoids re-copying the growing
    # result on every iteration, as rbind() inside a loop does.
    call_rows <- lapply(seq_len(number_of_call_rows), function(i) {
      f(
        df2_calls,
        df1_purchases,
        df2_calls$cust_name[i],
        df2_calls$call_datetime[i]
      )
    })
    output_df <- bind_rows(call_rows)

    glimpse(output_df)

如何使用 R 中的 Dataframes 根据日期时间计算随时间变化的事件并按条件分组

How to count events over time and group by conditions based on datetimes with Dataframes in R

datetime

group-by

r

dataframe

data-wrangling