如何使用 R 中的 Dataframes 根据日期时间计算随时间变化的事件并按条件分组
How to count events over time and group by conditions based on datetimes with Dataframes in R
我需要对两个不同的数据集做一些数据整理(data wrangling),从而构建一张表,这对我来说是个挑战。
数据集A:有服装店的购买信息,变量为:客户姓名、购买日期、代理商和在时间段t内购买的产品。
# Purchase records as posted in the question: one row per purchase with the
# customer name, product, agent and purchase timestamp (month-day-year text).
# NOTE(review): every year is written as "20021" -- presumably a typo for
# 2021; confirm before parsing these strings with a "%Y" format.
df1 <- tibble::tribble(
~NAME, ~PRODUCT, ~AGENT, ~DATE_PURCHASE,
"Karen", "M_14", "X_1", "8-25-20021 18:21:28",
"Jean", "M_78", "X_3", "8-26-20021 18:11:06",
"Jean", "M_71", "X_4", "8-26-20021 18:21:01",
"Jean", "M_64", "X_4", "8-27-20021 20:21:59",
"Keith", "M_57", "X_4", "8-27-20021 20:21:02",
"Alba", "M_50", "X_1", "8-28-20021 20:21:03",
"Alba", "M_43", "X_3", "8-29-20021 20:21:04",
"Alex", "M_36", "X_2", "8-25-20021 20:21:05"
)
数据集B:包含在时间t内拨打公司CX SERVICE专线的客户信息,存储客户姓名、呼叫日期、呼叫类型等变量。
# Call records as posted in the question: one row per call with the customer
# name, call type and call timestamp (month-day-year text).
# NOTE(review): every year is written as "20021" -- presumably a typo for
# 2021; confirm before parsing these strings with a "%Y" format.
df2 <- tibble::tribble(
~NAME, ~TYPE, ~DATE_OF_CALL,
"Karen", "COMPLAIN", "8-26-20021 18:21:28",
"Jean", "CX_SERVICE", "8-27-20021 18:11:06",
"Jean", "COMPLAIN", "8-28-20021 18:21:01",
"Jean", "CX_SERVICE", "8-29-20021 20:21:59",
"Keith", "CX_SERVICE", "8-29-20021 20:21:02",
"Alba", "COMPLAIN", "8-30-20021 20:21:03",
"Alex", "CX_SERVICE", "8-25-20021 21:21:05",
)
我需要构建如下数据集:创建一个名为“x attempt”的新变量,用来表示这是该客户第几次拨打热线(第一次、第二次、第三次……),并为每位客户返回其最后一次来电之前购买的最后一件产品,以及该次来电的类型。我知道这听起来可能有些令人困惑,所以下面是所需表格的示例:
NAME | x attempt | product | TYPE | DATE_CALL | DATE_PURCHASE |
Jean | 3 | M_64 | CX_SERVICE | 8-29-20021 20:21:59 | 8-27-20021 20:21:59 |
这个结果是正确的,因为根据记录...这将是 Jean 的第三次通话,最后一次通话的类型是 CX_SERVICE,时间是 8-29-20021 20:21:59,她最后一次购买的产品是 M_64,时间是 8-27-20021 20:21:59。
我认为你可能把年份写错了,所以我删掉了年份(2021)中多余的零。我看到你在使用 tibble,因此我提供一个基于 tidyverse 的解决方法。
这段代码的思路是先分别处理两个 tibble,然后通过它们共有的键 NAME 将它们连接起来。
应该这样做:
library(dplyr)
# Purchase records with the year written as 2021: one row per purchase with
# the customer name, product, agent and purchase timestamp. The timestamps
# are month-day-year text, matching the "%m-%d-%Y %H:%M:%S" format.
df1 <- tibble::tribble(
~NAME, ~PRODUCT, ~AGENT, ~DATE_PURCHASE,
"Karen", "M_14", "X_1", "8-25-2021 18:21:28",
"Jean", "M_78", "X_3", "8-26-2021 18:11:06",
"Jean", "M_71", "X_4", "8-26-2021 18:21:01",
"Jean", "M_64", "X_4", "8-27-2021 20:21:59",
"Keith", "M_57", "X_4", "8-27-2021 20:21:02",
"Alba", "M_50", "X_1", "8-28-2021 20:21:03",
"Alba", "M_43", "X_3", "8-29-2021 20:21:04",
"Alex", "M_36", "X_2", "8-25-2021 20:21:05"
)
# Call records with the year written as 2021: one row per call with the
# customer name, call type and call timestamp. The timestamps are
# month-day-year text, matching the "%m-%d-%Y %H:%M:%S" format.
df2 <- tibble::tribble(
~NAME, ~TYPE, ~DATE_OF_CALL,
"Karen", "COMPLAIN", "8-26-2021 18:21:28",
"Jean", "CX_SERVICE", "8-27-2021 18:11:06",
"Jean", "COMPLAIN", "8-28-2021 18:21:01",
"Jean", "CX_SERVICE", "8-29-2021 20:21:59",
"Keith", "CX_SERVICE", "8-29-2021 20:21:02",
"Alba", "COMPLAIN", "8-30-2021 20:21:03",
"Alex", "CX_SERVICE", "8-25-2021 21:21:05",
)
# Per-customer summary of the most recent purchase: one row per NAME with the
# product bought last and the time it was bought. The outer parentheses print
# the result as well as assigning it.
(df1_mod <- df1 %>%
  mutate(
    DATE_PURCHASE = as.POSIXct(DATE_PURCHASE, format = "%m-%d-%Y %H:%M:%S")
  ) %>%
  group_by(NAME) %>%
  # with_ties = FALSE guarantees exactly one row per customer even when two
  # purchases share the same latest timestamp; indexing with
  # `DATE_PURCHASE == max(DATE_PURCHASE)` would return several products then.
  slice_max(DATE_PURCHASE, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(NAME, product = PRODUCT, DATE_PURCHASE))
#> # A tibble: 5 x 3
#> NAME product DATE_PURCHASE
#> <chr> <chr> <dttm>
#> 1 Alba M_43 2021-08-29 20:21:04
#> 2 Alex M_36 2021-08-25 20:21:05
#> 3 Jean M_64 2021-08-27 20:21:59
#> 4 Karen M_14 2021-08-25 18:21:28
#> 5 Keith M_57 2021-08-27 20:21:02
# Per-customer summary of calls: one row per NAME with the number of calls
# made (`x attempt`) and the type and time of the most recent call. The outer
# parentheses print the result as well as assigning it.
(df2_mod <- df2 %>%
  mutate(
    DATE_OF_CALL = as.POSIXct(DATE_OF_CALL, format = "%m-%d-%Y %H:%M:%S")
  ) %>%
  group_by(NAME) %>%
  # Count the calls while every row of the group is still present.
  mutate(`x attempt` = n()) %>%
  # with_ties = FALSE guarantees exactly one row per customer even when two
  # calls share the same latest timestamp; indexing with
  # `DATE_OF_CALL == max(DATE_OF_CALL)` would return several types then.
  slice_max(DATE_OF_CALL, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(NAME, `x attempt`, TYPE, DATE_OF_CALL))
#> # A tibble: 5 x 4
#> NAME `x attempt` TYPE DATE_OF_CALL
#> <chr> <int> <chr> <dttm>
#> 1 Alba 1 COMPLAIN 2021-08-30 20:21:03
#> 2 Alex 1 CX_SERVICE 2021-08-25 21:21:05
#> 3 Jean 3 CX_SERVICE 2021-08-29 20:21:59
#> 4 Karen 1 COMPLAIN 2021-08-26 18:21:28
#> 5 Keith 1 CX_SERVICE 2021-08-29 20:21:02
# Combine the per-customer purchase and call summaries on NAME. A left join
# keeps every customer present in df1_mod; a customer with no row in df2_mod
# gets NA in the call columns.
# NOTE(review): df1_mod holds each customer's latest purchase overall, which
# is not necessarily a purchase made before that customer's latest call.
left_join(df1_mod, df2_mod, by = "NAME")
#> # A tibble: 5 x 6
#> NAME product DATE_PURCHASE `x attempt` TYPE DATE_OF_CALL
#> <chr> <chr> <dttm> <int> <chr> <dttm>
#> 1 Alba M_43 2021-08-29 20:21:04 1 COMPLAIN 2021-08-30 20:21:03
#> 2 Alex M_36 2021-08-25 20:21:05 1 CX_SERVICE 2021-08-25 21:21:05
#> 3 Jean M_64 2021-08-27 20:21:59 3 CX_SERVICE 2021-08-29 20:21:59
#> 4 Karen M_14 2021-08-25 18:21:28 1 COMPLAIN 2021-08-26 18:21:28
#> 5 Keith M_57 2021-08-27 20:21:02 1 CX_SERVICE 2021-08-29 20:21:02
Created on 2021-04-10 by the reprex package (v0.3.0)
谢谢。我读到的要求是每次来电输出一行,但我认为上面的代码是每位客户返回一行。我的回答要冗长得多,但它会为每一次来电返回此前最近一次通话和最近一次购买的信息。这可能不是你想要的,但我还是写出来了 :-)
library(tidyverse)
library(lubridate)
# Simulated purchase log: 1000 rows with a random customer, product, agent
# and purchase timestamp (POSIXct, UTC).
df1_purchases <- local({
  n_rows <- 1000L
  # Every column is drawn with the same length so that nothing depends on
  # data.frame() silently recycling a shorter vector.
  draw <- function(values) sample(values, size = n_rows, replace = TRUE)

  data.frame(
    cust_name = paste0("Name_", draw(LETTERS)),
    product = paste0("Product_", draw(1:500)),
    agent = paste0("Agent_Name_", draw(LETTERS[1:5])),
    # Days stop at 28 so every year/month/day combination is a real date.
    # Hours are 0-23 and minutes/seconds 0-59: hour 24 is not a valid clock
    # hour, and starting the ranges at 1 would never produce a zero.
    purchase_datetime = ISOdatetime(
      year = draw(2000:2019),
      month = draw(1:12),
      day = draw(1:28),
      hour = draw(0:23),
      min = draw(0:59),
      sec = draw(0:59),
      tz = "UTC"
    )
  )
})
# Simulated call log: 1000 rows with a random customer, call type and call
# timestamp (POSIXct, UTC).
df2_calls <- local({
  n_rows <- 1000L
  # Every column is drawn with the same length. Drawing one component with a
  # different size (e.g. 10000) makes data.frame() recycle the other columns
  # and silently produces ten times as many rows as intended.
  draw <- function(values) sample(values, size = n_rows, replace = TRUE)

  data.frame(
    cust_name = paste0("Name_", draw(LETTERS)),
    type = paste0("calltype_", draw(c("Complaint", "Service"))),
    # Days stop at 28 so every year/month/day combination is a real date.
    # Hours are 0-23: hour 24 is not a valid clock hour.
    call_datetime = ISOdatetime(
      year = draw(2000:2019),
      month = draw(1:12),
      day = draw(1:28),
      hour = draw(0:23),
      min = draw(0:59),
      sec = draw(0:59),
      tz = "UTC"
    )
  )
})
# Start from an empty data frame (no rows, no columns); the per-call result
# rows are appended to it further down.
output_df <- data.frame()
# Describe what one customer had done before one particular call.
#
# Arguments:
#   df2_calls          data frame of calls with columns cust_name, type and
#                      call_datetime.
#   df1_purchases      data frame of purchases with columns cust_name,
#                      product and purchase_datetime.
#   cust               name of the customer who made the call.
#   this_call_datetime date-time of the call being described.
#
# Returns a one-row data frame with the columns cust_name, prev_calls (number
# of this customer's calls strictly before this_call_datetime),
# prev_purchase_item, prev_call_datetime, prev_purchase_datetime,
# this_call_datetime and prev_call_type. The prev_* detail columns are NA when
# the customer has no earlier call / purchase.
f <- function(df2_calls, df1_purchases, cust, this_call_datetime) {
  # Keep only THIS customer's events that happened strictly before the call.
  # which() drops NA comparisons, so rows with a missing name or timestamp
  # are never treated as earlier events.
  earlier_calls <- df2_calls[
    which(
      df2_calls$cust_name == cust &
        df2_calls$call_datetime < this_call_datetime
    ), ,
    drop = FALSE
  ]
  earlier_purchases <- df1_purchases[
    which(
      df1_purchases$cust_name == cust &
        df1_purchases$purchase_datetime < this_call_datetime
    ), ,
    drop = FALSE
  ]

  # Typed NA defaults (same class as the call timestamp) so that the column
  # types of the result do not depend on whether a previous event exists.
  na_datetime <- this_call_datetime[NA_integer_]
  prev_call_type <- NA_character_
  prev_call_datetime <- na_datetime
  prev_purchase_item <- NA_character_
  prev_purchase_datetime <- na_datetime

  prev_calls <- nrow(earlier_calls)
  if (prev_calls > 0) {
    latest_call <- which.max(earlier_calls$call_datetime)
    prev_call_type <- earlier_calls$type[latest_call]
    prev_call_datetime <- earlier_calls$call_datetime[latest_call]
  }

  if (nrow(earlier_purchases) > 0) {
    # The index is looked up in the same filtered table it was computed on.
    latest_purchase <- which.max(earlier_purchases$purchase_datetime)
    prev_purchase_item <- earlier_purchases$product[latest_purchase]
    prev_purchase_datetime <-
      earlier_purchases$purchase_datetime[latest_purchase]
  }

  data.frame(
    cust_name = cust,
    prev_calls = prev_calls,
    prev_purchase_item = prev_purchase_item,
    prev_call_datetime = prev_call_datetime,
    prev_purchase_datetime = prev_purchase_datetime,
    this_call_datetime = this_call_datetime,
    # Appended last so the positions of the existing columns do not change.
    prev_call_type = prev_call_type
  )
}
# Build one summary row per call and append them all to output_df in one go.
number_of_call_rows <- nrow(df2_calls)
# seq_len() yields an empty sequence for an empty call table, whereas
# 1:number_of_call_rows would iterate over c(1, 0).
call_summaries <- lapply(seq_len(number_of_call_rows), function(i) {
  f(
    df2_calls,
    df1_purchases,
    df2_calls$cust_name[i],
    df2_calls$call_datetime[i]
  )
})
# Binding once avoids re-copying the growing result on every iteration.
output_df <- bind_rows(output_df, call_summaries)
glimpse(output_df)
我需要对两个不同的数据集做一些数据整理(data wrangling),从而构建一张表,这对我来说是个挑战。
数据集A:有服装店的购买信息,变量为:客户姓名、购买日期、代理商和在时间段t内购买的产品。
# Purchase records as posted in the question: one row per purchase with the
# customer name, product, agent and purchase timestamp (month-day-year text).
# NOTE(review): every year is written as "20021" -- presumably a typo for
# 2021; confirm before parsing these strings with a "%Y" format.
df1 <- tibble::tribble(
~NAME, ~PRODUCT, ~AGENT, ~DATE_PURCHASE,
"Karen", "M_14", "X_1", "8-25-20021 18:21:28",
"Jean", "M_78", "X_3", "8-26-20021 18:11:06",
"Jean", "M_71", "X_4", "8-26-20021 18:21:01",
"Jean", "M_64", "X_4", "8-27-20021 20:21:59",
"Keith", "M_57", "X_4", "8-27-20021 20:21:02",
"Alba", "M_50", "X_1", "8-28-20021 20:21:03",
"Alba", "M_43", "X_3", "8-29-20021 20:21:04",
"Alex", "M_36", "X_2", "8-25-20021 20:21:05"
)
数据集B:包含在时间t内拨打公司CX SERVICE专线的客户信息,存储客户姓名、呼叫日期、呼叫类型等变量。
# Call records as posted in the question: one row per call with the customer
# name, call type and call timestamp (month-day-year text).
# NOTE(review): every year is written as "20021" -- presumably a typo for
# 2021; confirm before parsing these strings with a "%Y" format.
df2 <- tibble::tribble(
~NAME, ~TYPE, ~DATE_OF_CALL,
"Karen", "COMPLAIN", "8-26-20021 18:21:28",
"Jean", "CX_SERVICE", "8-27-20021 18:11:06",
"Jean", "COMPLAIN", "8-28-20021 18:21:01",
"Jean", "CX_SERVICE", "8-29-20021 20:21:59",
"Keith", "CX_SERVICE", "8-29-20021 20:21:02",
"Alba", "COMPLAIN", "8-30-20021 20:21:03",
"Alex", "CX_SERVICE", "8-25-20021 21:21:05",
)
我需要构建如下数据集:创建一个名为“x attempt”的新变量,用来表示这是该客户第几次拨打热线(第一次、第二次、第三次……),并为每位客户返回其最后一次来电之前购买的最后一件产品,以及该次来电的类型。我知道这听起来可能有些令人困惑,所以下面是所需表格的示例:
NAME | x attempt | product | TYPE | DATE_CALL | DATE_PURCHASE |
Jean | 3 | M_64 | CX_SERVICE | 8-29-20021 20:21:59 | 8-27-20021 20:21:59 |
这个结果是正确的,因为根据记录...这将是 Jean 的第三次通话,最后一次通话的类型是 CX_SERVICE,时间是 8-29-20021 20:21:59,她最后一次购买的产品是 M_64,时间是 8-27-20021 20:21:59。
我认为你可能把年份写错了,所以我删掉了年份(2021)中多余的零。我看到你在使用 tibble,因此我提供一个基于 tidyverse 的解决方法。
这段代码的思路是先分别处理两个 tibble,然后通过它们共有的键 NAME 将它们连接起来。
应该这样做:
library(dplyr)
# Purchase records with the year written as 2021: one row per purchase with
# the customer name, product, agent and purchase timestamp. The timestamps
# are month-day-year text, matching the "%m-%d-%Y %H:%M:%S" format.
df1 <- tibble::tribble(
~NAME, ~PRODUCT, ~AGENT, ~DATE_PURCHASE,
"Karen", "M_14", "X_1", "8-25-2021 18:21:28",
"Jean", "M_78", "X_3", "8-26-2021 18:11:06",
"Jean", "M_71", "X_4", "8-26-2021 18:21:01",
"Jean", "M_64", "X_4", "8-27-2021 20:21:59",
"Keith", "M_57", "X_4", "8-27-2021 20:21:02",
"Alba", "M_50", "X_1", "8-28-2021 20:21:03",
"Alba", "M_43", "X_3", "8-29-2021 20:21:04",
"Alex", "M_36", "X_2", "8-25-2021 20:21:05"
)
# Call records with the year written as 2021: one row per call with the
# customer name, call type and call timestamp. The timestamps are
# month-day-year text, matching the "%m-%d-%Y %H:%M:%S" format.
df2 <- tibble::tribble(
~NAME, ~TYPE, ~DATE_OF_CALL,
"Karen", "COMPLAIN", "8-26-2021 18:21:28",
"Jean", "CX_SERVICE", "8-27-2021 18:11:06",
"Jean", "COMPLAIN", "8-28-2021 18:21:01",
"Jean", "CX_SERVICE", "8-29-2021 20:21:59",
"Keith", "CX_SERVICE", "8-29-2021 20:21:02",
"Alba", "COMPLAIN", "8-30-2021 20:21:03",
"Alex", "CX_SERVICE", "8-25-2021 21:21:05",
)
# Per-customer summary of the most recent purchase: one row per NAME with the
# product bought last and the time it was bought. The outer parentheses print
# the result as well as assigning it.
(df1_mod <- df1 %>%
  mutate(
    DATE_PURCHASE = as.POSIXct(DATE_PURCHASE, format = "%m-%d-%Y %H:%M:%S")
  ) %>%
  group_by(NAME) %>%
  # with_ties = FALSE guarantees exactly one row per customer even when two
  # purchases share the same latest timestamp; indexing with
  # `DATE_PURCHASE == max(DATE_PURCHASE)` would return several products then.
  slice_max(DATE_PURCHASE, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(NAME, product = PRODUCT, DATE_PURCHASE))
#> # A tibble: 5 x 3
#> NAME product DATE_PURCHASE
#> <chr> <chr> <dttm>
#> 1 Alba M_43 2021-08-29 20:21:04
#> 2 Alex M_36 2021-08-25 20:21:05
#> 3 Jean M_64 2021-08-27 20:21:59
#> 4 Karen M_14 2021-08-25 18:21:28
#> 5 Keith M_57 2021-08-27 20:21:02
# Per-customer summary of calls: one row per NAME with the number of calls
# made (`x attempt`) and the type and time of the most recent call. The outer
# parentheses print the result as well as assigning it.
(df2_mod <- df2 %>%
  mutate(
    DATE_OF_CALL = as.POSIXct(DATE_OF_CALL, format = "%m-%d-%Y %H:%M:%S")
  ) %>%
  group_by(NAME) %>%
  # Count the calls while every row of the group is still present.
  mutate(`x attempt` = n()) %>%
  # with_ties = FALSE guarantees exactly one row per customer even when two
  # calls share the same latest timestamp; indexing with
  # `DATE_OF_CALL == max(DATE_OF_CALL)` would return several types then.
  slice_max(DATE_OF_CALL, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(NAME, `x attempt`, TYPE, DATE_OF_CALL))
#> # A tibble: 5 x 4
#> NAME `x attempt` TYPE DATE_OF_CALL
#> <chr> <int> <chr> <dttm>
#> 1 Alba 1 COMPLAIN 2021-08-30 20:21:03
#> 2 Alex 1 CX_SERVICE 2021-08-25 21:21:05
#> 3 Jean 3 CX_SERVICE 2021-08-29 20:21:59
#> 4 Karen 1 COMPLAIN 2021-08-26 18:21:28
#> 5 Keith 1 CX_SERVICE 2021-08-29 20:21:02
# Combine the per-customer purchase and call summaries on NAME. A left join
# keeps every customer present in df1_mod; a customer with no row in df2_mod
# gets NA in the call columns.
# NOTE(review): df1_mod holds each customer's latest purchase overall, which
# is not necessarily a purchase made before that customer's latest call.
left_join(df1_mod, df2_mod, by = "NAME")
#> # A tibble: 5 x 6
#> NAME product DATE_PURCHASE `x attempt` TYPE DATE_OF_CALL
#> <chr> <chr> <dttm> <int> <chr> <dttm>
#> 1 Alba M_43 2021-08-29 20:21:04 1 COMPLAIN 2021-08-30 20:21:03
#> 2 Alex M_36 2021-08-25 20:21:05 1 CX_SERVICE 2021-08-25 21:21:05
#> 3 Jean M_64 2021-08-27 20:21:59 3 CX_SERVICE 2021-08-29 20:21:59
#> 4 Karen M_14 2021-08-25 18:21:28 1 COMPLAIN 2021-08-26 18:21:28
#> 5 Keith M_57 2021-08-27 20:21:02 1 CX_SERVICE 2021-08-29 20:21:02
Created on 2021-04-10 by the reprex package (v0.3.0)
谢谢。我读到的要求是每次来电输出一行,但我认为上面的代码是每位客户返回一行。我的回答要冗长得多,但它会为每一次来电返回此前最近一次通话和最近一次购买的信息。这可能不是你想要的,但我还是写出来了 :-)
library(tidyverse)
library(lubridate)
# Simulated purchase log: 1000 rows with a random customer, product, agent
# and purchase timestamp (POSIXct, UTC).
df1_purchases <- local({
  n_rows <- 1000L
  # Every column is drawn with the same length so that nothing depends on
  # data.frame() silently recycling a shorter vector.
  draw <- function(values) sample(values, size = n_rows, replace = TRUE)

  data.frame(
    cust_name = paste0("Name_", draw(LETTERS)),
    product = paste0("Product_", draw(1:500)),
    agent = paste0("Agent_Name_", draw(LETTERS[1:5])),
    # Days stop at 28 so every year/month/day combination is a real date.
    # Hours are 0-23 and minutes/seconds 0-59: hour 24 is not a valid clock
    # hour, and starting the ranges at 1 would never produce a zero.
    purchase_datetime = ISOdatetime(
      year = draw(2000:2019),
      month = draw(1:12),
      day = draw(1:28),
      hour = draw(0:23),
      min = draw(0:59),
      sec = draw(0:59),
      tz = "UTC"
    )
  )
})
# Simulated call log: 1000 rows with a random customer, call type and call
# timestamp (POSIXct, UTC).
df2_calls <- local({
  n_rows <- 1000L
  # Every column is drawn with the same length. Drawing one component with a
  # different size (e.g. 10000) makes data.frame() recycle the other columns
  # and silently produces ten times as many rows as intended.
  draw <- function(values) sample(values, size = n_rows, replace = TRUE)

  data.frame(
    cust_name = paste0("Name_", draw(LETTERS)),
    type = paste0("calltype_", draw(c("Complaint", "Service"))),
    # Days stop at 28 so every year/month/day combination is a real date.
    # Hours are 0-23: hour 24 is not a valid clock hour.
    call_datetime = ISOdatetime(
      year = draw(2000:2019),
      month = draw(1:12),
      day = draw(1:28),
      hour = draw(0:23),
      min = draw(0:59),
      sec = draw(0:59),
      tz = "UTC"
    )
  )
})
# Start from an empty data frame (no rows, no columns); the per-call result
# rows are appended to it further down.
output_df <- data.frame()
# Describe what one customer had done before one particular call.
#
# Arguments:
#   df2_calls          data frame of calls with columns cust_name, type and
#                      call_datetime.
#   df1_purchases      data frame of purchases with columns cust_name,
#                      product and purchase_datetime.
#   cust               name of the customer who made the call.
#   this_call_datetime date-time of the call being described.
#
# Returns a one-row data frame with the columns cust_name, prev_calls (number
# of this customer's calls strictly before this_call_datetime),
# prev_purchase_item, prev_call_datetime, prev_purchase_datetime,
# this_call_datetime and prev_call_type. The prev_* detail columns are NA when
# the customer has no earlier call / purchase.
f <- function(df2_calls, df1_purchases, cust, this_call_datetime) {
  # Keep only THIS customer's events that happened strictly before the call.
  # which() drops NA comparisons, so rows with a missing name or timestamp
  # are never treated as earlier events.
  earlier_calls <- df2_calls[
    which(
      df2_calls$cust_name == cust &
        df2_calls$call_datetime < this_call_datetime
    ), ,
    drop = FALSE
  ]
  earlier_purchases <- df1_purchases[
    which(
      df1_purchases$cust_name == cust &
        df1_purchases$purchase_datetime < this_call_datetime
    ), ,
    drop = FALSE
  ]

  # Typed NA defaults (same class as the call timestamp) so that the column
  # types of the result do not depend on whether a previous event exists.
  na_datetime <- this_call_datetime[NA_integer_]
  prev_call_type <- NA_character_
  prev_call_datetime <- na_datetime
  prev_purchase_item <- NA_character_
  prev_purchase_datetime <- na_datetime

  prev_calls <- nrow(earlier_calls)
  if (prev_calls > 0) {
    latest_call <- which.max(earlier_calls$call_datetime)
    prev_call_type <- earlier_calls$type[latest_call]
    prev_call_datetime <- earlier_calls$call_datetime[latest_call]
  }

  if (nrow(earlier_purchases) > 0) {
    # The index is looked up in the same filtered table it was computed on.
    latest_purchase <- which.max(earlier_purchases$purchase_datetime)
    prev_purchase_item <- earlier_purchases$product[latest_purchase]
    prev_purchase_datetime <-
      earlier_purchases$purchase_datetime[latest_purchase]
  }

  data.frame(
    cust_name = cust,
    prev_calls = prev_calls,
    prev_purchase_item = prev_purchase_item,
    prev_call_datetime = prev_call_datetime,
    prev_purchase_datetime = prev_purchase_datetime,
    this_call_datetime = this_call_datetime,
    # Appended last so the positions of the existing columns do not change.
    prev_call_type = prev_call_type
  )
}
# Build one summary row per call and append them all to output_df in one go.
number_of_call_rows <- nrow(df2_calls)
# seq_len() yields an empty sequence for an empty call table, whereas
# 1:number_of_call_rows would iterate over c(1, 0).
call_summaries <- lapply(seq_len(number_of_call_rows), function(i) {
  f(
    df2_calls,
    df1_purchases,
    df2_calls$cust_name[i],
    df2_calls$call_datetime[i]
  )
})
# Binding once avoids re-copying the growing result on every iteration.
output_df <- bind_rows(output_df, call_summaries)
glimpse(output_df)