计算购买之间的时间间隔(带 NA 的日期时间变量)
calculating time gap between purchases (datetime variable with NAs)
df
是一个数据框,显示不同客户的访问和购买时间。
# Example data: 11 site visits by two customers. `purchase` holds the purchase
# timestamp as text, with the literal string "NA" for visits without a purchase.
id <- rep(c(1, 2), times = c(6, 5))
visit <- rep("yes", times = length(id))
purchase <- c(
  "2015-04-27 13:57:06", "2015-04-27 13:59:19", "2015-04-27 14:03:35",
  "NA", "NA", "2015-04-27 16:59:42",
  "2015-05-18 17:01:09", "2015-05-18 17:03:40", "2015-05-18 17:04:00",
  "NA", "NA"
)
df <- data.frame(id = id, visit = visit, purchase = purchase)
当 purchase
变量为 NA
时,表示客户已访问该网站但未进行购买。
现在我需要创建一个名为 time.gap
的新变量来计算每个客户购买之间的时间间隔,如下所示:
id visit purchase time.gap
1 1 yes 2015-04-27 13:57:06 NA
2 1 yes 2015-04-27 13:59:19 133
3 1 yes 2015-04-27 14:03:35 256
4 1 yes NA NA
5 1 yes NA NA
6 1 yes 2015-04-27 16:59:42 10567
7 2 yes 2015-05-18 17:01:09 NA
8 2 yes 2015-05-18 17:03:40 151
9 2 yes 2015-05-18 17:04:00 20
10 2 yes NA NA
11 2 yes NA NA
感谢您的帮助
要计算每个用户 ID 的时间间隔,还需要额外一步:按 ID 分组。
(已编辑:补上了之前遗漏的日期转换。)
下面是一种使用 dplyr、lubridate 和 zoo 的非常简洁的方法:
# libraries
library(dplyr)
library(zoo)
library(lubridate)
# the data
id <- c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2)
visit <- rep("yes", 11)
purchase <- c(
  "2015-04-27 13:57:06", "2015-04-27 13:59:19", "2015-04-27 14:03:35",
  "NA", "NA", "2015-04-27 16:59:42",
  "2015-05-18 17:01:09", "2015-05-18 17:03:40", "2015-05-18 17:04:00",
  "NA", "NA"
)
df <- data.frame(id, visit, purchase, stringsAsFactors = FALSE)
# Turn the literal "NA" strings into real missing values so the parser is
# handed NA instead of unparseable text.
df$purchase[df$purchase == "NA"] <- NA
df$purchase <- lubridate::ymd_hms(df$purchase)
# helper column: carry the last purchase time forward *within each customer*.
# na.rm = FALSE keeps leading NAs in place, so the result always has one value
# per row and a customer never inherits the previous customer's purchase.
df <- df %>%
  group_by(id) %>%
  mutate(purch = zoo::na.locf(purchase, na.rm = FALSE)) %>%
  ungroup() %>%
  as.data.frame()
df
#> df
# id visit purchase purch
#1 1 yes 2015-04-27 13:57:06 2015-04-27 13:57:06
#2 1 yes 2015-04-27 13:59:19 2015-04-27 13:59:19
#3 1 yes 2015-04-27 14:03:35 2015-04-27 14:03:35
#4 1 yes <NA> 2015-04-27 14:03:35
#5 1 yes <NA> 2015-04-27 14:03:35
#6 1 yes 2015-04-27 16:59:42 2015-04-27 16:59:42
#7 2 yes 2015-05-18 17:01:09 2015-05-18 17:01:09
#8 2 yes 2015-05-18 17:03:40 2015-05-18 17:03:40
#9 2 yes 2015-05-18 17:04:00 2015-05-18 17:04:00
#10 2 yes <NA> 2015-05-18 17:04:00
#11 2 yes <NA> 2015-05-18 17:04:00
# run it
# Per customer, the gap is the time since the previous (carried-forward)
# purchase, always expressed in seconds. difftime(units = "secs") is used
# instead of diff() so the unit cannot silently switch to minutes or hours.
# Visits without a purchase get NA rather than 0, as the question requests.
df %>%
  group_by(id) %>%
  mutate(
    dif = as.numeric(difftime(purch, dplyr::lag(purch), units = "secs")),
    dif = replace(dif, is.na(purchase), NA_real_)
  ) %>%
  ungroup() %>%
  select(-purch)
# Result (11 rows x 4 columns):
#
# id visit purchase dif
#1 1 yes 2015-04-27 13:57:06 NA
#2 1 yes 2015-04-27 13:59:19 133
#3 1 yes 2015-04-27 14:03:35 256
#4 1 yes <NA> NA
#5 1 yes <NA> NA
#6 1 yes 2015-04-27 16:59:42 10567
#7 2 yes 2015-05-18 17:01:09 NA
#8 2 yes 2015-05-18 17:03:40 151
#9 2 yes 2015-05-18 17:04:00 20
#10 2 yes <NA> NA
#11 2 yes <NA> NA
或者一步完成:
# One-shot version: parse, fill forward per customer, and compute the gap in
# seconds. `parsed` and `purch` are temporary helper columns.
df %>%
  # Parse once, before grouping; refer to the column directly rather than
  # reaching back to df$purchase from inside the pipe.
  mutate(parsed = lubridate::ymd_hms(purchase)) %>%
  group_by(id) %>%
  mutate(
    # na.rm = FALSE keeps leading NAs so the length matches the group.
    purch = zoo::na.locf(parsed, na.rm = FALSE),
    dif = as.numeric(difftime(purch, dplyr::lag(purch), units = "secs")),
    # Visits without a purchase have no gap of their own.
    dif = replace(dif, is.na(parsed), NA_real_)
  ) %>%
  ungroup() %>%
  select(-parsed, -purch)
# packages [1] lubridate_1.3.3 zoo_1.7-12 dplyr_0.4.2
# strptime() returns POSIXlt, a list-based class that does not belong in a
# data frame column; convert to POSIXct before handing the data to dplyr.
df$purchase <- as.POSIXct(strptime(df$purchase, "%Y-%m-%d %H:%M:%S"))
df1 <- df
library(dplyr)
# Compute gaps (in seconds) between consecutive purchases per customer on the
# purchase-only rows, then join back onto the full data so that visits without
# a purchase reappear with time.gap = NA.
df %>%
  filter(!is.na(purchase)) %>%
  group_by(id) %>%
  mutate(time.gap = c(NA, difftime(purchase[-1],
                                   purchase[-length(purchase)],
                                   units = "secs"))) %>%
  ungroup() %>%
  # Join on every original column, stated explicitly instead of relying on the
  # implicit natural join. NOTE: exact duplicate rows would be multiplied here.
  left_join(df1, ., by = names(df1))
# id visit purchase time.gap
# 1 1 yes 2015-04-27 13:57:06 NA
# 2 1 yes 2015-04-27 13:59:19 133
# 3 1 yes 2015-04-27 14:03:35 256
# 4 1 yes <NA> NA
# 5 1 yes <NA> NA
# 6 1 yes 2015-04-27 16:59:42 10567
# 7 2 yes 2015-05-18 17:01:09 NA
# 8 2 yes 2015-05-18 17:03:40 151
# 9 2 yes 2015-05-18 17:04:00 20
# 10 2 yes <NA> NA
# 11 2 yes <NA> NA
这是一种加入了合并(join)步骤的 dplyr 方法。这里使用 difftime 而不是 diff,因为它可以通过 units 参数指定时间单位。
数据
# Example data with `purchase` stored as POSIXct; visits without a purchase
# are real NA values rather than the string "NA".
id <- rep(c(1, 2), times = c(6, 5))
visit <- rep("yes", times = length(id))
purchase <- c(
  "2015-04-27 13:57:06", "2015-04-27 13:59:19", "2015-04-27 14:03:35",
  "NA", "NA", "2015-04-27 16:59:42",
  "2015-05-18 17:01:09", "2015-05-18 17:03:40", "2015-05-18 17:04:00",
  "NA", "NA"
)
df <- data.frame(id = id, visit = visit, purchase = purchase)
# Mark the placeholder strings as missing, then parse the timestamps.
df$purchase[df$purchase == "NA"] <- NA
df$purchase <- as.POSIXct(df$purchase)
疑难解答
如果您在真实数据上运行时遇到问题,可以按 id 逐个测试,找出问题的来源:
# Run the gap computation one customer at a time so that, if an error is
# thrown, `test` still holds the results of every id processed before it.
s <- split(df, df$id)
test <- list()
# Iterate over the ids by name: this is safe when `s` is empty (1:length(s)
# would yield 1, 0) and labels each result with the id it belongs to.
for (id_key in names(s)) {
  s1 <- s[[id_key]]
  test[[id_key]] <- s1 %>%
    filter(!is.na(purchase)) %>%
    group_by(id) %>%
    mutate(time.gap = c(NA, difftime(purchase[-1],
                                     purchase[-length(purchase)],
                                     units = "secs"))) %>%
    ungroup() %>%
    # Explicit join keys: every column of the per-id subset.
    left_join(s1, ., by = names(s1))
}
现在 test 是一个列表,保存了循环每次迭代的结果。如果抛出错误,我们就能知道错误发生在哪里,因为此前所有成功运行的结果都已保存。例如,如果出现错误后查看 test,发现其中包含直到 id 3 的所有 id,就说明是 id 4 导致了错误。
df
是一个数据框,显示不同客户的访问和购买时间。
# Example data: 11 site visits by two customers. `purchase` holds the purchase
# timestamp as text, with the literal string "NA" for visits without a purchase.
id <- rep(c(1, 2), times = c(6, 5))
visit <- rep("yes", times = length(id))
purchase <- c(
  "2015-04-27 13:57:06", "2015-04-27 13:59:19", "2015-04-27 14:03:35",
  "NA", "NA", "2015-04-27 16:59:42",
  "2015-05-18 17:01:09", "2015-05-18 17:03:40", "2015-05-18 17:04:00",
  "NA", "NA"
)
df <- data.frame(id = id, visit = visit, purchase = purchase)
当 purchase
变量为 NA
时,表示客户已访问该网站但未进行购买。
现在我需要创建一个名为 time.gap
的新变量来计算每个客户购买之间的时间间隔,如下所示:
id visit purchase time.gap
1 1 yes 2015-04-27 13:57:06 NA
2 1 yes 2015-04-27 13:59:19 133
3 1 yes 2015-04-27 14:03:35 256
4 1 yes NA NA
5 1 yes NA NA
6 1 yes 2015-04-27 16:59:42 10567
7 2 yes 2015-05-18 17:01:09 NA
8 2 yes 2015-05-18 17:03:40 151
9 2 yes 2015-05-18 17:04:00 20
10 2 yes NA NA
11 2 yes NA NA
感谢您的帮助
要计算每个用户 ID 的时间间隔,还需要额外一步:按 ID 分组。
(已编辑:补上了之前遗漏的日期转换。)
下面使用 dplyr、lubridate 和 zoo:
# libraries
library(dplyr)
library(zoo)
library(lubridate)
# the data
id <- c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2)
visit <- rep("yes", 11)
purchase <- c(
  "2015-04-27 13:57:06", "2015-04-27 13:59:19", "2015-04-27 14:03:35",
  "NA", "NA", "2015-04-27 16:59:42",
  "2015-05-18 17:01:09", "2015-05-18 17:03:40", "2015-05-18 17:04:00",
  "NA", "NA"
)
df <- data.frame(id, visit, purchase, stringsAsFactors = FALSE)
# Turn the literal "NA" strings into real missing values so the parser is
# handed NA instead of unparseable text.
df$purchase[df$purchase == "NA"] <- NA
df$purchase <- lubridate::ymd_hms(df$purchase)
# helper column: carry the last purchase time forward *within each customer*.
# na.rm = FALSE keeps leading NAs in place, so the result always has one value
# per row and a customer never inherits the previous customer's purchase.
df <- df %>%
  group_by(id) %>%
  mutate(purch = zoo::na.locf(purchase, na.rm = FALSE)) %>%
  ungroup() %>%
  as.data.frame()
df
#> df
# id visit purchase purch
#1 1 yes 2015-04-27 13:57:06 2015-04-27 13:57:06
#2 1 yes 2015-04-27 13:59:19 2015-04-27 13:59:19
#3 1 yes 2015-04-27 14:03:35 2015-04-27 14:03:35
#4 1 yes <NA> 2015-04-27 14:03:35
#5 1 yes <NA> 2015-04-27 14:03:35
#6 1 yes 2015-04-27 16:59:42 2015-04-27 16:59:42
#7 2 yes 2015-05-18 17:01:09 2015-05-18 17:01:09
#8 2 yes 2015-05-18 17:03:40 2015-05-18 17:03:40
#9 2 yes 2015-05-18 17:04:00 2015-05-18 17:04:00
#10 2 yes <NA> 2015-05-18 17:04:00
#11 2 yes <NA> 2015-05-18 17:04:00
# run it
# Per customer, the gap is the time since the previous (carried-forward)
# purchase, always expressed in seconds. difftime(units = "secs") is used
# instead of diff() so the unit cannot silently switch to minutes or hours.
# Visits without a purchase get NA rather than 0, as the question requests.
df %>%
  group_by(id) %>%
  mutate(
    dif = as.numeric(difftime(purch, dplyr::lag(purch), units = "secs")),
    dif = replace(dif, is.na(purchase), NA_real_)
  ) %>%
  ungroup() %>%
  select(-purch)
# Result (11 rows x 4 columns):
#
# id visit purchase dif
#1 1 yes 2015-04-27 13:57:06 NA
#2 1 yes 2015-04-27 13:59:19 133
#3 1 yes 2015-04-27 14:03:35 256
#4 1 yes <NA> NA
#5 1 yes <NA> NA
#6 1 yes 2015-04-27 16:59:42 10567
#7 2 yes 2015-05-18 17:01:09 NA
#8 2 yes 2015-05-18 17:03:40 151
#9 2 yes 2015-05-18 17:04:00 20
#10 2 yes <NA> NA
#11 2 yes <NA> NA
或者一步完成:
# One-shot version: parse, fill forward per customer, and compute the gap in
# seconds. `parsed` and `purch` are temporary helper columns.
df %>%
  # Parse once, before grouping; refer to the column directly rather than
  # reaching back to df$purchase from inside the pipe.
  mutate(parsed = lubridate::ymd_hms(purchase)) %>%
  group_by(id) %>%
  mutate(
    # na.rm = FALSE keeps leading NAs so the length matches the group.
    purch = zoo::na.locf(parsed, na.rm = FALSE),
    dif = as.numeric(difftime(purch, dplyr::lag(purch), units = "secs")),
    # Visits without a purchase have no gap of their own.
    dif = replace(dif, is.na(parsed), NA_real_)
  ) %>%
  ungroup() %>%
  select(-parsed, -purch)
# packages [1] lubridate_1.3.3 zoo_1.7-12 dplyr_0.4.2
# strptime() returns POSIXlt, a list-based class that does not belong in a
# data frame column; convert to POSIXct before handing the data to dplyr.
df$purchase <- as.POSIXct(strptime(df$purchase, "%Y-%m-%d %H:%M:%S"))
df1 <- df
library(dplyr)
# Compute gaps (in seconds) between consecutive purchases per customer on the
# purchase-only rows, then join back onto the full data so that visits without
# a purchase reappear with time.gap = NA.
df %>%
  filter(!is.na(purchase)) %>%
  group_by(id) %>%
  mutate(time.gap = c(NA, difftime(purchase[-1],
                                   purchase[-length(purchase)],
                                   units = "secs"))) %>%
  ungroup() %>%
  # Join on every original column, stated explicitly instead of relying on the
  # implicit natural join. NOTE: exact duplicate rows would be multiplied here.
  left_join(df1, ., by = names(df1))
# id visit purchase time.gap
# 1 1 yes 2015-04-27 13:57:06 NA
# 2 1 yes 2015-04-27 13:59:19 133
# 3 1 yes 2015-04-27 14:03:35 256
# 4 1 yes <NA> NA
# 5 1 yes <NA> NA
# 6 1 yes 2015-04-27 16:59:42 10567
# 7 2 yes 2015-05-18 17:01:09 NA
# 8 2 yes 2015-05-18 17:03:40 151
# 9 2 yes 2015-05-18 17:04:00 20
# 10 2 yes <NA> NA
# 11 2 yes <NA> NA
这是一种加入了合并(join)步骤的 dplyr 方法。这里使用 difftime 而不是 diff,因为它可以通过 units 参数指定时间单位。
数据
# Example data with `purchase` stored as POSIXct; visits without a purchase
# are real NA values rather than the string "NA".
id <- rep(c(1, 2), times = c(6, 5))
visit <- rep("yes", times = length(id))
purchase <- c(
  "2015-04-27 13:57:06", "2015-04-27 13:59:19", "2015-04-27 14:03:35",
  "NA", "NA", "2015-04-27 16:59:42",
  "2015-05-18 17:01:09", "2015-05-18 17:03:40", "2015-05-18 17:04:00",
  "NA", "NA"
)
df <- data.frame(id = id, visit = visit, purchase = purchase)
# Mark the placeholder strings as missing, then parse the timestamps.
df$purchase[df$purchase == "NA"] <- NA
df$purchase <- as.POSIXct(df$purchase)
疑难解答
如果您在真实数据上运行时遇到问题,可以按 id 逐个测试,找出问题的来源:
# Run the gap computation one customer at a time so that, if an error is
# thrown, `test` still holds the results of every id processed before it.
s <- split(df, df$id)
test <- list()
# Iterate over the ids by name: this is safe when `s` is empty (1:length(s)
# would yield 1, 0) and labels each result with the id it belongs to.
for (id_key in names(s)) {
  s1 <- s[[id_key]]
  test[[id_key]] <- s1 %>%
    filter(!is.na(purchase)) %>%
    group_by(id) %>%
    mutate(time.gap = c(NA, difftime(purchase[-1],
                                     purchase[-length(purchase)],
                                     units = "secs"))) %>%
    ungroup() %>%
    # Explicit join keys: every column of the per-id subset.
    left_join(s1, ., by = names(s1))
}
现在 test 是一个列表,保存了循环每次迭代的结果。如果抛出错误,我们就能知道错误发生在哪里,因为此前所有成功运行的结果都已保存。例如,如果出现错误后查看 test,发现其中包含直到 id 3 的所有 id,就说明是 id 4 导致了错误。