R:计算特定事件之间的时间差
R: calculate time difference between specific events
我有以下数据集:
df = data.frame(cbind(user_id = c(rep(1, 4), rep(2,4)),
complete_order = c(rep(c(1,0,0,1), 2)),
order_date = c('2015-01-28', '2015-01-31', '2015-02-08', '2015-02-23', '2015-01-25', '2015-01-28', '2015-02-06', '2015-02-21')))
library(lubridate)
df$order_date = as_date(df$order_date)
user_id complete_order order_date
1 1 2015-01-28
1 0 2015-01-31
1 0 2015-02-08
1 1 2015-02-23
2 1 2015-01-25
2 0 2015-01-28
2 0 2015-02-06
2 1 2015-02-21
我正在尝试计算每个用户完成的订单之间的天数差异。理想的结果如下所示:
user_id complete_order order_date complete_order_time_diff
<fctr> <fctr> <date> <time>
1 1 2015-01-28 NA days
1 0 2015-01-31 3 days
1 0 2015-02-08 11 days
1 1 2015-02-23 26 days
2 1 2015-01-25 NA days
2 0 2015-01-28 3 days
2 0 2015-02-06 12 days
2 1 2015-02-21 27 days
当我尝试这个解决方案时:
library(dplyr)
df %>%
group_by(user_id) %>%
mutate(complete_order_time_diff = order_date[complete_order==1]-lag(order_date[complete_order==1))
它 returns 错误:
Error: incompatible size (3), expecting 4 (the group size) or 1
任何帮助都将非常有用,谢谢!
我想你可以添加一个 filter
函数来代替 order_date[complete_order == 1]
的子集,并确保 order_date
(和其他变量)是正确的数据类型,方法是添加 stringsAsFactors = F
到 data.frame()
):
df = data.frame(cbind(user_id = c(rep(1, 4), rep(2,4)),
complete_order = c(rep(c(1,1,0,1), 2)),
order_date = c('2015-01-28', '2015-01-31', '2015-02-08', '2015-02-23', '2015-01-25', '2015-01-28', '2015-02-06', '2015-02-21')),
stringsAsFactors = F)
df$order_date <- lubridate::ymd(df$order_date)
df %>%
group_by(user_id) %>%
filter(complete_order == 1) %>%
mutate(complete_order_time_diff = order_date - lag(order_date))
这个 returns 到下一个完整订单的时间(如果没有,则 NA
):
user_id complete_order order_date complete_order_time_diff
<chr> <chr> <date> <time>
1 1 1 2015-01-28 NA days
2 1 1 2015-01-31 3 days
3 1 1 2015-02-23 23 days
4 2 1 2015-01-25 NA days
5 2 1 2015-01-28 3 days
6 2 1 2015-02-21 24 days
试试这个
library(dplyr)
df %>% group_by(user_id, complete_order) %>%
mutate(c1 = order_date - lag(order_date)) %>%
group_by(user_id) %>% mutate(c2 = order_date - lag(order_date)) %>% ungroup %>%
mutate(complete_order_time_diff = ifelse(complete_order==0, c2, c1)) %>%
select(-c(c1, c2))
更新
对于多个取消的订单
df %>% mutate(c3=cumsum( complete_order != "0")) %>% group_by(user_id, complete_order) %>%
mutate(c1 = order_date - lag(order_date)) %>%
group_by(user_id) %>% mutate(c2 = order_date - lag(order_date)) %>%
mutate(c2=as.numeric(c2)) %>% group_by(user_id, c3) %>%
mutate(c2=cumsum(ifelse(complete_order==1, 0, c2))) %>% ungroup %>%
mutate(complete_order_time_diff = ifelse(complete_order==0, c2, c1)) %>%
select(-c(c1, c2, c3))
逻辑
c3
是一个id
,每次有命令(即complete_order not 0
)加1。
c1
计算日差bu user_id
(但对于非完整订单结果是错误的)
c2
修复了 c1
关于非完整订单的不一致问题。
希望这能解决问题。
我建议您使用 group_by()
和 mutate(cumsum())
的组合来更好地理解具有多个分组变量的结果。
您似乎在寻找每个订单与最后一个完成订单的距离。有一个二元向量,x
,c(NA, cummax(x * seq_along(x))[-length(x)])
给出了每个元素之前看到的最后一个“1”的索引。然后,从相应索引处的 "order_date" 中减去 "order_date" 的每个元素即可得到所需的输出。例如
set.seed(1453); x = sample(0:1, 10, TRUE)
set.seed(1821); y = sample(5, 10, TRUE)
cbind(x, y,
last_x = c(NA, cummax(x * seq_along(x))[-length(x)]),
y_diff = y - y[c(NA, cummax(x * seq_along(x))[-length(x)])])
# x y last_x y_diff
# [1,] 1 3 NA NA
# [2,] 0 3 1 0
# [3,] 1 5 1 2
# [4,] 0 1 3 -4
# [5,] 0 3 3 -2
# [6,] 1 5 3 0
# [7,] 1 1 6 -4
# [8,] 0 3 7 2
# [9,] 0 4 7 3
#[10,] 1 5 7 4
关于你的数据,为了方便起见,先格式化df
:
df$order_date = as.Date(df$order_date)
df$complete_order = df$complete_order == "1" # lose the 'factor'
然后,在 group_by
:
之后应用上述方法
library(dplyr)
df %>% group_by(user_id) %>%
mutate(time_diff = order_date -
order_date[c(NA, cummax(complete_order * seq_along(complete_order))[-length(complete_order)])])
,或者,在考虑 "user_id" 变化的索引后,尝试避免分组的操作(假设有序 "user_id"):
# save variables to vectors and keep a "logical" of when "id" changes
id = df$user_id
id_change = c(TRUE, id[-1] != id[-length(id)])
compl = df$complete_order
dord = df$order_date
# accounting for changes in "id", locate last completed order
i = c(NA, cummax((compl | id_change) * seq_along(compl))[-length(compl)])
is.na(i) = id_change
dord - dord[i]
#Time differences in days
#[1] NA 3 11 26 NA 3 12 27
我有以下数据集:
df = data.frame(cbind(user_id = c(rep(1, 4), rep(2,4)),
complete_order = c(rep(c(1,0,0,1), 2)),
order_date = c('2015-01-28', '2015-01-31', '2015-02-08', '2015-02-23', '2015-01-25', '2015-01-28', '2015-02-06', '2015-02-21')))
library(lubridate)
df$order_date = as_date(df$order_date)
user_id complete_order order_date
1 1 2015-01-28
1 0 2015-01-31
1 0 2015-02-08
1 1 2015-02-23
2 1 2015-01-25
2 0 2015-01-28
2 0 2015-02-06
2 1 2015-02-21
我正在尝试计算每个用户完成的订单之间的天数差异。理想的结果如下所示:
user_id complete_order order_date complete_order_time_diff
<fctr> <fctr> <date> <time>
1 1 2015-01-28 NA days
1 0 2015-01-31 3 days
1 0 2015-02-08 11 days
1 1 2015-02-23 26 days
2 1 2015-01-25 NA days
2 0 2015-01-28 3 days
2 0 2015-02-06 12 days
2 1 2015-02-21 27 days
当我尝试这个解决方案时:
library(dplyr)
df %>%
group_by(user_id) %>%
mutate(complete_order_time_diff = order_date[complete_order==1]-lag(order_date[complete_order==1))
它 returns 错误:
Error: incompatible size (3), expecting 4 (the group size) or 1
任何帮助都将非常有用,谢谢!
我想你可以添加一个 filter
函数来代替 order_date[complete_order == 1]
的子集,并确保 order_date
(和其他变量)是正确的数据类型,方法是添加 stringsAsFactors = F
到 data.frame()
):
df = data.frame(cbind(user_id = c(rep(1, 4), rep(2,4)),
complete_order = c(rep(c(1,1,0,1), 2)),
order_date = c('2015-01-28', '2015-01-31', '2015-02-08', '2015-02-23', '2015-01-25', '2015-01-28', '2015-02-06', '2015-02-21')),
stringsAsFactors = F)
df$order_date <- lubridate::ymd(df$order_date)
df %>%
group_by(user_id) %>%
filter(complete_order == 1) %>%
mutate(complete_order_time_diff = order_date - lag(order_date))
这个 returns 到下一个完整订单的时间(如果没有,则 NA
):
user_id complete_order order_date complete_order_time_diff
<chr> <chr> <date> <time>
1 1 1 2015-01-28 NA days
2 1 1 2015-01-31 3 days
3 1 1 2015-02-23 23 days
4 2 1 2015-01-25 NA days
5 2 1 2015-01-28 3 days
6 2 1 2015-02-21 24 days
试试这个
library(dplyr)
df %>% group_by(user_id, complete_order) %>%
mutate(c1 = order_date - lag(order_date)) %>%
group_by(user_id) %>% mutate(c2 = order_date - lag(order_date)) %>% ungroup %>%
mutate(complete_order_time_diff = ifelse(complete_order==0, c2, c1)) %>%
select(-c(c1, c2))
更新
对于多个取消的订单
df %>% mutate(c3=cumsum( complete_order != "0")) %>% group_by(user_id, complete_order) %>%
mutate(c1 = order_date - lag(order_date)) %>%
group_by(user_id) %>% mutate(c2 = order_date - lag(order_date)) %>%
mutate(c2=as.numeric(c2)) %>% group_by(user_id, c3) %>%
mutate(c2=cumsum(ifelse(complete_order==1, 0, c2))) %>% ungroup %>%
mutate(complete_order_time_diff = ifelse(complete_order==0, c2, c1)) %>%
select(-c(c1, c2, c3))
逻辑
c3
是一个id
,每次有命令(即complete_order not 0
)加1。
c1
计算日差bu user_id
(但对于非完整订单结果是错误的)
c2
修复了 c1
关于非完整订单的不一致问题。
希望这能解决问题。
我建议您使用 group_by()
和 mutate(cumsum())
的组合来更好地理解具有多个分组变量的结果。
您似乎在寻找每个订单与最后一个完成订单的距离。有一个二元向量,x
,c(NA, cummax(x * seq_along(x))[-length(x)])
给出了每个元素之前看到的最后一个“1”的索引。然后,从相应索引处的 "order_date" 中减去 "order_date" 的每个元素即可得到所需的输出。例如
set.seed(1453); x = sample(0:1, 10, TRUE)
set.seed(1821); y = sample(5, 10, TRUE)
cbind(x, y,
last_x = c(NA, cummax(x * seq_along(x))[-length(x)]),
y_diff = y - y[c(NA, cummax(x * seq_along(x))[-length(x)])])
# x y last_x y_diff
# [1,] 1 3 NA NA
# [2,] 0 3 1 0
# [3,] 1 5 1 2
# [4,] 0 1 3 -4
# [5,] 0 3 3 -2
# [6,] 1 5 3 0
# [7,] 1 1 6 -4
# [8,] 0 3 7 2
# [9,] 0 4 7 3
#[10,] 1 5 7 4
关于你的数据,为了方便起见,先格式化df
:
df$order_date = as.Date(df$order_date)
df$complete_order = df$complete_order == "1" # lose the 'factor'
然后,在 group_by
:
library(dplyr)
df %>% group_by(user_id) %>%
mutate(time_diff = order_date -
order_date[c(NA, cummax(complete_order * seq_along(complete_order))[-length(complete_order)])])
,或者,在考虑 "user_id" 变化的索引后,尝试避免分组的操作(假设有序 "user_id"):
# save variables to vectors and keep a "logical" of when "id" changes
id = df$user_id
id_change = c(TRUE, id[-1] != id[-length(id)])
compl = df$complete_order
dord = df$order_date
# accounting for changes in "id", locate last completed order
i = c(NA, cummax((compl | id_change) * seq_along(compl))[-length(compl)])
is.na(i) = id_change
dord - dord[i]
#Time differences in days
#[1] NA 3 11 26 NA 3 12 27