如何根据日期计算频率
How to count frequency based on date
我想知道是否有人可以帮助我解决以下问题:
我有两个数据集:
(1) 包含一个 id 和一个 order_date
(2) 第二个包含相同的 id 和 delivery_dates 的电子邮件
我想统计一个人在 order_date 之前收到的电子邮件数量。但是,我无法做到这一点。当我合并两个数据文件时,order_date 与交付日期相结合,这不是我想要的。另外,我不想计算一个人的所有交货日期,因为它需要依赖于时间。
希望有人能帮助我!!
示例数据集 1:
id. order_date age
xx3 2014/07/04 72
xx3 2014/10/08 72
xx3 2014/11/12 72
xx7 2014/05/02 34
xx7 2014/07/09 34
xx9 2014/12/22 55
示例数据集 2:
id. delivery_date
xx3 2014/07/02
xx3 2014/08/10
xx3 2014/11/02
xx3 2014/07/02
xx3 2014/12/02
xx3 2014/12/11
xx7 2014/07/05
我想要什么:
id. frequency_received order_date
xx3 1 2014/07/04
xx3 3 2014/10/08
日期采用 YYYYMMDD 格式。
一个可能的解决方案是使用data.table
-包中的foverlaps
-函数:
library(data.table)
# convert the 'data.frame's to 'data.table's with setDT()
setDT(ds1)
setDT(ds2)
# create a reference dataset with the minimum dates for each id
md <- ds2[, min(delivery_date), id
][ds1[, min(order_date), id], on = 'id'
][is.na(V1), V1 := i.V1
][, mindate := pmin(V1, i.V1)
][, .(id, mindate)]
# create a start date for the time window in which the emails should have been sent
ds1[, bdate := shift(order_date, fill = min(md$mindate[match(id,md$id)])-1), by = id]
# create 2nd deliverydate needed for the foverlaps function
ds2[, delivery_date2 := delivery_date]
# set the keys for each 'data.table'
setkey(ds1, id, bdate, order_date)
setkey(ds2, id, delivery_date, delivery_date2)
# perform the overlap join & calculate the number of recieved emails (freq)
foverlaps(ds1, ds2, nomatch = 0)[, .(freq = .N), by = .(id, order_date)][, freq := cumsum(freq), by = id][]
给出:
id order_date freq
1: xx3 2014-07-04 2
2: xx3 2014-10-08 3
3: xx3 2014-11-12 4
4: xx7 2014-07-09 1
已用数据:
ds1 <- structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 3L), .Label = c("xx3", "xx7", "xx9"), class = "factor"),
order_date = structure(c(16255, 16351, 16386, 16192, 16260, 16426), class = "Date"),
age = c(72L, 72L, 72L, 34L, 34L, 55L)),
.Names = c("id", "order_date", "age"), row.names = c(NA, -6L), class = "data.frame")
ds2 <- structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("xx3", "xx7"), class = "factor"),
delivery_date = structure(c(16253, 16292, 16376, 16253, 16406, 16415, 16256), class = "Date")),
.Names = c("id", "delivery_date"), row.names = c(NA, -7L), class = "data.frame")
条件合并在dplyr
中还没有实现,你可以使用data.table
,你也可以像这样使用sqldf
:
library(sqldf)
library(lubridate)
library(dplyr)
options(sqldf.driver = "SQLite")
tblA <- data.frame(id = c('xx3', 'xx3', 'xx3', 'xx7', 'xx7', 'xx9'),
order_date = c('2014/07/04', '2014/10/08', '2014/11/12',
'2014/05/02', '2014/07/09', '2014/12/22'),
age = c(72, 72, 72, 34, 34, 55),
stringsAsFactors = FALSE)
tblB <- data.frame(id = c('xx3', 'xx3', 'xx3', 'xx3', 'xx3', 'xx3', 'xx7'),
delivery_date = c('2014/07/02', '2014/08/10', '2014/11/02',
'2014/07/02', '2014/12/02', '2014/1211',
'2014/07/05'),
stringsAsFactors = FALSE)
tblA$order_date <- ymd(tblA$order_date)
tblB$delivery_date <- ymd(tblB$delivery_date)
tblC <- sqldf("select tblA.id, order_date, delivery_date
from tblA
join tblB
on tblA.id = tblB.id
and tblA.order_date >= tblB.delivery_date")
tblC
answer <- tblC %>%
group_by(id, order_date) %>%
summarise(frequency_received = n())
as.data.frame(answer)
这给出:
id order_date frequency_received
1 xx3 2014-07-04 2
2 xx3 2014-10-08 3
3 xx3 2014-11-12 4
4 xx7 2014-07-09 1
我想知道是否有人可以帮助我解决以下问题:
我有两个数据集: (1) 包含一个 id 和一个 order_date (2) 第二个包含相同的 id 和 delivery_dates 的电子邮件
我想统计一个人在 order_date 之前收到的电子邮件数量。但是,我无法做到这一点。当我合并两个数据文件时,order_date 与交付日期相结合,这不是我想要的。另外,我不想计算一个人的所有交货日期,因为它需要依赖于时间。
希望有人能帮助我!!
示例数据集 1:
id. order_date age
xx3 2014/07/04 72
xx3 2014/10/08 72
xx3 2014/11/12 72
xx7 2014/05/02 34
xx7 2014/07/09 34
xx9 2014/12/22 55
示例数据集 2:
id. delivery_date
xx3 2014/07/02
xx3 2014/08/10
xx3 2014/11/02
xx3 2014/07/02
xx3 2014/12/02
xx3 2014/12/11
xx7 2014/07/05
我想要什么:
id. frequency_received order_date
xx3 1 2014/07/04
xx3 3 2014/10/08
日期采用 YYYYMMDD 格式。
一个可能的解决方案是使用data.table
-包中的foverlaps
-函数:
library(data.table)
# convert the 'data.frame's to 'data.table's with setDT()
setDT(ds1)
setDT(ds2)
# create a reference dataset with the minimum dates for each id
md <- ds2[, min(delivery_date), id
][ds1[, min(order_date), id], on = 'id'
][is.na(V1), V1 := i.V1
][, mindate := pmin(V1, i.V1)
][, .(id, mindate)]
# create a start date for the time window in which the emails should have been sent
ds1[, bdate := shift(order_date, fill = min(md$mindate[match(id,md$id)])-1), by = id]
# create 2nd deliverydate needed for the foverlaps function
ds2[, delivery_date2 := delivery_date]
# set the keys for each 'data.table'
setkey(ds1, id, bdate, order_date)
setkey(ds2, id, delivery_date, delivery_date2)
# perform the overlap join & calculate the number of recieved emails (freq)
foverlaps(ds1, ds2, nomatch = 0)[, .(freq = .N), by = .(id, order_date)][, freq := cumsum(freq), by = id][]
给出:
id order_date freq
1: xx3 2014-07-04 2
2: xx3 2014-10-08 3
3: xx3 2014-11-12 4
4: xx7 2014-07-09 1
已用数据:
ds1 <- structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 3L), .Label = c("xx3", "xx7", "xx9"), class = "factor"),
order_date = structure(c(16255, 16351, 16386, 16192, 16260, 16426), class = "Date"),
age = c(72L, 72L, 72L, 34L, 34L, 55L)),
.Names = c("id", "order_date", "age"), row.names = c(NA, -6L), class = "data.frame")
ds2 <- structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("xx3", "xx7"), class = "factor"),
delivery_date = structure(c(16253, 16292, 16376, 16253, 16406, 16415, 16256), class = "Date")),
.Names = c("id", "delivery_date"), row.names = c(NA, -7L), class = "data.frame")
条件合并在dplyr
中还没有实现,你可以使用data.table
,你也可以像这样使用sqldf
:
library(sqldf)
library(lubridate)
library(dplyr)
options(sqldf.driver = "SQLite")
tblA <- data.frame(id = c('xx3', 'xx3', 'xx3', 'xx7', 'xx7', 'xx9'),
order_date = c('2014/07/04', '2014/10/08', '2014/11/12',
'2014/05/02', '2014/07/09', '2014/12/22'),
age = c(72, 72, 72, 34, 34, 55),
stringsAsFactors = FALSE)
tblB <- data.frame(id = c('xx3', 'xx3', 'xx3', 'xx3', 'xx3', 'xx3', 'xx7'),
delivery_date = c('2014/07/02', '2014/08/10', '2014/11/02',
'2014/07/02', '2014/12/02', '2014/1211',
'2014/07/05'),
stringsAsFactors = FALSE)
tblA$order_date <- ymd(tblA$order_date)
tblB$delivery_date <- ymd(tblB$delivery_date)
tblC <- sqldf("select tblA.id, order_date, delivery_date
from tblA
join tblB
on tblA.id = tblB.id
and tblA.order_date >= tblB.delivery_date")
tblC
answer <- tblC %>%
group_by(id, order_date) %>%
summarise(frequency_received = n())
as.data.frame(answer)
这给出:
id order_date frequency_received
1 xx3 2014-07-04 2
2 xx3 2014-10-08 3
3 xx3 2014-11-12 4
4 xx7 2014-07-09 1