根据重叠日期匹配数据框之间的值
Matching values between data frames based on overlapping dates
我目前正在处理以下数据结构:
属性 df:
ID Begin_A End_A Interval Value
1 5 1990-03-01 2017-03-10 1990-03-01 UTC--2017-03-10 UTC Cat1
2 10 1993-12-01 2017-12-02 1993-12-01 UTC--2017-12-02 UTC Cat2
3 5 1991-03-01 2017-03-03 1991-03-01 UTC--2017-03-03 UTC Cat3
4 10 1995-12-05 2017-12-10 1995-12-05 UTC--2017-12-10 UTC Cat4
预订 df:
ID Begin_A End_A Interval
1 5 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC
2 6 2017-05-03 2017-05-05 2017-05-03 UTC--2017-05-05 UTC
3 8 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC
4 10 2017-12-05 2017-12-06 2017-12-05 UTC--2017-12-06 UTC
正如下面 post: 中已经提到的,我打算进行以下数据重组:从预订中获取 ID,过滤属性数据框的所有行,其中属性 ID与预订 ID 匹配。检查哪些具有匹配属性 ID 的行也有重叠的时间间隔(int_overlaps 来自 lubridate)。然后从 Value 列中取出相应的值并在 Attribute_value 列中打印它们中的每一个。
预期结果如下所示:
ID Begin_A End_A Interval Attribute_value
5 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC Cat1,Cat3
6 2017-05-03 2017-05-05 2017-05-03 UTC--2017-05-05 UTC NA
8 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC NA
10 2017-12-05 2017-12-06 2017-12-05 UTC--2017-12-06 UTC Cat4
ycw 已经在此处对这个问题提供了部分答案:()。此解决方案不允许属性数据框中 Begin_A 和 End_A 之间的较长时间段,因为使用此命令创建了具有各个日期的向量:
complete(Date = full_seq(Date, period = 1), ID) %>%
由于我的原始数据集在 Attributes 数据框中有大量的长时间帧观测值,R 无法处理这些大量的观测值。我的想法是修改上述行以将日期的跳跃减少到几个月(这也会降低精度)或尝试一种新方法。
以下代码生成上面显示的数据帧:
library(lubridate)
library(tidyverse)
# Attributes data frame:
date1 <- as.Date(c('1990-3-1','1993-12-1','1991-3-1','1995-12-5'))
date2 <- as.Date(c('2017-3-10','2017-12-2','2017-3-3','2017-12-10'))
attributes <- data.frame(matrix(NA,nrow=4, ncol = 5))
names(attributes) <- c("ID","Begin_A", "End_A", "Interval", "Value")
attributes$ID <- as.numeric(c(5,10,5,10))
attributes$Begin_A <-date1
attributes$End_A <-date2
attributes$Interval <-attributes$Begin_A %--% attributes$End_A
attributes$Value<- as.character(c("Cat1","Cat2","Cat3","Cat4"))
### Bookings data frame:
date1 <- as.Date(c('2017-3-3','2017-5-3','2017-3-3','2017-12-5'))
date2 <- as.Date(c('2017-3-5','2017-5-5','2017-3-5','2017-12-6'))
bookings <- data.frame(matrix(NA,nrow=4, ncol = 4))
names(bookings) <- c("ID","Begin_A", "End_A", "Interval")
bookings$ID <- as.numeric(c(5,6,8,10))
bookings$Begin_A <-date1
bookings$End_A <-date2
bookings$Interval <-bookings$Begin_A %--% bookings$End_A
这是ycw提供的上一个post的解决方案:
library(tidyverse)
attributes2 <- attributes %>%
select(-Interval) %>%
gather(Type, Date, ends_with("_A")) %>%
select(-Type) %>%
group_by(Value) %>%
complete(Date = full_seq(Date, period = 1), ID) %>%
ungroup()
bookings2 <- bookings %>%
select(-Interval) %>%
gather(Type, Date, ends_with("_A")) %>%
select(-Type) %>%
group_by(ID) %>%
complete(Date = full_seq(Date, period = 1)) %>%
ungroup()
bookings3 <- bookings2 %>%
left_join(attributes2, by = c("ID", "Date")) %>%
group_by(ID) %>%
summarise(Attribute_value = toString(sort(unique(Value)))) %>%
mutate(Attribute_value = ifelse(Attribute_value %in% "", NA, Attribute_value))
bookings4 <- bookings %>% left_join(bookings3, by = "ID")
bookings4
ID Begin_A End_A Interval Attribute_value
1 5 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC Cat1, Cat3
2 6 2017-05-03 2017-05-05 2017-05-03 UTC--2017-05-05 UTC <NA>
3 8 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC <NA>
4 10 2017-12-05 2017-12-06 2017-12-05 UTC--2017-12-06 UTC Cat4
您可以考虑 data.table
,它允许 "non-equi joins",即基于 >=
、>
、<=
和 <
的联接。在同一个调用中,可能会对 LHS 数据集中与 RHS 数据集中的每一行 (i
) 匹配 (by = .EACHI
) 的组执行聚合操作。
d1[d2, on = .(id = id, end >= begin),
.(i.begin, i.end, val_str = toString(val)), by = .EACHI]
# id end i.begin i.end val_str
# 1: 5 2017-03-03 2017-03-03 2017-03-05 Cat3, Cat1
# 2: 6 2017-05-03 2017-05-03 2017-05-05 NA
# 3: 8 2017-03-03 2017-03-03 2017-03-05 NA
# 4: 10 2017-12-05 2017-12-05 2017-12-06 Cat4
资料准备:
d1 <- data.frame(id = c(5, 10, 5, 10),
begin = as.Date(c('1990-3-1','1993-12-1','1991-3-1','1995-12-5')),
end = as.Date(c('2017-3-10','2017-12-2','2017-3-3','2017-12-10')),
val = c("Cat1", "Cat2", "Cat3", "Cat4"))
d2 <- data.frame(id = c(5, 6, 8, 10),
begin = as.Date(c('2017-3-3','2017-5-3','2017-3-3','2017-12-5')),
end = as.Date(c('2017-3-5','2017-5-5','2017-3-5','2017-12-6')))
library(data.table)
setDT(d1)
setDT(d2)
我目前正在处理以下数据结构:
属性 df:
ID Begin_A End_A Interval Value
1 5 1990-03-01 2017-03-10 1990-03-01 UTC--2017-03-10 UTC Cat1
2 10 1993-12-01 2017-12-02 1993-12-01 UTC--2017-12-02 UTC Cat2
3 5 1991-03-01 2017-03-03 1991-03-01 UTC--2017-03-03 UTC Cat3
4 10 1995-12-05 2017-12-10 1995-12-05 UTC--2017-12-10 UTC Cat4
预订 df:
ID Begin_A End_A Interval
1 5 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC
2 6 2017-05-03 2017-05-05 2017-05-03 UTC--2017-05-05 UTC
3 8 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC
4 10 2017-12-05 2017-12-06 2017-12-05 UTC--2017-12-06 UTC
正如下面 post:
预期结果如下所示:
ID Begin_A End_A Interval Attribute_value
5 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC Cat1,Cat3
6 2017-05-03 2017-05-05 2017-05-03 UTC--2017-05-05 UTC NA
8 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC NA
10 2017-12-05 2017-12-06 2017-12-05 UTC--2017-12-06 UTC Cat4
ycw 已经在此处对这个问题提供了部分答案:(
complete(Date = full_seq(Date, period = 1), ID) %>%
由于我的原始数据集在 Attributes 数据框中有大量的长时间帧观测值,R 无法处理这些大量的观测值。我的想法是修改上述行以将日期的跳跃减少到几个月(这也会降低精度)或尝试一种新方法。 以下代码生成上面显示的数据帧:
library(lubridate)
library(tidyverse)
# Attributes data frame:
date1 <- as.Date(c('1990-3-1','1993-12-1','1991-3-1','1995-12-5'))
date2 <- as.Date(c('2017-3-10','2017-12-2','2017-3-3','2017-12-10'))
attributes <- data.frame(matrix(NA,nrow=4, ncol = 5))
names(attributes) <- c("ID","Begin_A", "End_A", "Interval", "Value")
attributes$ID <- as.numeric(c(5,10,5,10))
attributes$Begin_A <-date1
attributes$End_A <-date2
attributes$Interval <-attributes$Begin_A %--% attributes$End_A
attributes$Value<- as.character(c("Cat1","Cat2","Cat3","Cat4"))
### Bookings data frame:
date1 <- as.Date(c('2017-3-3','2017-5-3','2017-3-3','2017-12-5'))
date2 <- as.Date(c('2017-3-5','2017-5-5','2017-3-5','2017-12-6'))
bookings <- data.frame(matrix(NA,nrow=4, ncol = 4))
names(bookings) <- c("ID","Begin_A", "End_A", "Interval")
bookings$ID <- as.numeric(c(5,6,8,10))
bookings$Begin_A <-date1
bookings$End_A <-date2
bookings$Interval <-bookings$Begin_A %--% bookings$End_A
这是ycw提供的上一个post的解决方案:
library(tidyverse)
attributes2 <- attributes %>%
select(-Interval) %>%
gather(Type, Date, ends_with("_A")) %>%
select(-Type) %>%
group_by(Value) %>%
complete(Date = full_seq(Date, period = 1), ID) %>%
ungroup()
bookings2 <- bookings %>%
select(-Interval) %>%
gather(Type, Date, ends_with("_A")) %>%
select(-Type) %>%
group_by(ID) %>%
complete(Date = full_seq(Date, period = 1)) %>%
ungroup()
bookings3 <- bookings2 %>%
left_join(attributes2, by = c("ID", "Date")) %>%
group_by(ID) %>%
summarise(Attribute_value = toString(sort(unique(Value)))) %>%
mutate(Attribute_value = ifelse(Attribute_value %in% "", NA, Attribute_value))
bookings4 <- bookings %>% left_join(bookings3, by = "ID")
bookings4
ID Begin_A End_A Interval Attribute_value
1 5 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC Cat1, Cat3
2 6 2017-05-03 2017-05-05 2017-05-03 UTC--2017-05-05 UTC <NA>
3 8 2017-03-03 2017-03-05 2017-03-03 UTC--2017-03-05 UTC <NA>
4 10 2017-12-05 2017-12-06 2017-12-05 UTC--2017-12-06 UTC Cat4
您可以考虑 data.table
,它允许 "non-equi joins",即基于 >=
、>
、<=
和 <
的联接。在同一个调用中,可能会对 LHS 数据集中与 RHS 数据集中的每一行 (i
) 匹配 (by = .EACHI
) 的组执行聚合操作。
d1[d2, on = .(id = id, end >= begin),
.(i.begin, i.end, val_str = toString(val)), by = .EACHI]
# id end i.begin i.end val_str
# 1: 5 2017-03-03 2017-03-03 2017-03-05 Cat3, Cat1
# 2: 6 2017-05-03 2017-05-03 2017-05-05 NA
# 3: 8 2017-03-03 2017-03-03 2017-03-05 NA
# 4: 10 2017-12-05 2017-12-05 2017-12-06 Cat4
资料准备:
d1 <- data.frame(id = c(5, 10, 5, 10),
begin = as.Date(c('1990-3-1','1993-12-1','1991-3-1','1995-12-5')),
end = as.Date(c('2017-3-10','2017-12-2','2017-3-3','2017-12-10')),
val = c("Cat1", "Cat2", "Cat3", "Cat4"))
d2 <- data.frame(id = c(5, 6, 8, 10),
begin = as.Date(c('2017-3-3','2017-5-3','2017-3-3','2017-12-5')),
end = as.Date(c('2017-3-5','2017-5-5','2017-3-5','2017-12-6')))
library(data.table)
setDT(d1)
setDT(d2)