如果落在 R 中另一个数据集中的两个变量定义的范围内,则从一个数据集中获取变量值
Get a variables value from one dataset if falling in a range defined by two variables in another dataset in R
我对 R 中的日期操作有疑问。我已经四处寻找了好几天,但在网上找不到任何帮助。我有一个数据集,其中有 id 和两个日期,另一个数据集具有相同的 id 变量、日期和价格。例如:
# Example intervals: one row per id with the two dates bounding the range,
# stored as "dd/mm/yyyy" strings.
x <- data.frame(
  id    = c("A", "B", "C", "C"),
  date1 = c("29/05/2013", "23/08/2011", "25/09/2011", "18/11/2011"),
  date2 = c("10/07/2013", "04/10/2011", "10/11/2011", "15/12/2011")
)
> x
id date1 date2
1 A 29/05/2013 10/07/2013
2 B 23/08/2011 04/10/2011
3 C 25/09/2011 10/11/2011
4 C 18/11/2011 15/12/2011
# Example observations: a dated price for each id ("dd/mm/yyyy" strings).
y <- data.frame(
  id = rep(c("A", "B", "C"), times = c(3, 6, 3)),
  date = c(
    "21/02/2013", "19/06/2013", "31/07/2013",
    "07/10/2011", "16/01/2012", "10/07/2012",
    "20/09/2012", "29/11/2012", "15/08/2014",
    "27/09/2011", "27/01/2012", "09/03/2012"
  ),
  price = c(126, 109, 111, 14, 13.8, 14.1, 14, 14.4, 143, 102, 114, 116)
)
> y
id date price
1 A 21/02/2013 126.0
2 A 19/06/2013 109.0
3 A 31/07/2013 111.0
4 B 07/10/2011 14.0
5 B 16/01/2012 13.8
6 B 10/07/2012 14.1
7 B 20/09/2012 14.0
8 B 29/11/2012 14.4
9 B 15/08/2014 143.0
10 C 27/09/2011 102.0
11 C 27/01/2012 114.0
12 C 09/03/2012 116.0
我想做的是:对数据集 x 中的每一行,在数据集 y 中查找 id 相同、且日期落在 x 的两个日期(date1 和 date2)所定义范围内的记录;如果存在这样的记录,就取出该 id 和日期对应的 price 值,如果不存在,则记为缺失值(NA)。所以基本上我想得到这样的最终数据集:
# Expected result: every interval from `x`, plus the matching date and price
# from `y`, or NA when no observation for that id falls inside the interval.
# Real NA values are used instead of the string "NA": mixing "NA" with numbers
# inside c() coerces the whole price column to character.
final <- data.frame(
  id    = c("A", "B", "C", "C"),
  date1 = c("29/05/2013", "23/08/2011", "25/09/2011", "18/11/2011"),
  date2 = c("10/07/2013", "04/10/2011", "10/11/2011", "15/12/2011"),
  date  = c("19/06/2013", NA, "27/09/2011", NA),
  price = c(109, NA, 102, NA)
)
> final
id date1 date2 date price
1 A 29/05/2013 10/07/2013 19/06/2013 109
2 B 23/08/2011 04/10/2011 20/09/2012 14
3 C 25/09/2011 10/11/2011 27/09/2011 102
4 C 18/11/2011 15/12/2011 NA NA
任何帮助将不胜感激。
我将分两步进行。首先,通过 id 连接每个 df(有关连接的更多详细信息,请参阅 this link),如下所示:
# Pair every interval in `x` with every observation in `y` sharing the same id.
df <- merge(x = x, y = y, by = "id")
现在您应该拥有一个完整的数据集,其中的条目比您要求的还要多。要根据您的标准削减它,请尝试:
# Keep only the rows whose `date` lies strictly between `date1` and `date2`.
# The columns hold "dd/mm/yyyy" strings, so they are parsed to Date first:
# comparing the raw strings would be lexicographic, not chronological.
# `dplyr::` is spelled out because a bare filter() resolves to stats::filter()
# unless dplyr has been attached.
df <- dplyr::filter(
  df,
  as.Date(date, format = "%d/%m/%Y") > as.Date(date1, format = "%d/%m/%Y"),
  as.Date(date, format = "%d/%m/%Y") < as.Date(date2, format = "%d/%m/%Y")
)
我相信这应该有效。
编辑:如果您确实想保留 NA 值,而不是直接删除这些数据,那么事情会变得有点棘手。在这种情况下,可以不做上面的过滤步骤,而是尝试以下做法:
# Blank out price and date on merged rows whose date falls outside the
# date1..date2 interval, instead of dropping those rows.
# The columns are reached through `df$` (a bare `date` would resolve to the
# base function date(), not the column) and parsed from "dd/mm/yyyy" so the
# comparison is chronological rather than lexicographic.
obs_date <- as.Date(df$date, format = "%d/%m/%Y")
outside <- obs_date < as.Date(df$date1, format = "%d/%m/%Y") |
  obs_date > as.Date(df$date2, format = "%d/%m/%Y")
# The mask is computed once, before either column is overwritten.
df$price[outside] <- NA
df$date[outside] <- NA
这是一个基于 data.table 包中优秀的 foverlaps 函数的解决方案。
library(data.table)
## Convert x to a data.table by reference and parse both interval endpoints
## from "dd/mm/yyyy" strings into Date.
setDT(x)
x[, `:=`(date1 = as.Date(date1, format = "%d/%m/%Y"),
         date2 = as.Date(date2, format = "%d/%m/%Y"))]
## foverlaps() needs a start and an end column on both sides, so the parsed
## observation date is stored twice in y: as `date1` and as `date`.
setDT(y)
y[, date1 := as.Date(date, format = "%d/%m/%Y")]
y[, date := date1]
## y must be keyed; the last two key columns act as its (start, end) interval.
setkey(y, id, date, date1)
## Overlap join: one row per interval of x, with the matching observation.
foverlaps(x, y, by.x = c("id", "date1", "date2"))[
  , .(id, i.date1, date2, date, price)]
id i.date1 date2 date price
1: A 2013-05-29 2013-07-10 2013-06-19 109
2: B 2011-08-23 2011-10-04 <NA> NA
3: C 2011-09-25 2011-11-10 2011-09-27 102
4: C 2011-11-18 2011-12-15 <NA> NA
PS: 结果不完全一样,因为你的预期输出有误
或者使用 lubridate 和 base R:
# %--% and %within% are lubridate operators, so the package must be attached.
library(lubridate)
# NOTE: assumes date1, date2 and date were already parsed to dates
# (see the data section of this answer).
# Pair each interval with all observations of the same id.
m <- merge(x, y, by = "id")
# One interval per merged row, running from date1 to date2.
d_range <- m$date1 %--% m$date2
# Keep the rows whose observation date lies within its interval.
m2 <- m[m$date %within% d_range, ]
# Merge back onto x, keeping all of its rows, so that intervals without a
# match are retained with NA date/price. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
res <- merge(x, m2, by = c("id", "date1", "date2"), all.x = TRUE)
正如 @Isaac 所建议的,先合并有助于加快流程。lubridate 包中的运算符 %--% 用于创建一个时间区间(interval);运算符 %within% 用于测试左侧(LHS)对象是否落在右侧(RHS)区间之内。
id date1 date2 date price
1 A 2013-05-29 2013-07-10 2013-06-19 109
2 B 2011-08-23 2011-10-04 <NA> NA
3 C 2011-09-25 2011-11-10 2011-09-27 102
4 C 2011-11-18 2011-12-15 <NA> NA
数据
# Intervals: one row per id with the two bounding dates ("dd/mm/yyyy" strings).
x <- data.frame(
  id    = c("A", "B", "C", "C"),
  date1 = c("29/05/2013", "23/08/2011", "25/09/2011", "18/11/2011"),
  date2 = c("10/07/2013", "04/10/2011", "10/11/2011", "15/12/2011")
)
# Observations: a dated price for each id ("dd/mm/yyyy" strings).
y <- data.frame(
  id = rep(c("A", "B", "C"), times = c(3, 6, 3)),
  date = c(
    "21/02/2013", "19/06/2013", "31/07/2013",
    "07/10/2011", "16/01/2012", "10/07/2012",
    "20/09/2012", "29/11/2012", "15/08/2014",
    "27/09/2011", "27/01/2012", "09/03/2012"
  ),
  price = c(126, 109, 111, 14, 13.8, 14.1, 14, 14.4, 143, 102, 114, 116)
)
# Parse the "dd/mm/yyyy" strings into date columns. dmy() is namespace-qualified
# so these lines work whether or not lubridate has been attached, and `[[` is
# used so a plain vector is extracted regardless of the data-frame flavour.
x[c("date1", "date2")] <- lapply(x[c("date1", "date2")], lubridate::dmy)
y[["date"]] <- lubridate::dmy(y[["date"]])
我对 R 中的日期操作有疑问。我已经四处寻找了好几天,但在网上找不到任何帮助。我有一个数据集,其中有 id 和两个日期,另一个数据集具有相同的 id 变量、日期和价格。例如:
# Example intervals: one row per id with the two dates bounding the range,
# stored as "dd/mm/yyyy" strings.
x <- data.frame(
  id    = c("A", "B", "C", "C"),
  date1 = c("29/05/2013", "23/08/2011", "25/09/2011", "18/11/2011"),
  date2 = c("10/07/2013", "04/10/2011", "10/11/2011", "15/12/2011")
)
> x
id date1 date2
1 A 29/05/2013 10/07/2013
2 B 23/08/2011 04/10/2011
3 C 25/09/2011 10/11/2011
4 C 18/11/2011 15/12/2011
# Example observations: a dated price for each id ("dd/mm/yyyy" strings).
y <- data.frame(
  id = rep(c("A", "B", "C"), times = c(3, 6, 3)),
  date = c(
    "21/02/2013", "19/06/2013", "31/07/2013",
    "07/10/2011", "16/01/2012", "10/07/2012",
    "20/09/2012", "29/11/2012", "15/08/2014",
    "27/09/2011", "27/01/2012", "09/03/2012"
  ),
  price = c(126, 109, 111, 14, 13.8, 14.1, 14, 14.4, 143, 102, 114, 116)
)
> y
id date price
1 A 21/02/2013 126.0
2 A 19/06/2013 109.0
3 A 31/07/2013 111.0
4 B 07/10/2011 14.0
5 B 16/01/2012 13.8
6 B 10/07/2012 14.1
7 B 20/09/2012 14.0
8 B 29/11/2012 14.4
9 B 15/08/2014 143.0
10 C 27/09/2011 102.0
11 C 27/01/2012 114.0
12 C 09/03/2012 116.0
我想做的是:对数据集 x 中的每一行,在数据集 y 中查找 id 相同、且日期落在 x 的两个日期(date1 和 date2)所定义范围内的记录;如果存在这样的记录,就取出该 id 和日期对应的 price 值,如果不存在,则记为缺失值(NA)。所以基本上我想得到这样的最终数据集:
# Expected result: every interval from `x`, plus the matching date and price
# from `y`, or NA when no observation for that id falls inside the interval.
# Real NA values are used instead of the string "NA": mixing "NA" with numbers
# inside c() coerces the whole price column to character.
final <- data.frame(
  id    = c("A", "B", "C", "C"),
  date1 = c("29/05/2013", "23/08/2011", "25/09/2011", "18/11/2011"),
  date2 = c("10/07/2013", "04/10/2011", "10/11/2011", "15/12/2011"),
  date  = c("19/06/2013", NA, "27/09/2011", NA),
  price = c(109, NA, 102, NA)
)
> final
id date1 date2 date price
1 A 29/05/2013 10/07/2013 19/06/2013 109
2 B 23/08/2011 04/10/2011 20/09/2012 14
3 C 25/09/2011 10/11/2011 27/09/2011 102
4 C 18/11/2011 15/12/2011 NA NA
任何帮助将不胜感激。
我将分两步进行。首先,通过 id 连接每个 df(有关连接的更多详细信息,请参阅 this link),如下所示:
# Pair every interval in `x` with every observation in `y` sharing the same id.
df <- merge(x = x, y = y, by = "id")
现在您应该拥有一个完整的数据集,其中的条目比您要求的还要多。要根据您的标准削减它,请尝试:
# Keep only the rows whose `date` lies strictly between `date1` and `date2`.
# The columns hold "dd/mm/yyyy" strings, so they are parsed to Date first:
# comparing the raw strings would be lexicographic, not chronological.
# `dplyr::` is spelled out because a bare filter() resolves to stats::filter()
# unless dplyr has been attached.
df <- dplyr::filter(
  df,
  as.Date(date, format = "%d/%m/%Y") > as.Date(date1, format = "%d/%m/%Y"),
  as.Date(date, format = "%d/%m/%Y") < as.Date(date2, format = "%d/%m/%Y")
)
我相信这应该有效。
编辑:如果您确实想保留 NA 值,而不是直接删除这些数据,那么事情会变得有点棘手。在这种情况下,可以不做上面的过滤步骤,而是尝试以下做法:
# Blank out price and date on merged rows whose date falls outside the
# date1..date2 interval, instead of dropping those rows.
# The columns are reached through `df$` (a bare `date` would resolve to the
# base function date(), not the column) and parsed from "dd/mm/yyyy" so the
# comparison is chronological rather than lexicographic.
obs_date <- as.Date(df$date, format = "%d/%m/%Y")
outside <- obs_date < as.Date(df$date1, format = "%d/%m/%Y") |
  obs_date > as.Date(df$date2, format = "%d/%m/%Y")
# The mask is computed once, before either column is overwritten.
df$price[outside] <- NA
df$date[outside] <- NA
这是一个基于 data.table 包中优秀的 foverlaps 函数的解决方案。
library(data.table)
## Convert x to a data.table by reference and parse both interval endpoints
## from "dd/mm/yyyy" strings into Date.
setDT(x)
x[, `:=`(date1 = as.Date(date1, format = "%d/%m/%Y"),
         date2 = as.Date(date2, format = "%d/%m/%Y"))]
## foverlaps() needs a start and an end column on both sides, so the parsed
## observation date is stored twice in y: as `date1` and as `date`.
setDT(y)
y[, date1 := as.Date(date, format = "%d/%m/%Y")]
y[, date := date1]
## y must be keyed; the last two key columns act as its (start, end) interval.
setkey(y, id, date, date1)
## Overlap join: one row per interval of x, with the matching observation.
foverlaps(x, y, by.x = c("id", "date1", "date2"))[
  , .(id, i.date1, date2, date, price)]
id i.date1 date2 date price
1: A 2013-05-29 2013-07-10 2013-06-19 109
2: B 2011-08-23 2011-10-04 <NA> NA
3: C 2011-09-25 2011-11-10 2011-09-27 102
4: C 2011-11-18 2011-12-15 <NA> NA
PS: 结果不完全一样,因为你的预期输出有误
或者使用 lubridate 和 base R:
# %--% and %within% are lubridate operators, so the package must be attached.
library(lubridate)
# NOTE: assumes date1, date2 and date were already parsed to dates
# (see the data section of this answer).
# Pair each interval with all observations of the same id.
m <- merge(x, y, by = "id")
# One interval per merged row, running from date1 to date2.
d_range <- m$date1 %--% m$date2
# Keep the rows whose observation date lies within its interval.
m2 <- m[m$date %within% d_range, ]
# Merge back onto x, keeping all of its rows, so that intervals without a
# match are retained with NA date/price. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
res <- merge(x, m2, by = c("id", "date1", "date2"), all.x = TRUE)
正如 @Isaac 所建议的,先合并有助于加快流程。lubridate 包中的运算符 %--% 用于创建一个时间区间(interval);运算符 %within% 用于测试左侧(LHS)对象是否落在右侧(RHS)区间之内。
id date1 date2 date price
1 A 2013-05-29 2013-07-10 2013-06-19 109
2 B 2011-08-23 2011-10-04 <NA> NA
3 C 2011-09-25 2011-11-10 2011-09-27 102
4 C 2011-11-18 2011-12-15 <NA> NA
数据
# Intervals: one row per id with the two bounding dates ("dd/mm/yyyy" strings).
x <- data.frame(
  id    = c("A", "B", "C", "C"),
  date1 = c("29/05/2013", "23/08/2011", "25/09/2011", "18/11/2011"),
  date2 = c("10/07/2013", "04/10/2011", "10/11/2011", "15/12/2011")
)
# Observations: a dated price for each id ("dd/mm/yyyy" strings).
y <- data.frame(
  id = rep(c("A", "B", "C"), times = c(3, 6, 3)),
  date = c(
    "21/02/2013", "19/06/2013", "31/07/2013",
    "07/10/2011", "16/01/2012", "10/07/2012",
    "20/09/2012", "29/11/2012", "15/08/2014",
    "27/09/2011", "27/01/2012", "09/03/2012"
  ),
  price = c(126, 109, 111, 14, 13.8, 14.1, 14, 14.4, 143, 102, 114, 116)
)
# Parse the "dd/mm/yyyy" strings into date columns. dmy() is namespace-qualified
# so these lines work whether or not lubridate has been attached, and `[[` is
# used so a plain vector is extracted regardless of the data-frame flavour.
x[c("date1", "date2")] <- lapply(x[c("date1", "date2")], lubridate::dmy)
y[["date"]] <- lubridate::dmy(y[["date"]])