使用 R data.tables 快速计算每个对象的事件数
Fast count of number of Events for each Object with R data.tables
我有许多对象可以位于多个位置(位置的数量比对象的数量少得多),每个对象都有一个开始日期和一个结束日期。我还有许多事件,它们也有位置和发生的日期。我想知道每个对象在逗留期间在同一位置发生的事件数(以便在对象的开始日期和结束日期之间发生)。
由于我有好几组,对象的数量从45万到600万不等,所以这个任务需要相当长的时间。到目前为止,我发现最快的方法是使用 data.table 方法。下面的函数显示了一个示例,您可以在其中改变大小的数量。
coupleEventObject <- function(sizeO=100,sizeE=100){
require(data.table)
require(zoo)
#create the events
Events <- data.table(EventNumber = c(1:sizeE),
Location = as.character(sample(c(1:floor(sizeO/10)),size=sizeE,replace=T)),
DayEvent = rand.day(day.start="2007-01-01",
day.end ="2015-12-31",
size=sizeE))
#Create the objects
Objects <- data.table(ObjectNumber = c(1:sizeO),
Location = as.character(sample(c(1:floor(sizeO/10)),size=sizeO,replace=T)),
Day1 = rand.day(day.start="2007-01-01",
day.end ="2015-12-31",
size=sizeO),
Day2 = rand.day(day.start="2007-01-01",
day.end ="2015-12-31",
size=sizeO))
Objects[, DayStart := as.Date(ifelse (Day1>Day2,Day2,Day1))]
Objects[, DayEnd := as.Date(ifelse (Day1<Day2,Day2,Day1))]
Objects[,c("Day1","Day2"):=NULL]
#Set keys right for the coupling/counting
setkey(Objects,Location,DayStart,DayEnd)
setkey(Events,Location,DayEvent)
#Count the number of events
system.time(
Objects[,NumberEvents:=Events[Location,][DayEvent >= DayStart & DayEvent <= DayEnd,.N],by=list(DayStart,DayEnd,Location)]
)
}
rand.day <- function(day.start,day.end,size) {
dayseq <- seq.Date(as.Date(day.start),as.Date(day.end),by="day")
dayselect <- sample(dayseq,size,replace=TRUE)
return(dayselect)
}
对于 100 个对象和 100 个事件,此代码在 0.3 秒内在我的笔记本电脑上运行
> coupleEventObject()
user system elapsed
0.30 0.00 0.29
但是如果我增加对象的数量,这几乎与处理时间成线性比例。
> coupleEventObject(sizeE=200,sizeO=6000)
user system elapsed
15.11 0.00 15.26
因此,计算 600 万个对象的事件数量大约需要 4 个小时,而且我必须多次执行此操作(不同类型的位置级别和)。有没有办法加快速度?感谢您的帮助和想法!
这是一个选项。主要思想是以这样的方式加入事件和对象,即每个组合首先存在,然后只计算有效值。正如您将看到的,新方法运行得更快并且您得到相同的结果。
顺便说一句...您可能需要更改随机日期生成器,因为我无法访问您的原始功能。
coupleEventObject1 <- function(sizeO=100,sizeE=100){
require(data.table)
require(zoo)
#create the events
Events <- data.table(EventNumber = c(1:sizeE),
Location = as.character(sample(c(1:floor(sizeO/10)),size=sizeE,replace=T)),
DayEvent = as.Date(as.integer(runif(sizeE)*1000)))
#Create the objects
Objects <- data.table(ObjectNumber = c(1:sizeO),
Location = as.character(sample(c(1:floor(sizeO/10)),size=sizeO,replace=T)),
Day1 = as.Date(as.integer(runif(sizeE)*1000)),
Day2 = as.Date(as.integer(runif(sizeE)*1000)))
Objects[, DayStart := as.Date(ifelse (Day1>Day2,Day2,Day1))]
Objects[, DayEnd := as.Date(ifelse (Day1<Day2,Day2,Day1))]
Objects[,c("Day1","Day2"):=NULL]
#Set keys right for the coupling/counting
setkey(Objects,Location,DayStart,DayEnd)
setkey(Events,Location,DayEvent)
#Count the number of events
cat("First method:")
cat(system.time(
res1 <- Objects[,NumberEvents:=Events[Location,][DayEvent >= DayStart & DayEvent <= DayEnd,.N],by=list(DayStart,DayEnd,Location)]
))
cat("\n")
## second method
#Set keys right for the coupling/counting
setkey(Objects,Location)
setkey(Events,Location)
#Count the number of events
cat("Second method:")
cat(system.time({
oe <- Objects[Events,allow.cartesian=T]
res2 <- oe[,sum(DayEvent >= DayStart & DayEvent <= DayEnd),by=list(ObjectNumber,DayStart,DayEnd,Location)]
}))
cat("\n")
# comparing
setkey(res1, ObjectNumber, Location, DayStart, DayEnd)
setkey(res2, ObjectNumber, Location, DayStart, DayEnd)
cat("Compare values: ", nrow(res1[res2][NumberEvents != V1,])," mismatches\n")
return(list(res1=res1,res2=res2))
}
结果如下:
xx <- coupleEventObject1(200,6000)
First method:8.151 0.041 8.15 0 0
Second method:0.614 0.017 0.625 0 0
Compare values: 0 mismatches
我有许多对象可以位于多个位置(位置的数量比对象的数量少得多),每个对象都有一个开始日期和一个结束日期。我还有许多事件,它们也有位置和发生的日期。我想知道每个对象在逗留期间在同一位置发生的事件数(以便在对象的开始日期和结束日期之间发生)。
由于我有好几组,对象的数量从45万到600万不等,所以这个任务需要相当长的时间。到目前为止,我发现最快的方法是使用 data.table 方法。下面的函数显示了一个示例,您可以在其中改变大小的数量。
coupleEventObject <- function(sizeO=100,sizeE=100){
require(data.table)
require(zoo)
#create the events
Events <- data.table(EventNumber = c(1:sizeE),
Location = as.character(sample(c(1:floor(sizeO/10)),size=sizeE,replace=T)),
DayEvent = rand.day(day.start="2007-01-01",
day.end ="2015-12-31",
size=sizeE))
#Create the objects
Objects <- data.table(ObjectNumber = c(1:sizeO),
Location = as.character(sample(c(1:floor(sizeO/10)),size=sizeO,replace=T)),
Day1 = rand.day(day.start="2007-01-01",
day.end ="2015-12-31",
size=sizeO),
Day2 = rand.day(day.start="2007-01-01",
day.end ="2015-12-31",
size=sizeO))
Objects[, DayStart := as.Date(ifelse (Day1>Day2,Day2,Day1))]
Objects[, DayEnd := as.Date(ifelse (Day1<Day2,Day2,Day1))]
Objects[,c("Day1","Day2"):=NULL]
#Set keys right for the coupling/counting
setkey(Objects,Location,DayStart,DayEnd)
setkey(Events,Location,DayEvent)
#Count the number of events
system.time(
Objects[,NumberEvents:=Events[Location,][DayEvent >= DayStart & DayEvent <= DayEnd,.N],by=list(DayStart,DayEnd,Location)]
)
}
rand.day <- function(day.start,day.end,size) {
dayseq <- seq.Date(as.Date(day.start),as.Date(day.end),by="day")
dayselect <- sample(dayseq,size,replace=TRUE)
return(dayselect)
}
对于 100 个对象和 100 个事件,此代码在 0.3 秒内在我的笔记本电脑上运行
> coupleEventObject()
user system elapsed
0.30 0.00 0.29
但是如果我增加对象的数量,这几乎与处理时间成线性比例。
> coupleEventObject(sizeE=200,sizeO=6000)
user system elapsed
15.11 0.00 15.26
因此,计算 600 万个对象的事件数量大约需要 4 个小时,而且我必须多次执行此操作(不同类型的位置级别和)。有没有办法加快速度?感谢您的帮助和想法!
这是一个选项。主要思想是以这样的方式加入事件和对象,即每个组合首先存在,然后只计算有效值。正如您将看到的,新方法运行得更快并且您得到相同的结果。 顺便说一句...您可能需要更改随机日期生成器,因为我无法访问您的原始功能。
coupleEventObject1 <- function(sizeO=100,sizeE=100){
require(data.table)
require(zoo)
#create the events
Events <- data.table(EventNumber = c(1:sizeE),
Location = as.character(sample(c(1:floor(sizeO/10)),size=sizeE,replace=T)),
DayEvent = as.Date(as.integer(runif(sizeE)*1000)))
#Create the objects
Objects <- data.table(ObjectNumber = c(1:sizeO),
Location = as.character(sample(c(1:floor(sizeO/10)),size=sizeO,replace=T)),
Day1 = as.Date(as.integer(runif(sizeE)*1000)),
Day2 = as.Date(as.integer(runif(sizeE)*1000)))
Objects[, DayStart := as.Date(ifelse (Day1>Day2,Day2,Day1))]
Objects[, DayEnd := as.Date(ifelse (Day1<Day2,Day2,Day1))]
Objects[,c("Day1","Day2"):=NULL]
#Set keys right for the coupling/counting
setkey(Objects,Location,DayStart,DayEnd)
setkey(Events,Location,DayEvent)
#Count the number of events
cat("First method:")
cat(system.time(
res1 <- Objects[,NumberEvents:=Events[Location,][DayEvent >= DayStart & DayEvent <= DayEnd,.N],by=list(DayStart,DayEnd,Location)]
))
cat("\n")
## second method
#Set keys right for the coupling/counting
setkey(Objects,Location)
setkey(Events,Location)
#Count the number of events
cat("Second method:")
cat(system.time({
oe <- Objects[Events,allow.cartesian=T]
res2 <- oe[,sum(DayEvent >= DayStart & DayEvent <= DayEnd),by=list(ObjectNumber,DayStart,DayEnd,Location)]
}))
cat("\n")
# comparing
setkey(res1, ObjectNumber, Location, DayStart, DayEnd)
setkey(res2, ObjectNumber, Location, DayStart, DayEnd)
cat("Compare values: ", nrow(res1[res2][NumberEvents != V1,])," mismatches\n")
return(list(res1=res1,res2=res2))
}
结果如下:
xx <- coupleEventObject1(200,6000)
First method:8.151 0.041 8.15 0 0
Second method:0.614 0.017 0.625 0 0
Compare values: 0 mismatches