合并 R 中的重叠间隔
Merge overlapping intervals in R
我正在尝试合并重叠间隔以计算唯一间隔的总和,同时删除排除的间隔。
这是一个最小的工作示例:
mydata <- data.frame(interval = c(1,2,3,4,5,6,7,8,9,10),
timeoutStart = c(280,500,NA,NA,NA,NA,NA,NA,NA,NA),
timeoutEnd = c(310,530,NA,NA,NA,NA,NA,NA,NA,NA),
cheeringStart = c(1,181,205,330,460,740,NA,NA,NA,NA),
cheeringEnd = c(120,199,300,420,475,760,NA,NA,NA,NA),
possessionStart = c(80,180,210,250,350,450,550,650,750,800),
possessionEnd = c(130,200,220,280,400,499,600,700,800,950)
)
interval timeoutStart timeoutEnd cheeringStart cheeringEnd possessionStart possessionEnd
1 280 310 1 120 80 130
2 500 530 181 199 180 200
3 NA NA 205 300 210 220
4 NA NA 330 420 250 280
5 NA NA 460 475 350 400
6 NA NA 740 760 450 499
7 NA NA NA NA 550 600
8 NA NA NA NA 650 700
9 NA NA NA NA 750 800
10 NA NA NA NA 800 950
在上面的最小工作示例中,我想计算球队花在欢呼或控球上的总时间(不包括暂停)。矩阵中的值表示每个结果(timeout
、cheering
或 possession
)的不同间隔的开始和结束时间(自游戏开始后经过的秒数)。结果不是相互排斥的,可以同时发生。但是,我不想“重复计算”cheering
和 possession
的重叠间隔。也就是说,我想合并 cheering
和 possession
的重叠区间,所以我可以对“唯一”区间求和。
例如,一次欢呼间隔发生在 740 到 760 秒之间,而一次控球间隔与该间隔重叠(750 到 800 秒)。合并间隔为 740 到 800 秒(持续时间 = 60 秒)。
合并 cheering
和 possession
的重叠间隔后,我想排除超时间隔。例如,对于从 205 到 300 秒的唯一间隔,我想排除从 280 到 310 秒的超时间隔。因此,不包括超时间隔的唯一间隔为 205 到 280 秒(持续时间 = 75 秒)。
我想计算每个唯一间隔的持续时间 (End
– Start
) 不包括超时间隔,然后计算所有这些唯一间隔持续时间的总和(不包括超时间隔).最后,我希望能够根据该行中另一个变量(keep
= 0 或 1)的值在计算中包含或排除间隔。
假设 Start
和 End
时间列没有预先排序。我还希望该方法具有通用性,以便能够轻松添加多个附加列集以包含在总和中(例如,运球、传球等)。我查看了其他答案,但没有找到一种方法将他们的解决方案推广到我的情况。
这里是使用 data.table
的 foverlaps()
执行重叠连接的解决方案。
这只是部分解决方案......提供所需的输出会有所帮助。但是您可以在此代码的基础上构建您想要的任何东西..
假设您的数据被命名为 df
library( data.table )
#create data.tables for cheers and possession
cheers.dt <- data.table( interval.cheer = df$interval,
start.cheer = df$cheeringStart,
end.cheer = df$cheeringEnd )[!is.na(start.cheer),]
possession.dt <- data.table( interval.pos = df$interval,
start.pos = df$possessionStart,
end.pos = df$possessionEnd )
#set keys
setkey( cheers.dt, start.cheer, end.cheer )
#perform overlap-join
foverlaps( possession.dt,
cheers.dt,
by.x = c( "start.pos", "end.pos" ),
type = "any",
mult = "all",
nomatch = NULL )
# interval.cheer start.cheer end.cheer interval.pos start.pos end.pos
# 1: 1 1 120 1 80 130
# 2: 2 181 199 2 180 200
# 3: 3 205 300 3 210 220
# 4: 3 205 300 4 250 280
# 5: 4 330 420 5 350 400
# 6: 5 460 475 6 450 499
# 7: 6 740 760 9 750 800
我建议您阅读有关 data.table
的 foverlaps()
函数和非等值连接的内容。
这个怎么样?
mydata <- data.frame(interval = c(1,2,3,4,5,6,7,8,9,10),
timeoutStart = c(280,500,NA,NA,NA,NA,NA,NA,NA,NA),
timeoutEnd = c(310,530,NA,NA,NA,NA,NA,NA,NA,NA),
cheeringStart = c(1,181,205,330,460,740,NA,NA,NA,NA),
cheeringEnd = c(120,199,300,420,475,760,NA,NA,NA,NA),
possessionStart = c(80,180,210,250,350,450,550,650,750,800),
possessionEnd = c(130,200,220,280,400,499,600,700,800,950),
keep = c(rep(FALSE, 2), rep(TRUE, 8)) #added for illustration
)
#add whatever columns you want to use to calculate the merged interval
#they must be in the same order in both vectors
#e.g. if 'cheeringStart' is at index 1, so must 'cheeringEnd'
intervalStartCols <- c('cheeringStart', 'possessionStart')
intervalEndCols <- c('cheeringEnd', 'possessionEnd')
intervalCols <- c(intervalStartCols, intervalEndCols)
timeoutCols <- c('timeoutStart', 'timeoutEnd')
mydata$mergedDuration <- apply(mydata, MARGIN = 1, FUN = function(row){
#return zero if all NAs
if(all(is.na(row[intervalCols]))) return(0)
if(!all(is.na(row[timeoutCols]))){
timeout.start <- row['timeoutStart']
timeout.end <- row['timeoutEnd']
} else {
timeout.end <- 0
}
#identify the maximum time (this will be the end of the merged interval)
max.end <- max(row[intervalEndCols], na.rm=TRUE)
#set intial values
duration <- 0
segment.complete <- FALSE
start.i <- which(row[intervalStartCols] == min(row[intervalStartCols], na.rm=TRUE))
next.step <- row[intervalStartCols][start.i]
waypoints <- row[intervalCols]
waypoints <- waypoints[!is.na(waypoints)]
waypoints <- waypoints[waypoints!=next.step]
#calculate interval duration adjusting for overlap
while(next.step < max.end){
start <- row[intervalStartCols][start.i]
next.step <- waypoints[waypoints == min(waypoints[waypoints!=next.step])]
if(segment.complete){
start.i <- which(row[intervalStartCols] == next.step)
segment.complete <- FALSE
}
end.i <- which(row[intervalEndCols] == next.step)
waypoints <- waypoints[waypoints!=next.step]
if(length(end.i) > 0 && length(start.i) >0 && end.i == start.i) {
segment.start <- row[intervalStartCols][start.i]
segment.end <- row[intervalEndCols][end.i]
segment.duration <- segment.end - segment.start
#adjust for timeout
timeout.adj <- {
if (timeout.end == 0) 0 #this is the NA case
else if(timeout.start > segment.end | timeout.end < segment.start) 0
else if(timeout.end > segment.end & timeout.start < segment.start) segment.duration
else if(timeout.end < segment.end) timeout.end - segment.start
else segment.end - timeout.start
}
duration <- duration + segment.duration - timeout.adj
segment.complete <- TRUE
}
}
duration
})
#sum duration using 'keep' column as mask
summed.duration <- sum(mydata[mydata$keep, 'mergedDuration'])
print(summed.duration)
我正在尝试合并重叠间隔以计算唯一间隔的总和,同时删除排除的间隔。
这是一个最小的工作示例:
mydata <- data.frame(interval = c(1,2,3,4,5,6,7,8,9,10),
timeoutStart = c(280,500,NA,NA,NA,NA,NA,NA,NA,NA),
timeoutEnd = c(310,530,NA,NA,NA,NA,NA,NA,NA,NA),
cheeringStart = c(1,181,205,330,460,740,NA,NA,NA,NA),
cheeringEnd = c(120,199,300,420,475,760,NA,NA,NA,NA),
possessionStart = c(80,180,210,250,350,450,550,650,750,800),
possessionEnd = c(130,200,220,280,400,499,600,700,800,950)
)
interval timeoutStart timeoutEnd cheeringStart cheeringEnd possessionStart possessionEnd
1 280 310 1 120 80 130
2 500 530 181 199 180 200
3 NA NA 205 300 210 220
4 NA NA 330 420 250 280
5 NA NA 460 475 350 400
6 NA NA 740 760 450 499
7 NA NA NA NA 550 600
8 NA NA NA NA 650 700
9 NA NA NA NA 750 800
10 NA NA NA NA 800 950
在上面的最小工作示例中,我想计算球队花在欢呼或控球上的总时间(不包括暂停)。矩阵中的值表示每个结果(timeout
、cheering
或 possession
)的不同间隔的开始和结束时间(自游戏开始后经过的秒数)。结果不是相互排斥的,可以同时发生。但是,我不想“重复计算”cheering
和 possession
的重叠间隔。也就是说,我想合并 cheering
和 possession
的重叠区间,所以我可以对“唯一”区间求和。
例如,一次欢呼间隔发生在 740 到 760 秒之间,而一次控球间隔与该间隔重叠(750 到 800 秒)。合并间隔为 740 到 800 秒(持续时间 = 60 秒)。
合并 cheering
和 possession
的重叠间隔后,我想排除超时间隔。例如,对于从 205 到 300 秒的唯一间隔,我想排除从 280 到 310 秒的超时间隔。因此,不包括超时间隔的唯一间隔为 205 到 280 秒(持续时间 = 75 秒)。
我想计算每个唯一间隔的持续时间 (End
– Start
) 不包括超时间隔,然后计算所有这些唯一间隔持续时间的总和(不包括超时间隔).最后,我希望能够根据该行中另一个变量(keep
= 0 或 1)的值在计算中包含或排除间隔。
假设 Start
和 End
时间列没有预先排序。我还希望该方法具有通用性,以便能够轻松添加多个附加列集以包含在总和中(例如,运球、传球等)。我查看了其他答案,但没有找到一种方法将他们的解决方案推广到我的情况。
这里是使用 data.table
的 foverlaps()
执行重叠连接的解决方案。
这只是部分解决方案......提供所需的输出会有所帮助。但是您可以在此代码的基础上构建您想要的任何东西..
假设您的数据被命名为 df
library( data.table )
#create data.tables for cheers and possession
cheers.dt <- data.table( interval.cheer = df$interval,
start.cheer = df$cheeringStart,
end.cheer = df$cheeringEnd )[!is.na(start.cheer),]
possession.dt <- data.table( interval.pos = df$interval,
start.pos = df$possessionStart,
end.pos = df$possessionEnd )
#set keys
setkey( cheers.dt, start.cheer, end.cheer )
#perform overlap-join
foverlaps( possession.dt,
cheers.dt,
by.x = c( "start.pos", "end.pos" ),
type = "any",
mult = "all",
nomatch = NULL )
# interval.cheer start.cheer end.cheer interval.pos start.pos end.pos
# 1: 1 1 120 1 80 130
# 2: 2 181 199 2 180 200
# 3: 3 205 300 3 210 220
# 4: 3 205 300 4 250 280
# 5: 4 330 420 5 350 400
# 6: 5 460 475 6 450 499
# 7: 6 740 760 9 750 800
我建议您阅读有关 data.table
的 foverlaps()
函数和非等值连接的内容。
这个怎么样?
mydata <- data.frame(interval = c(1,2,3,4,5,6,7,8,9,10),
timeoutStart = c(280,500,NA,NA,NA,NA,NA,NA,NA,NA),
timeoutEnd = c(310,530,NA,NA,NA,NA,NA,NA,NA,NA),
cheeringStart = c(1,181,205,330,460,740,NA,NA,NA,NA),
cheeringEnd = c(120,199,300,420,475,760,NA,NA,NA,NA),
possessionStart = c(80,180,210,250,350,450,550,650,750,800),
possessionEnd = c(130,200,220,280,400,499,600,700,800,950),
keep = c(rep(FALSE, 2), rep(TRUE, 8)) #added for illustration
)
#add whatever columns you want to use to calculate the merged interval
#they must be in the same order in both vectors
#e.g. if 'cheeringStart' is at index 1, so must 'cheeringEnd'
intervalStartCols <- c('cheeringStart', 'possessionStart')
intervalEndCols <- c('cheeringEnd', 'possessionEnd')
intervalCols <- c(intervalStartCols, intervalEndCols)
timeoutCols <- c('timeoutStart', 'timeoutEnd')
mydata$mergedDuration <- apply(mydata, MARGIN = 1, FUN = function(row){
#return zero if all NAs
if(all(is.na(row[intervalCols]))) return(0)
if(!all(is.na(row[timeoutCols]))){
timeout.start <- row['timeoutStart']
timeout.end <- row['timeoutEnd']
} else {
timeout.end <- 0
}
#identify the maximum time (this will be the end of the merged interval)
max.end <- max(row[intervalEndCols], na.rm=TRUE)
#set intial values
duration <- 0
segment.complete <- FALSE
start.i <- which(row[intervalStartCols] == min(row[intervalStartCols], na.rm=TRUE))
next.step <- row[intervalStartCols][start.i]
waypoints <- row[intervalCols]
waypoints <- waypoints[!is.na(waypoints)]
waypoints <- waypoints[waypoints!=next.step]
#calculate interval duration adjusting for overlap
while(next.step < max.end){
start <- row[intervalStartCols][start.i]
next.step <- waypoints[waypoints == min(waypoints[waypoints!=next.step])]
if(segment.complete){
start.i <- which(row[intervalStartCols] == next.step)
segment.complete <- FALSE
}
end.i <- which(row[intervalEndCols] == next.step)
waypoints <- waypoints[waypoints!=next.step]
if(length(end.i) > 0 && length(start.i) >0 && end.i == start.i) {
segment.start <- row[intervalStartCols][start.i]
segment.end <- row[intervalEndCols][end.i]
segment.duration <- segment.end - segment.start
#adjust for timeout
timeout.adj <- {
if (timeout.end == 0) 0 #this is the NA case
else if(timeout.start > segment.end | timeout.end < segment.start) 0
else if(timeout.end > segment.end & timeout.start < segment.start) segment.duration
else if(timeout.end < segment.end) timeout.end - segment.start
else segment.end - timeout.start
}
duration <- duration + segment.duration - timeout.adj
segment.complete <- TRUE
}
}
duration
})
#sum duration using 'keep' column as mask
summed.duration <- sum(mydata[mydata$keep, 'mergedDuration'])
print(summed.duration)