R 时间间隔:当样本超过小时标记时按一天中的小时分组
R Time Intervals: Grouping by hour of day when sample goes over the hour mark
我有两种鸟类行为持续时间的视频数据,当鸟在巢上时和鸟离开巢时。对于我的分析,我需要获取每小时上下巢的持续时间。然而,不同的行为常常会重叠小时标记。例如,这只鸟在 4:10-4:42 和 4:50 - 5:20 的巢穴上,我需要将第二个时间段分隔为 4:50-5:00和 5:00-5:20 这样我就可以每小时求和了。我已经用 package lubridate 寻找了很长一段时间,但没有找到这样做的方法,但我认为必须有一些东西在那里。有什么建议吗?
示例数据如下。 "off.time.diff" 是 "off.bout.id" 之间的秒数差异,"on.time.diff" 也是如此。举个例子,小鸟从 17:25:39 到 18:03:29。我可以获得总时间(2270 秒),但无法弄清楚如何将它每小时分开。
Event DT.event off.bout.ID on.bout.ID off.time.diff on.time.diff
off 4/27/12 17:25:13 1 0 NA NA
on 4/27/12 17:25:39 1 1 26 NA
off 4/27/12 18:03:29 2 1 NA 2270
on 4/27/12 18:03:57 2 2 28 NA
off 4/27/12 19:41:16 3 2 NA 5839
on 4/27/12 19:43:50 3 3 154 NA
off 4/28/12 6:23:57 4 3 NA 38407
on 4/28/12 6:32:13 4 4 496 NA
off 4/28/12 6:40:20 5 4 NA 487
on 4/28/12 6:40:48 5 5 28 NA
off 4/28/12 8:16:07 6 5 NA 5719
这是一个想法
library(dplyr)
library(lubridate)
# Yours data
a =
read.csv(header = F, sep = ";", stringsAsFactors = F,
col.names = c("Event","DT.event","off.bout.ID","on.bout.ID","off.time.diff","on.time.diff"),
text = gsub(pattern = "\s+{2}",replacement = ";",
x="off 4/27/12 17:25:13 1 0 NA NA
on 4/27/12 17:25:39 1 1 26 NA
off 4/27/12 18:03:29 2 1 NA 2270
on 4/27/12 18:03:57 2 2 28 NA
off 4/27/12 19:41:16 3 2 NA 5839
on 4/27/12 19:43:50 3 3 154 NA
off 4/28/12 6:23:57 4 3 NA 38407
on 4/28/12 6:32:13 4 4 496 NA
off 4/28/12 6:40:20 5 4 NA 487
on 4/28/12 6:40:48 5 5 28 NA
off 4/28/12 8:16:07 6 5 NA 5719"
)
) %>% mutate(DT.event = as.POSIXct(DT.event, format = "%m/%d/%Y %H:%M:%S")
)
# Ordering by time, if it isn't ordered
a = a[order(a$DT.event),]
# Build a trick column to calculate time difs with 'next_event'
a[,"next_eve"] = as.POSIXct(c(a$DT.event[2:nrow(a)],NA))
# Build column with time difference by "complete" hours
a = a %>%
mutate(dif_comp_hour_sec =
case_when(
floor_date(next_eve,unit = "hour") > floor_date(next_eve,unit = "hour") ~ as.numeric(floor_date(next_eve,unit = "hour") - DT.event),
T ~ as.numeric(next_eve - DT.event )
)
)
如果需要,您可以使用列 "Event" 再次拆分为 on/off 列。
此处输出:
# Event DT.event off.bout.ID on.bout.ID off.time.diff on.time.diff next_eve dif_comp_hour_sec
# 1 off 12-04-27 17:25:13 1 0 NA NA 12-04-27 17:25:39 26
# 2 on 12-04-27 17:25:39 1 1 26 NA 12-04-27 18:03:29 2270
# 3 off 12-04-27 18:03:29 2 1 NA 2270 12-04-27 18:03:57 28
# 4 on 12-04-27 18:03:57 2 2 28 NA 12-04-27 19:41:16 5839
# 5 off 12-04-27 19:41:16 3 2 NA 5839 12-04-27 19:43:50 154
# 6 on 12-04-27 19:43:50 3 3 154 NA 12-04-28 06:23:57 38407
# 7 off 12-04-28 06:23:57 4 3 NA 38407 12-04-28 06:32:13 496
# 8 on 12-04-28 06:32:13 4 4 496 NA 12-04-28 06:40:20 487
# 9 off 12-04-28 06:40:20 5 4 NA 487 12-04-28 06:40:48 28
# 10 on 12-04-28 06:40:48 5 5 28 NA 12-04-28 08:16:07 5719
# 11 off 12-04-28 08:16:07 6 5 NA 5719 <NA> NA
我的建议背后的想法是检查每个事件经过了多少个完整的小时标记,并为每个小时插入一个额外的行并相应地更改时间...
加载示例数据:
df <- read.table(text='Event DT.event off.bout.ID on.bout.ID off.time.diff on.time.diff
off 4/27/12-17:25:13 1 0 NA NA
on 4/27/12-17:25:39 1 1 26 NA
off 4/27/12-18:03:29 2 1 NA 2270
on 4/27/12-18:03:57 2 2 28 NA
off 4/27/12-19:41:16 3 2 NA 5839
on 4/27/12-19:43:50 3 3 154 NA
off 4/28/12-6:23:57 4 3 NA 38407
on 4/28/12-6:32:13 4 4 496 NA
off 4/28/12-6:40:20 5 4 NA 487
on 4/28/12-6:40:48 5 5 28 NA
off 4/28/12-8:16:07 6 5 NA 5719', header=T, stringsAsFactors=F)
设置日期时间变量。如果需要,调整 tz
参数:
df$DT.event <- as.POSIXct(df$DT.event, format = "%m/%d/%y-%H:%M:%S")
library(dplyr)
library(tidyr)
# reshape data
#
df2 <- df %>%
select(Event, DT.event, on.bout.ID) %>%
pivot_wider(names_from = Event,
values_from = DT.event) %>%
select(on.bout.ID, on, off)
df2
是 df
的一些更广泛形式的信息:
on.bout.ID on off
<int> <dttm> <dttm>
1 0 NA 2012-04-27 17:25:13
2 1 2012-04-27 17:25:39 2012-04-27 18:03:29
3 2 2012-04-27 18:03:57 2012-04-27 19:41:16
4 3 2012-04-27 19:43:50 2012-04-28 06:23:57
5 4 2012-04-28 06:32:13 2012-04-28 06:40:20
6 5 2012-04-28 06:40:48 2012-04-28 08:16:07
# Make a copy so we don't mutate the object we are using to iterate
#
df3 <- df2
for (i in seq_along(df2$on.bout.ID)) {
# extract current iterations start and end time
#
id <- df2$on.bout.ID[i]
from <- df2$on[i]
to <- df2$off[i]
# calculate number of rows to insert
#
hoursDiff <- as.numeric(format(to, "%H")) - as.numeric(format(from , "%H"))
# compensate for crossing of midnight (00:00AM)
# by adding 24
#
hoursDiff <- as.integer(difftime(as.Date(to), as.Date(from), unit="days")) * 24 + hoursDiff
# if there is at least on pass of the full hour, insert a copy of the
# current row but adapt on and off times
#
if (!is.na(hoursDiff) & hoursDiff > 0) {
for (hour in 1:hoursDiff) {
# startime of this additional row
#
fromTime <- as.POSIXct(paste0(format(from + 3600 * hour, "%m/%d/%y-%H"), ":00:00"), format="%m/%d/%y-%H:%M:%S")
# Maximal endtime of this additional row
#
toTime <- fromTime + 3599
# copy current line
#
insert <- df2[i, ]
# set start time for this new row to full hour
#
insert$on <- fromTime
# if this is the last row to insert do NOT adapt off time
#
if (!(toTime > to)) {
insert$off <- toTime
}
# add additional row
#
df3 <- rbind(df3, insert)
}
# set off-time for the current line to end of first hour
#
df3[df3$on.bout.ID == id & df3$on == from & df3$off == to,]$off <- as.POSIXct(paste0(format(from, "%m/%d/%y-%H"), ":59:59"), format="%m/%d/%y-%H:%M:%S")
}
}
# Use `dplyr` to sort result
#
library(dplyr)
df3 %>% arrange(on.bout.ID, on)
# A tibble: 21 x 3
on.bout.ID on off
<int> <dttm> <dttm>
1 0 NA 2012-04-27 17:25:13
2 1 2012-04-27 17:25:39 2012-04-27 17:59:59
3 1 2012-04-27 18:00:00 2012-04-27 18:03:29
4 2 2012-04-27 18:03:57 2012-04-27 18:59:59
5 2 2012-04-27 19:00:00 2012-04-27 19:41:16
6 3 2012-04-27 19:43:50 2012-04-27 19:59:59
7 3 2012-04-27 20:00:00 2012-04-27 20:59:59
8 3 2012-04-27 21:00:00 2012-04-27 21:59:59
9 3 2012-04-27 22:00:00 2012-04-27 22:59:59
10 3 2012-04-27 23:00:00 2012-04-27 23:59:59
# … with 11 more rows
漂亮吗?不!
它有效吗?我觉得是
编辑:
已添加
hoursDiff <- as.integer(difftime(as.Date(to), as.Date(from), unit="days")) * 24 + hoursDiff
扩展午夜穿越功能
可以使用 tidyverse 做出比 Dario 更漂亮的解决方案:
读取数据
a =
read.csv(header = F, sep = ";",
col.names = c("Event","DT.event","off.bout.ID","on.bout.ID","off.time.diff","on.time.diff"),
text = gsub(pattern = "\s+{2}",replacement = ";",
x="off 4/27/12 17:25:13 1 0 NA NA
on 4/27/12 17:25:39 1 1 26 NA
off 4/27/12 18:03:29 2 1 NA 2270
on 4/27/12 18:03:57 2 2 28 NA
off 4/27/12 19:41:16 3 2 NA 5839
on 4/27/12 19:43:50 3 3 154 NA
off 4/28/12 6:23:57 4 3 NA 38407
on 4/28/12 6:32:13 4 4 496 NA
off 4/28/12 6:40:20 5 4 NA 487
on 4/28/12 6:40:48 5 5 28 NA
off 4/28/12 8:16:07 6 5 NA 5719"
)
)
a$DT.event <- mdy_hms(a$DT.event)
添加一个包含可能感兴趣的时间的新行
b <- a %>% select(DT.event) %>%
mutate(DT.event = floor_date(DT.event,"hours")) %>%
group_by(DT.event) %>%
summarise() %>%
full_join(a) %>%
arrange(DT.event)
找不同
c <- b %>% fill(Event, .direction = "up") %>%
mutate(on.time.diff.hour = ifelse(Event == "off",
difftime(DT.event, lag(DT.event),
"secs"), NA))
你只需要注意检查天气你在第二行有一个额外的值(因为之前没有)。
结果
# A tibble: 16 x 7
DT.event Event off.bout.ID on.bout.ID off.time.diff on.time.diff on.time.diff.hour
<dttm> <fct> <int> <int> <int> <int> <dbl>
1 2012-04-27 17:00:00 off NA NA NA NA NA
2 2012-04-27 17:25:13 off 1 0 NA NA 1513
3 2012-04-27 17:25:39 on 1 1 26 NA NA
4 2012-04-27 18:00:00 off NA NA NA NA 2061
5 2012-04-27 18:03:29 off 2 1 NA 2270 209
6 2012-04-27 18:03:57 on 2 2 28 NA NA
7 2012-04-27 19:00:00 off NA NA NA NA 3363
8 2012-04-27 19:41:16 off 3 2 NA 5839 2476
9 2012-04-27 19:43:50 on 3 3 154 NA NA
10 2012-04-28 06:00:00 off NA NA NA NA 36970
11 2012-04-28 06:23:57 off 4 3 NA 38407 1437
12 2012-04-28 06:32:13 on 4 4 496 NA NA
13 2012-04-28 06:40:20 off 5 4 NA 487 487
14 2012-04-28 06:40:48 on 5 5 28 NA NA
15 2012-04-28 08:00:00 off NA NA NA NA 4752
16 2012-04-28 08:16:07 off 6 5 NA 5719 967
这是一个使用 data.table
的选项:
#create a lookup table of hourly data (to be dyn, you can use round(min()-1hr) and round(max()+1hr) to generate your hourly data
hourly <- data.table(HOUR=seq(as.POSIXct("20120427 170000", format="%Y%m%d %H%M%S"),
as.POSIXct("20120428 090000", format="%Y%m%d %H%M%S"),
by="1 hour"))[, DT.event := HOUR]
#get end of event from the row below
DT[, endDT.event := shift(DT.event, -1L)]
#perform rolling join to find the closest hour after this event time
DT[, hr_aft := hourly[.SD, on=.(DT.event), roll=-Inf, HOUR]]
#for those that cut across the hour mark, split into 2, if it can be more than 1hr, we can update this part to include that possibility
ovlhr <- DT[hr_aft < endDT.event]
ovlhr <- ovlhr[, .(Event, DT.event=c(DT.event, hr_aft), endDT.event=c(hr_aft, endDT.event)),
1L:nrow(ovlhr)][, (1L) := NULL]
#append both dataset to get final desired output
rbindlist(list(DT[hr_aft>=endDT.event][, hr_aft := NULL], ovlhr))[order(DT.event)]
输出:
Event DT.event endDT.event
1: off 2012-04-27 17:25:13 2012-04-27 17:25:39
2: on 2012-04-27 17:25:39 2012-04-27 18:00:00
3: on 2012-04-27 18:00:00 2012-04-27 18:03:29
4: off 2012-04-27 18:03:29 2012-04-27 18:03:57
5: on 2012-04-27 18:03:57 2012-04-27 19:00:00
6: on 2012-04-27 19:00:00 2012-04-27 19:41:16
7: off 2012-04-27 19:41:16 2012-04-27 19:43:50
8: on 2012-04-27 19:43:50 2012-04-27 20:00:00
9: on 2012-04-27 20:00:00 2012-04-28 06:23:57
10: off 2012-04-28 06:23:57 2012-04-28 06:32:13
11: on 2012-04-28 06:32:13 2012-04-28 06:40:20
12: off 2012-04-28 06:40:20 2012-04-28 06:40:48
13: on 2012-04-28 06:40:48 2012-04-28 07:00:00
14: on 2012-04-28 07:00:00 2012-04-28 08:16:07
数据:
library(data.table)
DT <- fread("Event DT.event
off 4/27/12_17:25:13
on 4/27/12_17:25:39
off 4/27/12_18:03:29
on 4/27/12_18:03:57
off 4/27/12_19:41:16
on 4/27/12_19:43:50
off 4/28/12_6:23:57
on 4/28/12_6:32:13
off 4/28/12_6:40:20
on 4/28/12_6:40:48
off 4/28/12_8:16:07")
我有两种鸟类行为持续时间的视频数据,当鸟在巢上时和鸟离开巢时。对于我的分析,我需要获取每小时上下巢的持续时间。然而,不同的行为常常会重叠小时标记。例如,这只鸟在 4:10-4:42 和 4:50 - 5:20 的巢穴上,我需要将第二个时间段分隔为 4:50-5:00和 5:00-5:20 这样我就可以每小时求和了。我已经用 package lubridate 寻找了很长一段时间,但没有找到这样做的方法,但我认为必须有一些东西在那里。有什么建议吗?
示例数据如下。 "off.time.diff" 是 "off.bout.id" 之间的秒数差异,"on.time.diff" 也是如此。举个例子,小鸟从 17:25:39 到 18:03:29。我可以获得总时间(2270 秒),但无法弄清楚如何将它每小时分开。
Event DT.event off.bout.ID on.bout.ID off.time.diff on.time.diff
off 4/27/12 17:25:13 1 0 NA NA
on 4/27/12 17:25:39 1 1 26 NA
off 4/27/12 18:03:29 2 1 NA 2270
on 4/27/12 18:03:57 2 2 28 NA
off 4/27/12 19:41:16 3 2 NA 5839
on 4/27/12 19:43:50 3 3 154 NA
off 4/28/12 6:23:57 4 3 NA 38407
on 4/28/12 6:32:13 4 4 496 NA
off 4/28/12 6:40:20 5 4 NA 487
on 4/28/12 6:40:48 5 5 28 NA
off 4/28/12 8:16:07 6 5 NA 5719
这是一个想法
library(dplyr)
library(lubridate)
# Yours data
a =
read.csv(header = F, sep = ";", stringsAsFactors = F,
col.names = c("Event","DT.event","off.bout.ID","on.bout.ID","off.time.diff","on.time.diff"),
text = gsub(pattern = "\s+{2}",replacement = ";",
x="off 4/27/12 17:25:13 1 0 NA NA
on 4/27/12 17:25:39 1 1 26 NA
off 4/27/12 18:03:29 2 1 NA 2270
on 4/27/12 18:03:57 2 2 28 NA
off 4/27/12 19:41:16 3 2 NA 5839
on 4/27/12 19:43:50 3 3 154 NA
off 4/28/12 6:23:57 4 3 NA 38407
on 4/28/12 6:32:13 4 4 496 NA
off 4/28/12 6:40:20 5 4 NA 487
on 4/28/12 6:40:48 5 5 28 NA
off 4/28/12 8:16:07 6 5 NA 5719"
)
) %>% mutate(DT.event = as.POSIXct(DT.event, format = "%m/%d/%Y %H:%M:%S")
)
# Ordering by time, if it isn't ordered
a = a[order(a$DT.event),]
# Build a trick column to calculate time difs with 'next_event'
a[,"next_eve"] = as.POSIXct(c(a$DT.event[2:nrow(a)],NA))
# Build column with time difference by "complete" hours
a = a %>%
mutate(dif_comp_hour_sec =
case_when(
floor_date(next_eve,unit = "hour") > floor_date(next_eve,unit = "hour") ~ as.numeric(floor_date(next_eve,unit = "hour") - DT.event),
T ~ as.numeric(next_eve - DT.event )
)
)
如果需要,您可以使用列 "Event" 再次拆分为 on/off 列。
此处输出:
# Event DT.event off.bout.ID on.bout.ID off.time.diff on.time.diff next_eve dif_comp_hour_sec
# 1 off 12-04-27 17:25:13 1 0 NA NA 12-04-27 17:25:39 26
# 2 on 12-04-27 17:25:39 1 1 26 NA 12-04-27 18:03:29 2270
# 3 off 12-04-27 18:03:29 2 1 NA 2270 12-04-27 18:03:57 28
# 4 on 12-04-27 18:03:57 2 2 28 NA 12-04-27 19:41:16 5839
# 5 off 12-04-27 19:41:16 3 2 NA 5839 12-04-27 19:43:50 154
# 6 on 12-04-27 19:43:50 3 3 154 NA 12-04-28 06:23:57 38407
# 7 off 12-04-28 06:23:57 4 3 NA 38407 12-04-28 06:32:13 496
# 8 on 12-04-28 06:32:13 4 4 496 NA 12-04-28 06:40:20 487
# 9 off 12-04-28 06:40:20 5 4 NA 487 12-04-28 06:40:48 28
# 10 on 12-04-28 06:40:48 5 5 28 NA 12-04-28 08:16:07 5719
# 11 off 12-04-28 08:16:07 6 5 NA 5719 <NA> NA
我的建议背后的想法是检查每个事件经过了多少个完整的小时标记,并为每个小时插入一个额外的行并相应地更改时间...
加载示例数据:
df <- read.table(text='Event DT.event off.bout.ID on.bout.ID off.time.diff on.time.diff
off 4/27/12-17:25:13 1 0 NA NA
on 4/27/12-17:25:39 1 1 26 NA
off 4/27/12-18:03:29 2 1 NA 2270
on 4/27/12-18:03:57 2 2 28 NA
off 4/27/12-19:41:16 3 2 NA 5839
on 4/27/12-19:43:50 3 3 154 NA
off 4/28/12-6:23:57 4 3 NA 38407
on 4/28/12-6:32:13 4 4 496 NA
off 4/28/12-6:40:20 5 4 NA 487
on 4/28/12-6:40:48 5 5 28 NA
off 4/28/12-8:16:07 6 5 NA 5719', header=T, stringsAsFactors=F)
设置日期时间变量。如果需要,调整 tz
参数:
df$DT.event <- as.POSIXct(df$DT.event, format = "%m/%d/%y-%H:%M:%S")
library(dplyr)
library(tidyr)
# reshape data
#
df2 <- df %>%
select(Event, DT.event, on.bout.ID) %>%
pivot_wider(names_from = Event,
values_from = DT.event) %>%
select(on.bout.ID, on, off)
df2
是 df
的一些更广泛形式的信息:
on.bout.ID on off <int> <dttm> <dttm> 1 0 NA 2012-04-27 17:25:13 2 1 2012-04-27 17:25:39 2012-04-27 18:03:29 3 2 2012-04-27 18:03:57 2012-04-27 19:41:16 4 3 2012-04-27 19:43:50 2012-04-28 06:23:57 5 4 2012-04-28 06:32:13 2012-04-28 06:40:20 6 5 2012-04-28 06:40:48 2012-04-28 08:16:07
# Make a copy so we don't mutate the object we are using to iterate
#
df3 <- df2
for (i in seq_along(df2$on.bout.ID)) {
# extract current iterations start and end time
#
id <- df2$on.bout.ID[i]
from <- df2$on[i]
to <- df2$off[i]
# calculate number of rows to insert
#
hoursDiff <- as.numeric(format(to, "%H")) - as.numeric(format(from , "%H"))
# compensate for crossing of midnight (00:00AM)
# by adding 24
#
hoursDiff <- as.integer(difftime(as.Date(to), as.Date(from), unit="days")) * 24 + hoursDiff
# if there is at least on pass of the full hour, insert a copy of the
# current row but adapt on and off times
#
if (!is.na(hoursDiff) & hoursDiff > 0) {
for (hour in 1:hoursDiff) {
# startime of this additional row
#
fromTime <- as.POSIXct(paste0(format(from + 3600 * hour, "%m/%d/%y-%H"), ":00:00"), format="%m/%d/%y-%H:%M:%S")
# Maximal endtime of this additional row
#
toTime <- fromTime + 3599
# copy current line
#
insert <- df2[i, ]
# set start time for this new row to full hour
#
insert$on <- fromTime
# if this is the last row to insert do NOT adapt off time
#
if (!(toTime > to)) {
insert$off <- toTime
}
# add additional row
#
df3 <- rbind(df3, insert)
}
# set off-time for the current line to end of first hour
#
df3[df3$on.bout.ID == id & df3$on == from & df3$off == to,]$off <- as.POSIXct(paste0(format(from, "%m/%d/%y-%H"), ":59:59"), format="%m/%d/%y-%H:%M:%S")
}
}
# Use `dplyr` to sort result
#
library(dplyr)
df3 %>% arrange(on.bout.ID, on)
# A tibble: 21 x 3 on.bout.ID on off <int> <dttm> <dttm> 1 0 NA 2012-04-27 17:25:13 2 1 2012-04-27 17:25:39 2012-04-27 17:59:59 3 1 2012-04-27 18:00:00 2012-04-27 18:03:29 4 2 2012-04-27 18:03:57 2012-04-27 18:59:59 5 2 2012-04-27 19:00:00 2012-04-27 19:41:16 6 3 2012-04-27 19:43:50 2012-04-27 19:59:59 7 3 2012-04-27 20:00:00 2012-04-27 20:59:59 8 3 2012-04-27 21:00:00 2012-04-27 21:59:59 9 3 2012-04-27 22:00:00 2012-04-27 22:59:59 10 3 2012-04-27 23:00:00 2012-04-27 23:59:59 # … with 11 more rows
漂亮吗?不! 它有效吗?我觉得是
编辑:
已添加
hoursDiff <- as.integer(difftime(as.Date(to), as.Date(from), unit="days")) * 24 + hoursDiff
扩展午夜穿越功能
可以使用 tidyverse 做出比 Dario 更漂亮的解决方案:
读取数据
a =
read.csv(header = F, sep = ";",
col.names = c("Event","DT.event","off.bout.ID","on.bout.ID","off.time.diff","on.time.diff"),
text = gsub(pattern = "\s+{2}",replacement = ";",
x="off 4/27/12 17:25:13 1 0 NA NA
on 4/27/12 17:25:39 1 1 26 NA
off 4/27/12 18:03:29 2 1 NA 2270
on 4/27/12 18:03:57 2 2 28 NA
off 4/27/12 19:41:16 3 2 NA 5839
on 4/27/12 19:43:50 3 3 154 NA
off 4/28/12 6:23:57 4 3 NA 38407
on 4/28/12 6:32:13 4 4 496 NA
off 4/28/12 6:40:20 5 4 NA 487
on 4/28/12 6:40:48 5 5 28 NA
off 4/28/12 8:16:07 6 5 NA 5719"
)
)
a$DT.event <- mdy_hms(a$DT.event)
添加一个包含可能感兴趣的时间的新行
b <- a %>% select(DT.event) %>%
mutate(DT.event = floor_date(DT.event,"hours")) %>%
group_by(DT.event) %>%
summarise() %>%
full_join(a) %>%
arrange(DT.event)
找不同
c <- b %>% fill(Event, .direction = "up") %>%
mutate(on.time.diff.hour = ifelse(Event == "off",
difftime(DT.event, lag(DT.event),
"secs"), NA))
你只需要注意检查天气你在第二行有一个额外的值(因为之前没有)。
结果
# A tibble: 16 x 7
DT.event Event off.bout.ID on.bout.ID off.time.diff on.time.diff on.time.diff.hour
<dttm> <fct> <int> <int> <int> <int> <dbl>
1 2012-04-27 17:00:00 off NA NA NA NA NA
2 2012-04-27 17:25:13 off 1 0 NA NA 1513
3 2012-04-27 17:25:39 on 1 1 26 NA NA
4 2012-04-27 18:00:00 off NA NA NA NA 2061
5 2012-04-27 18:03:29 off 2 1 NA 2270 209
6 2012-04-27 18:03:57 on 2 2 28 NA NA
7 2012-04-27 19:00:00 off NA NA NA NA 3363
8 2012-04-27 19:41:16 off 3 2 NA 5839 2476
9 2012-04-27 19:43:50 on 3 3 154 NA NA
10 2012-04-28 06:00:00 off NA NA NA NA 36970
11 2012-04-28 06:23:57 off 4 3 NA 38407 1437
12 2012-04-28 06:32:13 on 4 4 496 NA NA
13 2012-04-28 06:40:20 off 5 4 NA 487 487
14 2012-04-28 06:40:48 on 5 5 28 NA NA
15 2012-04-28 08:00:00 off NA NA NA NA 4752
16 2012-04-28 08:16:07 off 6 5 NA 5719 967
这是一个使用 data.table
的选项:
#create a lookup table of hourly data (to be dyn, you can use round(min()-1hr) and round(max()+1hr) to generate your hourly data
hourly <- data.table(HOUR=seq(as.POSIXct("20120427 170000", format="%Y%m%d %H%M%S"),
as.POSIXct("20120428 090000", format="%Y%m%d %H%M%S"),
by="1 hour"))[, DT.event := HOUR]
#get end of event from the row below
DT[, endDT.event := shift(DT.event, -1L)]
#perform rolling join to find the closest hour after this event time
DT[, hr_aft := hourly[.SD, on=.(DT.event), roll=-Inf, HOUR]]
#for those that cut across the hour mark, split into 2, if it can be more than 1hr, we can update this part to include that possibility
ovlhr <- DT[hr_aft < endDT.event]
ovlhr <- ovlhr[, .(Event, DT.event=c(DT.event, hr_aft), endDT.event=c(hr_aft, endDT.event)),
1L:nrow(ovlhr)][, (1L) := NULL]
#append both dataset to get final desired output
rbindlist(list(DT[hr_aft>=endDT.event][, hr_aft := NULL], ovlhr))[order(DT.event)]
输出:
Event DT.event endDT.event
1: off 2012-04-27 17:25:13 2012-04-27 17:25:39
2: on 2012-04-27 17:25:39 2012-04-27 18:00:00
3: on 2012-04-27 18:00:00 2012-04-27 18:03:29
4: off 2012-04-27 18:03:29 2012-04-27 18:03:57
5: on 2012-04-27 18:03:57 2012-04-27 19:00:00
6: on 2012-04-27 19:00:00 2012-04-27 19:41:16
7: off 2012-04-27 19:41:16 2012-04-27 19:43:50
8: on 2012-04-27 19:43:50 2012-04-27 20:00:00
9: on 2012-04-27 20:00:00 2012-04-28 06:23:57
10: off 2012-04-28 06:23:57 2012-04-28 06:32:13
11: on 2012-04-28 06:32:13 2012-04-28 06:40:20
12: off 2012-04-28 06:40:20 2012-04-28 06:40:48
13: on 2012-04-28 06:40:48 2012-04-28 07:00:00
14: on 2012-04-28 07:00:00 2012-04-28 08:16:07
数据:
library(data.table)
DT <- fread("Event DT.event
off 4/27/12_17:25:13
on 4/27/12_17:25:39
off 4/27/12_18:03:29
on 4/27/12_18:03:57
off 4/27/12_19:41:16
on 4/27/12_19:43:50
off 4/28/12_6:23:57
on 4/28/12_6:32:13
off 4/28/12_6:40:20
on 4/28/12_6:40:48
off 4/28/12_8:16:07")