已更新但仍无法正常工作 - 识别时间序列中的中断并为 R 中的每个中断分配唯一因子
Updated but still not working - Identfy breaks in time series and assign unique factor for each break in R
我之前问过这个问题 (),建议的解决方案当时有效,但我现在发现它并非在所有情况下都有效。我一直在试用该解决方案的变体,但没有成功。这里再次是简化的问题:
我有一个日期时间系列的船只位置,时间系列中有很大的差距。间隙代表血管轨道的断裂。我想为每条轨道添加一个唯一标识符。这是一些真实的数据;
time<-c("2019-01-23 00:33:58 GMT", "2019-01-23 12:10:27 GMT", "2019-01-23 13:49:29 GMT",
"2019-01-23 15:29:27 GMT", "2019-01-23 18:35:27 GMT", "2019-01-23 21:38:29 GMT",
"2019-01-28 14:52:10 GMT", "2019-01-28 16:31:37 GMT", "2019-01-28 18:07:40 GMT",
"2019-01-28 19:46:40 GMT", "2019-01-28 21:22:40 GMT", "2019-01-29 13:53:48 GMT",
"2019-01-29 15:25:48 GMT", "2019-01-29 18:43:54 GMT", "2019-01-29 20:19:56 GMT",
"2019-01-29 21:56:27 GMT", "2019-02-03 14:57:01 GMT", "2019-02-04 15:45:56 GMT",
"2019-02-04 16:49:57 GMT", "2019-02-05 17:46:05 GMT", "2019-02-05 18:03:06 GMT",
"2019-02-08 16:05:59 GMT", "2019-02-08 17:42:59 GMT", "2019-02-09 02:47:00 GMT")
我将 'track' 定义为相差小于 12 小时的连续点,大于 12 小时的任何点都被视为下一首曲目的一部分。使用以下代码找到中断点并唯一标记每个曲目,并将 'delete' 放置在无法与曲目关联的一次性点中。
library(dplyr)
library(stringr)
vessel<-rep(1, length(time))
df<-data.frame(vessel,time)
df$gap <- round(c(0, difftime(time[-1],time[-nrow(df)], units = "hours")),2)
df$within_thresh <- df$gap < 12 #12 hours difference
df %>%
mutate(split_factor = inverse.rle(within.list(rle(within_thresh),
values[values] <- str_c('track_', seq_along(values[values])))),
split_factor = replace(split_factor, !as.logical(split_factor), 'delete'))
问题在于它在实际上是下一首曲目的开始时间旁边放置了一个 'delete',例如请参阅此处的结果中的第 7 行应显示为 'track_2',或第 12 行应显示为 'track_3'。我想 'delete' 的一个真正要点是第 17 行的一次性位置。
vessel time gap within_thresh split_factor
1 1 2019-01-23 00:33:58 GMT 0.00 TRUE track_1
2 1 2019-01-23 12:10:27 GMT 11.61 TRUE track_1
3 1 2019-01-23 13:49:29 GMT 1.65 TRUE track_1
4 1 2019-01-23 15:29:27 GMT 1.67 TRUE track_1
5 1 2019-01-23 18:35:27 GMT 3.10 TRUE track_1
6 1 2019-01-23 21:38:29 GMT 3.05 TRUE track_1
7 1 2019-01-28 14:52:10 GMT 113.23 FALSE delete # actually track_2
8 1 2019-01-28 16:31:37 GMT 1.66 TRUE track_2
9 1 2019-01-28 18:07:40 GMT 1.60 TRUE track_2
10 1 2019-01-28 19:46:40 GMT 1.65 TRUE track_2
11 1 2019-01-28 21:22:40 GMT 1.60 TRUE track_2
12 1 2019-01-29 13:53:48 GMT 16.52 FALSE delete # actually track_3
13 1 2019-01-29 15:25:48 GMT 1.53 TRUE track_3
14 1 2019-01-29 18:43:54 GMT 3.30 TRUE track_3
15 1 2019-01-29 20:19:56 GMT 1.60 TRUE track_3
16 1 2019-01-29 21:56:27 GMT 1.61 TRUE track_3
17 1 2019-02-03 14:57:01 GMT 113.01 FALSE delete # correct, as is a one-off
18 1 2019-02-04 15:45:56 GMT 24.82 FALSE delete # actually track_4
19 1 2019-02-04 16:49:57 GMT 1.07 TRUE track_4
20 1 2019-02-05 17:46:05 GMT 24.94 FALSE delete # actually track_5
21 1 2019-02-05 18:03:06 GMT 0.28 TRUE track_5
22 1 2019-02-08 16:05:59 GMT 70.05 FALSE delete # actually track_6
23 1 2019-02-08 17:42:59 GMT 1.62 TRUE track_6
24 1 2019-02-09 02:47:00 GMT 9.07 TRUE track_6
最初的用户建议包括一个 | (或)语句 (rle(within_thresh|lead(within_thresh)
但这只标识了本例中的 2 个轨道。
欢迎提出任何建议。
您之所以会出现这种情况,是因为您只检查了每次之前的时间段,而不是之后的时间段。这是一个包含更多条件检查的变通方法,但 returns 正确的轨道每隔 12 小时中断一次并标识已删除的点。
df %>%
mutate(before = round(as.numeric(difftime(time, lag(time), units = "hours")), 2),
after = round(as.numeric(difftime(lead(time), time, units = "hours")), 2)) %>%
replace(is.na(.), 0) %>%
mutate(before = inverse.rle(within.list(rle(before < 12),
values[values] <- str_c('track_',
seq_along(values[values])))),
after = inverse.rle(within.list(rle(after < 12),
values[values] <- str_c('track_',
seq_along(values[values]))))) %>%
mutate(split_factor = case_when(before == "FALSE" ~ if_else(after == "FALSE", "delete", after),
TRUE ~ before)) %>%
select(vessel, time, split_factor)
vessel time split_factor
1 1 2019-01-23 00:33:58 track_1
2 1 2019-01-23 12:10:27 track_1
3 1 2019-01-23 13:49:29 track_1
4 1 2019-01-23 15:29:27 track_1
5 1 2019-01-23 18:35:27 track_1
6 1 2019-01-23 21:38:29 track_1
7 1 2019-01-28 14:52:10 track_2
8 1 2019-01-28 16:31:37 track_2
9 1 2019-01-28 18:07:40 track_2
10 1 2019-01-28 19:46:40 track_2
11 1 2019-01-28 21:22:40 track_2
12 1 2019-01-29 13:53:48 track_3
13 1 2019-01-29 15:25:48 track_3
14 1 2019-01-29 18:43:54 track_3
15 1 2019-01-29 20:19:56 track_3
16 1 2019-01-29 21:56:27 track_3
17 1 2019-02-03 14:57:01 delete
18 1 2019-02-04 15:45:56 track_4
19 1 2019-02-04 16:49:57 track_4
20 1 2019-02-05 17:46:05 track_5
21 1 2019-02-05 18:03:06 track_5
22 1 2019-02-08 16:05:59 track_6
23 1 2019-02-08 17:42:59 track_6
24 1 2019-02-09 02:47:00 track_6
怎么样:
with_id <- df %>%
mutate(track_id = if_else(gap > 12,1,0),
track_id = cumsum(track_id)) %>%
group_by(track_id) %>%
filter(n()>1)
如果你想让track_id连续并从1开始,你可以这样结束:
with_id$track_id <- with_id %>% group_indices()
基本思想是在我们开始新曲目的地方标记 1。然后cumsum
将标记不是起点的行。
我之前问过这个问题 (
我有一个日期时间系列的船只位置,时间系列中有很大的差距。间隙代表血管轨道的断裂。我想为每条轨道添加一个唯一标识符。这是一些真实的数据;
time<-c("2019-01-23 00:33:58 GMT", "2019-01-23 12:10:27 GMT", "2019-01-23 13:49:29 GMT",
"2019-01-23 15:29:27 GMT", "2019-01-23 18:35:27 GMT", "2019-01-23 21:38:29 GMT",
"2019-01-28 14:52:10 GMT", "2019-01-28 16:31:37 GMT", "2019-01-28 18:07:40 GMT",
"2019-01-28 19:46:40 GMT", "2019-01-28 21:22:40 GMT", "2019-01-29 13:53:48 GMT",
"2019-01-29 15:25:48 GMT", "2019-01-29 18:43:54 GMT", "2019-01-29 20:19:56 GMT",
"2019-01-29 21:56:27 GMT", "2019-02-03 14:57:01 GMT", "2019-02-04 15:45:56 GMT",
"2019-02-04 16:49:57 GMT", "2019-02-05 17:46:05 GMT", "2019-02-05 18:03:06 GMT",
"2019-02-08 16:05:59 GMT", "2019-02-08 17:42:59 GMT", "2019-02-09 02:47:00 GMT")
我将 'track' 定义为相差小于 12 小时的连续点,大于 12 小时的任何点都被视为下一首曲目的一部分。使用以下代码找到中断点并唯一标记每个曲目,并将 'delete' 放置在无法与曲目关联的一次性点中。
library(dplyr)
library(stringr)
vessel<-rep(1, length(time))
df<-data.frame(vessel,time)
df$gap <- round(c(0, difftime(time[-1],time[-nrow(df)], units = "hours")),2)
df$within_thresh <- df$gap < 12 #12 hours difference
df %>%
mutate(split_factor = inverse.rle(within.list(rle(within_thresh),
values[values] <- str_c('track_', seq_along(values[values])))),
split_factor = replace(split_factor, !as.logical(split_factor), 'delete'))
问题在于它在实际上是下一首曲目的开始时间旁边放置了一个 'delete',例如请参阅此处的结果中的第 7 行应显示为 'track_2',或第 12 行应显示为 'track_3'。我想 'delete' 的一个真正要点是第 17 行的一次性位置。
vessel time gap within_thresh split_factor
1 1 2019-01-23 00:33:58 GMT 0.00 TRUE track_1
2 1 2019-01-23 12:10:27 GMT 11.61 TRUE track_1
3 1 2019-01-23 13:49:29 GMT 1.65 TRUE track_1
4 1 2019-01-23 15:29:27 GMT 1.67 TRUE track_1
5 1 2019-01-23 18:35:27 GMT 3.10 TRUE track_1
6 1 2019-01-23 21:38:29 GMT 3.05 TRUE track_1
7 1 2019-01-28 14:52:10 GMT 113.23 FALSE delete # actually track_2
8 1 2019-01-28 16:31:37 GMT 1.66 TRUE track_2
9 1 2019-01-28 18:07:40 GMT 1.60 TRUE track_2
10 1 2019-01-28 19:46:40 GMT 1.65 TRUE track_2
11 1 2019-01-28 21:22:40 GMT 1.60 TRUE track_2
12 1 2019-01-29 13:53:48 GMT 16.52 FALSE delete # actually track_3
13 1 2019-01-29 15:25:48 GMT 1.53 TRUE track_3
14 1 2019-01-29 18:43:54 GMT 3.30 TRUE track_3
15 1 2019-01-29 20:19:56 GMT 1.60 TRUE track_3
16 1 2019-01-29 21:56:27 GMT 1.61 TRUE track_3
17 1 2019-02-03 14:57:01 GMT 113.01 FALSE delete # correct, as is a one-off
18 1 2019-02-04 15:45:56 GMT 24.82 FALSE delete # actually track_4
19 1 2019-02-04 16:49:57 GMT 1.07 TRUE track_4
20 1 2019-02-05 17:46:05 GMT 24.94 FALSE delete # actually track_5
21 1 2019-02-05 18:03:06 GMT 0.28 TRUE track_5
22 1 2019-02-08 16:05:59 GMT 70.05 FALSE delete # actually track_6
23 1 2019-02-08 17:42:59 GMT 1.62 TRUE track_6
24 1 2019-02-09 02:47:00 GMT 9.07 TRUE track_6
最初的用户建议包括一个 | (或)语句 (rle(within_thresh|lead(within_thresh)
但这只标识了本例中的 2 个轨道。
欢迎提出任何建议。
您之所以会出现这种情况,是因为您只检查了每次之前的时间段,而不是之后的时间段。这是一个包含更多条件检查的变通方法,但 returns 正确的轨道每隔 12 小时中断一次并标识已删除的点。
df %>%
mutate(before = round(as.numeric(difftime(time, lag(time), units = "hours")), 2),
after = round(as.numeric(difftime(lead(time), time, units = "hours")), 2)) %>%
replace(is.na(.), 0) %>%
mutate(before = inverse.rle(within.list(rle(before < 12),
values[values] <- str_c('track_',
seq_along(values[values])))),
after = inverse.rle(within.list(rle(after < 12),
values[values] <- str_c('track_',
seq_along(values[values]))))) %>%
mutate(split_factor = case_when(before == "FALSE" ~ if_else(after == "FALSE", "delete", after),
TRUE ~ before)) %>%
select(vessel, time, split_factor)
vessel time split_factor
1 1 2019-01-23 00:33:58 track_1
2 1 2019-01-23 12:10:27 track_1
3 1 2019-01-23 13:49:29 track_1
4 1 2019-01-23 15:29:27 track_1
5 1 2019-01-23 18:35:27 track_1
6 1 2019-01-23 21:38:29 track_1
7 1 2019-01-28 14:52:10 track_2
8 1 2019-01-28 16:31:37 track_2
9 1 2019-01-28 18:07:40 track_2
10 1 2019-01-28 19:46:40 track_2
11 1 2019-01-28 21:22:40 track_2
12 1 2019-01-29 13:53:48 track_3
13 1 2019-01-29 15:25:48 track_3
14 1 2019-01-29 18:43:54 track_3
15 1 2019-01-29 20:19:56 track_3
16 1 2019-01-29 21:56:27 track_3
17 1 2019-02-03 14:57:01 delete
18 1 2019-02-04 15:45:56 track_4
19 1 2019-02-04 16:49:57 track_4
20 1 2019-02-05 17:46:05 track_5
21 1 2019-02-05 18:03:06 track_5
22 1 2019-02-08 16:05:59 track_6
23 1 2019-02-08 17:42:59 track_6
24 1 2019-02-09 02:47:00 track_6
怎么样:
with_id <- df %>%
mutate(track_id = if_else(gap > 12,1,0),
track_id = cumsum(track_id)) %>%
group_by(track_id) %>%
filter(n()>1)
如果你想让track_id连续并从1开始,你可以这样结束:
with_id$track_id <- with_id %>% group_indices()
基本思想是在我们开始新曲目的地方标记 1。然后cumsum
将标记不是起点的行。