Transform "start stop data" (a.k.a. turn codes) into long format (a.k.a. time codes) with dplyr
I would like to transform turn codes created like this:
library(tidyverse)
library(lubridate)
turndata_wide <- tibble(turnID = 1:4,
                        code = c("a", "b", "a", "g"),
                        start = c(ymd_hms("2019_05_25 00:00:05"),
                                  ymd_hms("2019_05_25 00:00:02"),
                                  ymd_hms("2019_05_25 00:00:10"),
                                  ymd_hms("2019_05_25 00:00:01")),
                        end = c(ymd_hms("2019_05_25 00:00:08"),
                                ymd_hms("2019_05_25 00:00:07"),
                                ymd_hms("2019_05_25 00:00:15"),
                                ymd_hms("2019_05_25 00:00:25")))
which results in this:
> turndata_wide
# A tibble: 4 x 4
  turnID code  start               end
   <int> <chr> <dttm>              <dttm>
1      1 a     2019-05-25 00:00:05 2019-05-25 00:00:08
2      2 b     2019-05-25 00:00:02 2019-05-25 00:00:07
3      3 a     2019-05-25 00:00:10 2019-05-25 00:00:15
4      4 g     2019-05-25 00:00:01 2019-05-25 00:00:25
into what we (social scientists) call time codes. The result should look like this:
# A tibble: 25 x 4
   time                    a     b     g
   <dttm>              <dbl> <dbl> <dbl>
 1 2019-05-25 00:00:01    NA    NA     1
 2 2019-05-25 00:00:02    NA     1     1
 3 2019-05-25 00:00:03    NA     1     1
 4 2019-05-25 00:00:04    NA     1     1
 5 2019-05-25 00:00:05     1     1     1
 6 2019-05-25 00:00:06     1     1     1
 7 2019-05-25 00:00:07     1     1     1
 8 2019-05-25 00:00:08     1    NA     1
 9 2019-05-25 00:00:09    NA    NA     1
10 2019-05-25 00:00:10     1    NA     1
# … with 15 more rows
I have built a working (but pedestrian and ugly) solution, and I'm pretty sure there are much better ones. My (ugly) approach is to:
- create a long_df per turn
- join "full time rows" onto each turn's df
- join these full dfs across turns
- spread the codes
## Loop over steps 1) + 2) ########################################
df_per_turn_list <- list()
for(i in 1:nrow(turndata_wide)){
  data_turn_temp <- turndata_wide[i,]%>%
    gather(startend, time, start, end)%>%
    full_join(.,
              tibble(time = seq.POSIXt(from = min(.$time),
                                       to = max(.$time),
                                       by = "sec"),
                     code = .$code[1],
                     turnID = .$turnID[1]))%>%
    select(-startend)%>%
    arrange(time)
  temp_name <- paste("data_turn_", i, sep = "")
  df_per_turn_list[[temp_name]] <- data_turn_temp
}
## Steps 3) + 4): Join dfs_per turn and spread codes ########
reduce(df_per_turn_list, full_join)%>%
  mutate(dummy_one = 1)%>%
  select(-turnID)%>%
  spread(code, dummy_one)%>%
  arrange(time)
One way using tidyverse and cSplit_e from splitstackshape: we create a sequence between start and end in steps of one second, group_by each second and collapse the codes into comma-separated values, then use cSplit_e to turn them into binary columns.
library(tidyverse)
turndata_wide %>%
  mutate(time = map2(start, end, seq, by = "1 sec")) %>%
  unnest(cols = time) %>%
  select(-start, -end) %>%
  group_by(time) %>%
  summarise(code = toString(code)) %>%
  splitstackshape::cSplit_e("code", type = "character", drop = TRUE)
which returns the output as:
#                  time code_a code_b code_g
#1  2019-05-25 00:00:01     NA     NA      1
#2  2019-05-25 00:00:02     NA      1      1
#3  2019-05-25 00:00:03     NA      1      1
#4  2019-05-25 00:00:04     NA      1      1
#5  2019-05-25 00:00:05      1      1      1
#6  2019-05-25 00:00:06      1      1      1
#7  2019-05-25 00:00:07      1      1      1
#8  2019-05-25 00:00:08      1     NA      1
#9  2019-05-25 00:00:09     NA     NA      1
#10 2019-05-25 00:00:10      1     NA      1
#11 2019-05-25 00:00:11      1     NA      1
#12 2019-05-25 00:00:12      1     NA      1
#13 2019-05-25 00:00:13      1     NA      1
#14 2019-05-25 00:00:14      1     NA      1
#15 2019-05-25 00:00:15      1     NA      1
#16 2019-05-25 00:00:16     NA     NA      1
#17 2019-05-25 00:00:17     NA     NA      1
#18 2019-05-25 00:00:18     NA     NA      1
#19 2019-05-25 00:00:19     NA     NA      1
#20 2019-05-25 00:00:20     NA     NA      1
#21 2019-05-25 00:00:21     NA     NA      1
#22 2019-05-25 00:00:22     NA     NA      1
#23 2019-05-25 00:00:23     NA     NA      1
#24 2019-05-25 00:00:24     NA     NA      1
#25 2019-05-25 00:00:25     NA     NA      1
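If you want to avoid splitstackshape altogether, here is a minimal tidyverse-only sketch of the same idea, swapping in distinct plus pivot_wider for cSplit_e; it also yields the column names a, b, g from the desired output rather than the code_ prefix (the helper column present is just a dummy introduced for the pivot):

library(tidyverse)

turndata_wide %>%
  mutate(time = map2(start, end, seq, by = "1 sec")) %>%  # one row per second per turn
  unnest(cols = time) %>%
  distinct(code, time) %>%   # one row per code/second pair, in case turns with the same code overlap
  mutate(present = 1) %>%    # dummy value to spread into the code columns
  pivot_wider(names_from = code, values_from = present) %>%
  arrange(time)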