在组内计算值变化前后的值,为每个独特的转变生成新变量
counting values after and before change in value, within groups, generating new variables for each unique shift
我正在寻找一种方法,在 id
组内,计算数据 datatbl
.
中 TF
值偏移的唯一出现次数
我想从 TF
在 1
和 0
或 o
和 1
之间变化时向前和向后计数。计数将存储在一个新变量 PM##
中,以便 PM##
保存 TF
中的每个唯一移位,包括加号和减号。下面的 MWE 导致了晚上 7 点的结果,但我的生产数据可以有 15 个或更多班次。如果 TF
值在 NA
之间没有变化,我想将其标记为 0
.
这个问题与 , but the last part about TF
standing alone is new. Both Uwe and Psidom provided elegant answers to the initial question using data.table
and using tidyverse
here. 类似,我发布了我的问题的这个稍微修改过的版本。
If this question violates any SO policies please let me know and I'll be happy to reopen my initial question or append this an bounty-issue.
用一个最小工作示例 来说明我的问题。我有这样的数据,
我有什么,
# install.packages(c("tidyverse"), dependencies = TRUE)
library(tibble)
tbl <- tibble(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)),
TF = c(NA, NA, 0, NA, 0, NA, 1, 1, 1, 1, 1, NA, 1, 0, 1, 0, 1, NA, 0L, NA, 0L,
0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))
tbl %>% print(n=18)
#> # A tibble: 40 x 2
#> id TF
#> <int> <dbl>
#> 1 10 NA
#> 2 10 NA
#> 3 10 0
#> 4 10 NA
#> 5 10 0
#> 6 10 NA
#> 7 10 1
#> 8 10 1
#> 9 10 1
#> 10 10 1
#> 11 10 1
#> 12 10 NA
#> 13 10 1
#> 14 10 0
#> 15 10 1
#> 16 10 0
#> 17 10 1
#> 18 0 NA
#> # ... with 22 more rows
我想得到什么,
tblPM <- structure(list(id = c(10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), TF = c(NA, NA, 0, NA, 0, NA, 1, 1, 1, 1, 1,
NA, 1, 0, 1, 0, 1, NA, 0, NA, 0, 0, 1, 1, 1, 0, 0,
NA, NA, 0, NA, 0, 0, 0, 1, 1, 1, 0, NA, 1), PM01 = c(NA,
NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -3L,
-2L, -1L, 1L, 2L, 3L, NA, NA, NA), PM02 = c(NA, NA, NA, NA, 0L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -2L,
-1L, 1L, 2L, 3L, NA, NA, NA, NA, NA, NA, NA, NA, NA, -3L, -2L,
-1L, 1L, NA, NA), PM03 = c(NA, NA, NA, NA, NA, NA, 0L, 0L, 0L,
0L, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -3L, -2L,
-1L, 1L, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0L), PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
-1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), PM05 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1L, 1L, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), PM06 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, -1L, 1L, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), PM07 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, -1L, 1L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), .Names = c("id", "TF", "PM01", "PM02", "PM03", "PM04", "PM05",
"PM06", "PM07"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -40L
))
tblPM %>% print(n=18)
#> # A tibble: 40 x 9
#> id TF PM01 PM02 PM03 PM04 PM05 PM06 PM07
#> <int> <dbl> <int> <int> <int> <int> <int> <int> <int>
#> 1 10 NA NA NA NA NA NA NA NA
#> 2 10 NA NA NA NA NA NA NA NA
#> 3 10 0 0 NA NA NA NA NA NA
#> 4 10 NA NA NA NA NA NA NA NA
#> 5 10 0 NA 0 NA NA NA NA NA
#> 6 10 NA NA NA NA NA NA NA NA
#> 7 10 1 NA NA 0 NA NA NA NA
#> 8 10 1 NA NA 0 NA NA NA NA
#> 9 10 1 NA NA 0 NA NA NA NA
#> 10 10 1 NA NA 0 NA NA NA NA
#> 11 10 1 NA NA 0 NA NA NA NA
#> 12 10 NA NA NA NA NA NA NA NA
#> 13 10 1 NA NA NA -1 NA NA NA
#> 14 10 0 NA NA NA 1 -1 NA NA
#> 15 10 1 NA NA NA NA 1 -1 NA
#> 16 10 0 NA NA NA NA NA 1 -1
#> 17 10 1 NA NA NA NA NA NA 1
#> 18 0 NA NA NA NA NA NA NA NA
#> # ... with 22 more rows
identical([some solution], tblPM)
#> [1] TRUE
更新 microbenchmark
2018-01-24 14:20:18Z,
感谢 Fierr 和 Chris 花时间梳理逻辑并提交答案。启发了我的 this setup 我计算了他们函数的小型微基准比较。我放了 Fierrs answer into the function
tidyverse_Fierr()and Chris' answer into
dt_Chris()`(如果有人想要确切的功能,请告诉我,我会在这里添加它们。
经过一些小的调整后,它们与 tblPM
匹配时结果完全相同,即
identical(tblPM, tidyverse_Fierr(tbl))
#> [1] TRUE
identical(tblPM, dt_Chris(tbl))
#> [1] TRUE
现在进入快速微基准测试,
df_test <- bind_rows(rep(list(tbl), 111))
microbenchmark::microbenchmark(tidyverse_Fierr(df_test), dt_Chris(df_test), times = 3*1)
#> Unit: milliseconds
#> expr min mean median uq max neval cld
#> tidyverse_Fierr(df_test) 19503.366 20171.268 20080.99 20505.219 20929.4489 3 b
#> dt_Chris(df_test) 199.165 233.924 203.72 251.304 298.8887 3 a
有趣的是 tidy_method 在 kinda similar comparison 中的输出速度更快。
喜欢揭开这个逻辑的挑战。该方法基于 tidyverse。欢迎提出更多整理建议!
library(data.table)
library(purrr)
library(dplyr)
library(tibble)
tbl <- tibble(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)),
TF = c(NA, NA, 0, NA, 0, NA, 1, 1, 1, 1, 1, NA, 1, 0, 1, 0, 1, NA, 0L, NA, 0L,
0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))
tbl <- mutate(tbl, rn = 1:n())
lookup_table <- tbl %>%
group_by(id) %>%
mutate(rl = rleid(TF)) %>%
group_by(id, rl, TF) %>%
summarise(n=n()) %>%
group_by(id) %>%
mutate(lag = lag(TF, order_by=id),
lead = lead(TF, order_by=id),
test = ifelse(is.na(lag) & is.na(lead), 1, 0)) %>%
select(id, rl, test)
tmp <- tbl %>%
group_by(id) %>%
mutate(rl = rleid(TF),
rl_nona = ifelse(is.na(TF), NA, rleid(rl)),
rl_nona = match(rl_nona, unique(na.omit(rl_nona)))) %>% # Re-indexing
left_join(lookup_table, by = c("id" = "id", "rl" = "rl")) %>%
mutate(TF_new = ifelse(test == 1, NA, TF),
rl_gap = ifelse(is.na(TF_new), NA, rleid(TF_new)),
rl_gap = match(rl_gap, unique(na.omit(rl_gap))), # Re-indexing
up_pos = ifelse(min(rl_gap, na.rm=TRUE)==rl_gap, NA, rl_gap),
down_pos = ifelse(max(rl_gap, na.rm=TRUE)==rl_gap, NA, rl_gap)) %>%
group_by(id, rl_gap) %>%
mutate(up = ifelse(is.na(up_pos), 0, seq_len(n())),
down = ifelse(is.na(down_pos), 0, -rev(seq_len(n())))) %>%
group_by(id) %>%
mutate(zero_pos = ifelse(test == 1 & rl_nona > max(rl_gap, na.rm = TRUE), rl_nona - 1, rl_nona)) # Correct placement of zeroes
up <- dcast(tmp, rn ~ rl_nona, value.var = 'up' , fill = 0)
down <- dcast(tmp, rn ~ rl_nona, value.var = 'down', fill = 0)
res <- (down[, 2:max(tmp$rl_nona, na.rm=TRUE)] + up[, 3:(max(tmp$rl_nona, na.rm=TRUE)+1)]) %>%
mutate_all(funs(replace(., which(.==0), NA))) %>%
bind_cols(rn = tmp$rn, test = tmp$test, zero_pos = tmp$zero_pos) %>%
right_join(tbl, by = "rn") %>%
mutate(`PM01` = ifelse(test == 1 & zero_pos == 1, 0, `1`)) %>%
mutate(`PM02` = ifelse(test == 1 & zero_pos == 2, 0, `2`)) %>%
mutate(`PM03` = ifelse(test == 1 & zero_pos == 3, 0, `3`)) %>%
mutate(`PM04` = ifelse(test == 1 & zero_pos == 4, 0, `4`)) %>%
mutate(`PM05` = ifelse(test == 1 & zero_pos == 5, 0, `5`)) %>%
mutate(`PM06` = ifelse(test == 1 & zero_pos == 6, 0, `6`)) %>%
mutate(`PM07` = ifelse(test == 1 & zero_pos == 7, 0, `7`)) %>%
select(id, TF, everything(), -rn, -test, -zero_pos, -c(1:7)) %>%
mutate_if(is.numeric, as.integer) %>%
as.tibble()
identical(tblPM, res)
这是一个脚本方法 - 考虑到每个案例的自定义处理量(TF = NA,uniqueN(TF)= 1,uniqueN(TF)= 2,我认为这可能比 a 更容易实施dplyr 链。应该相当快,因为它全部基于 data.table。愿意接受有关如何改进的建议!
这将随着所需 PM 列数的增加而自动扩展 - 正如我在下面评论的那样,我建议去掉列中的 0 前缀,因为在某些情况下您可能会达到 10^2。 .n 列会撞到 PM001。
library(data.table)
tbl3 <- data.table(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)),
TF = c(NA, NA, 0L, NA, 0L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 0L, 1L, 0L, 1L, NA, 0L, NA, 0L,
0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))
# create index to untimately join back to
tbl3[, row_idx := .I]
# all transformations on a replicated data.table
tbl3_tmp <- copy(tbl3)
# identify where the NA breaks occur - this splits each id into subgroups (id_group)
tbl3_tmp[, P_TF := shift(TF, 1, "lag", fill = NA), by = .(id)]
tbl3_tmp[, TF_break := is.na(TF) | is.na(P_TF)]
tbl3_tmp[, id_group := cumsum(TF_break), by = .(id)]
tbl3_tmp[, `:=`(TF_break = NULL, P_TF = NULL)] # above can be consolidated to one line which would make this line unneccesary - expanded for easier understanding
tbl3_tmp <- tbl3_tmp[!is.na(TF)] # NA rows can be safely ignored now - these will be all NA, and will be handled with the left join below
# find where subpatterns exist (runs of 0..1 or 1..0)
tbl3_tmp[, subpattern_break := TF != shift(TF, 1, "lag", fill = NA), by = .(id, id_group)]
tbl3_tmp[, subbreaks := sum(subpattern_break, na.rm = TRUE), by = .(id, id_group)] # if there are no breaks, we need to treat separately
# two cases: zero subbreaks and multiple subbreaks.
tbl3_zeros <- tbl3_tmp[subbreaks == 0]
tbl3_nonzeros <- tbl3_tmp[subbreaks > 0]
# for 1+ subbreaks, we need to double the rows - this allows us to easily create the PM_field both "forwards" and "backwards"
tbl3_nonzeros[is.na(subpattern_break), subpattern_break := TRUE]
tbl3_nonzeros[, subbreak_index := cumsum(subpattern_break), by = .(id, id_group)]
tbl3_nonzeros <- rbindlist(list(tbl3_nonzeros,tbl3_nonzeros), idcol = "base") # double the row
tbl3_nonzeros[base == 1 & subbreak_index %% 2 == 1, subbreak_index := subbreak_index + 1L] # round to nearest even
tbl3_nonzeros[base == 2 & subbreak_index %% 2 == 0, subbreak_index := subbreak_index + 1L] # round to nearest odd
# this creates an index when the subbreak starts - allows us to sequence PM properly
tbl3_nonzeros[,subbreak_start := min(row_idx), by = .(id, id_group, subbreak_index)]
# exclude the ends if there is only one unique TF value - might be able to get this to one line
tbl3_nonzeros[, TF_count := uniqueN(TF), by = .(id, id_group, subbreak_index)]
tbl3_nonzeros <- tbl3_nonzeros[TF_count > 1]
# create a 1..N column, subtract the index where the break occurs ,then add 1 to all 0+ values.
tbl3_nonzeros[,PM_field := 1:.N, by = .(id, id_group, subbreak_index)]
tbl3_nonzeros[, PM_field := PM_field - PM_field[which(diff(TF)!=0)[1]+1], by = .(id, id_group, subbreak_index)]
tbl3_nonzeros[PM_field >= 0, PM_field := PM_field + 1L] # base 1 after the break
# create subbreaks for zero groups
tbl3_zeros[,subbreak_start := min(row_idx), by = .(id, id_group)]
# bring zero and non zero case together
tbl3_zeros <- tbl3_zeros[, .(id, id_group, subbreak_start,row_idx = row_idx, PM_field = 0L)]
tbl3_nonzeros <- tbl3_nonzeros[,.(id, id_group, subbreak_start, row_idx, PM_field)]
tbl3_tmp <- rbindlist(list(tbl3_zeros, tbl3_nonzeros))
# Create header
tbl3_tmp <- tbl3_tmp[order(subbreak_start, PM_field)]
tbl3_tmp[, PM_header := paste0("PM0",cumsum(c(1,diff(subbreak_start)!=0)),sep = ""), by = .(id)] # I would remove 0 in PM0 here (kept for identical check)- inefficient to check if this will be 1, 2, 3 etc digits This could also be solved with; `paste0("PM", sprintf("%02d", cumsum(c(1, diff(subbreak_start) != 0))))`
# long to wide
tbl3_tmp <- dcast(tbl3_tmp, row_idx ~ PM_header, value.var = "PM_field", fun.aggregate = sum, fill = NA)
# merge back to initial dataframe
tblPM_frombase <- merge(tbl3, tbl3_tmp, by = "row_idx", all.x = TRUE)[, row_idx := NULL]
identical(tblPM, tblPM_frombase)
[1] TRUE
我正在寻找一种方法,在 id
组内,计算数据 datatbl
.
TF
值偏移的唯一出现次数
我想从 TF
在 1
和 0
或 o
和 1
之间变化时向前和向后计数。计数将存储在一个新变量 PM##
中,以便 PM##
保存 TF
中的每个唯一移位,包括加号和减号。下面的 MWE 导致了晚上 7 点的结果,但我的生产数据可以有 15 个或更多班次。如果 TF
值在 NA
之间没有变化,我想将其标记为 0
.
这个问题与 TF
standing alone is new. Both Uwe and Psidom provided elegant answers to the initial question using data.table
tidyverse
here.
If this question violates any SO policies please let me know and I'll be happy to reopen my initial question or append this an bounty-issue.
用一个最小工作示例 来说明我的问题。我有这样的数据,
我有什么,
# install.packages(c("tidyverse"), dependencies = TRUE)
library(tibble)
tbl <- tibble(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)),
TF = c(NA, NA, 0, NA, 0, NA, 1, 1, 1, 1, 1, NA, 1, 0, 1, 0, 1, NA, 0L, NA, 0L,
0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))
tbl %>% print(n=18)
#> # A tibble: 40 x 2
#> id TF
#> <int> <dbl>
#> 1 10 NA
#> 2 10 NA
#> 3 10 0
#> 4 10 NA
#> 5 10 0
#> 6 10 NA
#> 7 10 1
#> 8 10 1
#> 9 10 1
#> 10 10 1
#> 11 10 1
#> 12 10 NA
#> 13 10 1
#> 14 10 0
#> 15 10 1
#> 16 10 0
#> 17 10 1
#> 18 0 NA
#> # ... with 22 more rows
我想得到什么,
tblPM <- structure(list(id = c(10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), TF = c(NA, NA, 0, NA, 0, NA, 1, 1, 1, 1, 1,
NA, 1, 0, 1, 0, 1, NA, 0, NA, 0, 0, 1, 1, 1, 0, 0,
NA, NA, 0, NA, 0, 0, 0, 1, 1, 1, 0, NA, 1), PM01 = c(NA,
NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -3L,
-2L, -1L, 1L, 2L, 3L, NA, NA, NA), PM02 = c(NA, NA, NA, NA, 0L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -2L,
-1L, 1L, 2L, 3L, NA, NA, NA, NA, NA, NA, NA, NA, NA, -3L, -2L,
-1L, 1L, NA, NA), PM03 = c(NA, NA, NA, NA, NA, NA, 0L, 0L, 0L,
0L, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -3L, -2L,
-1L, 1L, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0L), PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
-1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), PM05 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1L, 1L, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), PM06 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, -1L, 1L, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), PM07 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, -1L, 1L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), .Names = c("id", "TF", "PM01", "PM02", "PM03", "PM04", "PM05",
"PM06", "PM07"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -40L
))
tblPM %>% print(n=18)
#> # A tibble: 40 x 9
#> id TF PM01 PM02 PM03 PM04 PM05 PM06 PM07
#> <int> <dbl> <int> <int> <int> <int> <int> <int> <int>
#> 1 10 NA NA NA NA NA NA NA NA
#> 2 10 NA NA NA NA NA NA NA NA
#> 3 10 0 0 NA NA NA NA NA NA
#> 4 10 NA NA NA NA NA NA NA NA
#> 5 10 0 NA 0 NA NA NA NA NA
#> 6 10 NA NA NA NA NA NA NA NA
#> 7 10 1 NA NA 0 NA NA NA NA
#> 8 10 1 NA NA 0 NA NA NA NA
#> 9 10 1 NA NA 0 NA NA NA NA
#> 10 10 1 NA NA 0 NA NA NA NA
#> 11 10 1 NA NA 0 NA NA NA NA
#> 12 10 NA NA NA NA NA NA NA NA
#> 13 10 1 NA NA NA -1 NA NA NA
#> 14 10 0 NA NA NA 1 -1 NA NA
#> 15 10 1 NA NA NA NA 1 -1 NA
#> 16 10 0 NA NA NA NA NA 1 -1
#> 17 10 1 NA NA NA NA NA NA 1
#> 18 0 NA NA NA NA NA NA NA NA
#> # ... with 22 more rows
identical([some solution], tblPM)
#> [1] TRUE
更新 microbenchmark
2018-01-24 14:20:18Z,
感谢 Fierr 和 Chris 花时间梳理逻辑并提交答案。启发了我的 this setup 我计算了他们函数的小型微基准比较。我放了 Fierrs answer into the function
tidyverse_Fierr()and Chris' answer into
dt_Chris()`(如果有人想要确切的功能,请告诉我,我会在这里添加它们。
经过一些小的调整后,它们与 tblPM
匹配时结果完全相同,即
identical(tblPM, tidyverse_Fierr(tbl))
#> [1] TRUE
identical(tblPM, dt_Chris(tbl))
#> [1] TRUE
现在进入快速微基准测试,
df_test <- bind_rows(rep(list(tbl), 111))
microbenchmark::microbenchmark(tidyverse_Fierr(df_test), dt_Chris(df_test), times = 3*1)
#> Unit: milliseconds
#> expr min mean median uq max neval cld
#> tidyverse_Fierr(df_test) 19503.366 20171.268 20080.99 20505.219 20929.4489 3 b
#> dt_Chris(df_test) 199.165 233.924 203.72 251.304 298.8887 3 a
有趣的是 tidy_method 在 kinda similar comparison 中的输出速度更快。
喜欢揭开这个逻辑的挑战。该方法基于 tidyverse。欢迎提出更多整理建议!
library(data.table)
library(purrr)
library(dplyr)
library(tibble)
tbl <- tibble(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)),
TF = c(NA, NA, 0, NA, 0, NA, 1, 1, 1, 1, 1, NA, 1, 0, 1, 0, 1, NA, 0L, NA, 0L,
0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))
tbl <- mutate(tbl, rn = 1:n())
lookup_table <- tbl %>%
group_by(id) %>%
mutate(rl = rleid(TF)) %>%
group_by(id, rl, TF) %>%
summarise(n=n()) %>%
group_by(id) %>%
mutate(lag = lag(TF, order_by=id),
lead = lead(TF, order_by=id),
test = ifelse(is.na(lag) & is.na(lead), 1, 0)) %>%
select(id, rl, test)
tmp <- tbl %>%
group_by(id) %>%
mutate(rl = rleid(TF),
rl_nona = ifelse(is.na(TF), NA, rleid(rl)),
rl_nona = match(rl_nona, unique(na.omit(rl_nona)))) %>% # Re-indexing
left_join(lookup_table, by = c("id" = "id", "rl" = "rl")) %>%
mutate(TF_new = ifelse(test == 1, NA, TF),
rl_gap = ifelse(is.na(TF_new), NA, rleid(TF_new)),
rl_gap = match(rl_gap, unique(na.omit(rl_gap))), # Re-indexing
up_pos = ifelse(min(rl_gap, na.rm=TRUE)==rl_gap, NA, rl_gap),
down_pos = ifelse(max(rl_gap, na.rm=TRUE)==rl_gap, NA, rl_gap)) %>%
group_by(id, rl_gap) %>%
mutate(up = ifelse(is.na(up_pos), 0, seq_len(n())),
down = ifelse(is.na(down_pos), 0, -rev(seq_len(n())))) %>%
group_by(id) %>%
mutate(zero_pos = ifelse(test == 1 & rl_nona > max(rl_gap, na.rm = TRUE), rl_nona - 1, rl_nona)) # Correct placement of zeroes
up <- dcast(tmp, rn ~ rl_nona, value.var = 'up' , fill = 0)
down <- dcast(tmp, rn ~ rl_nona, value.var = 'down', fill = 0)
res <- (down[, 2:max(tmp$rl_nona, na.rm=TRUE)] + up[, 3:(max(tmp$rl_nona, na.rm=TRUE)+1)]) %>%
mutate_all(funs(replace(., which(.==0), NA))) %>%
bind_cols(rn = tmp$rn, test = tmp$test, zero_pos = tmp$zero_pos) %>%
right_join(tbl, by = "rn") %>%
mutate(`PM01` = ifelse(test == 1 & zero_pos == 1, 0, `1`)) %>%
mutate(`PM02` = ifelse(test == 1 & zero_pos == 2, 0, `2`)) %>%
mutate(`PM03` = ifelse(test == 1 & zero_pos == 3, 0, `3`)) %>%
mutate(`PM04` = ifelse(test == 1 & zero_pos == 4, 0, `4`)) %>%
mutate(`PM05` = ifelse(test == 1 & zero_pos == 5, 0, `5`)) %>%
mutate(`PM06` = ifelse(test == 1 & zero_pos == 6, 0, `6`)) %>%
mutate(`PM07` = ifelse(test == 1 & zero_pos == 7, 0, `7`)) %>%
select(id, TF, everything(), -rn, -test, -zero_pos, -c(1:7)) %>%
mutate_if(is.numeric, as.integer) %>%
as.tibble()
identical(tblPM, res)
这是一个脚本方法 - 考虑到每个案例的自定义处理量(TF = NA,uniqueN(TF)= 1,uniqueN(TF)= 2,我认为这可能比 a 更容易实施dplyr 链。应该相当快,因为它全部基于 data.table。愿意接受有关如何改进的建议!
这将随着所需 PM 列数的增加而自动扩展 - 正如我在下面评论的那样,我建议去掉列中的 0 前缀,因为在某些情况下您可能会达到 10^2。 .n 列会撞到 PM001。
library(data.table)
tbl3 <- data.table(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)),
TF = c(NA, NA, 0L, NA, 0L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 0L, 1L, 0L, 1L, NA, 0L, NA, 0L,
0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))
# create index to untimately join back to
tbl3[, row_idx := .I]
# all transformations on a replicated data.table
tbl3_tmp <- copy(tbl3)
# identify where the NA breaks occur - this splits each id into subgroups (id_group)
tbl3_tmp[, P_TF := shift(TF, 1, "lag", fill = NA), by = .(id)]
tbl3_tmp[, TF_break := is.na(TF) | is.na(P_TF)]
tbl3_tmp[, id_group := cumsum(TF_break), by = .(id)]
tbl3_tmp[, `:=`(TF_break = NULL, P_TF = NULL)] # above can be consolidated to one line which would make this line unneccesary - expanded for easier understanding
tbl3_tmp <- tbl3_tmp[!is.na(TF)] # NA rows can be safely ignored now - these will be all NA, and will be handled with the left join below
# find where subpatterns exist (runs of 0..1 or 1..0)
tbl3_tmp[, subpattern_break := TF != shift(TF, 1, "lag", fill = NA), by = .(id, id_group)]
tbl3_tmp[, subbreaks := sum(subpattern_break, na.rm = TRUE), by = .(id, id_group)] # if there are no breaks, we need to treat separately
# two cases: zero subbreaks and multiple subbreaks.
tbl3_zeros <- tbl3_tmp[subbreaks == 0]
tbl3_nonzeros <- tbl3_tmp[subbreaks > 0]
# for 1+ subbreaks, we need to double the rows - this allows us to easily create the PM_field both "forwards" and "backwards"
tbl3_nonzeros[is.na(subpattern_break), subpattern_break := TRUE]
tbl3_nonzeros[, subbreak_index := cumsum(subpattern_break), by = .(id, id_group)]
tbl3_nonzeros <- rbindlist(list(tbl3_nonzeros,tbl3_nonzeros), idcol = "base") # double the row
tbl3_nonzeros[base == 1 & subbreak_index %% 2 == 1, subbreak_index := subbreak_index + 1L] # round to nearest even
tbl3_nonzeros[base == 2 & subbreak_index %% 2 == 0, subbreak_index := subbreak_index + 1L] # round to nearest odd
# this creates an index when the subbreak starts - allows us to sequence PM properly
tbl3_nonzeros[,subbreak_start := min(row_idx), by = .(id, id_group, subbreak_index)]
# exclude the ends if there is only one unique TF value - might be able to get this to one line
tbl3_nonzeros[, TF_count := uniqueN(TF), by = .(id, id_group, subbreak_index)]
tbl3_nonzeros <- tbl3_nonzeros[TF_count > 1]
# create a 1..N column, subtract the index where the break occurs ,then add 1 to all 0+ values.
tbl3_nonzeros[,PM_field := 1:.N, by = .(id, id_group, subbreak_index)]
tbl3_nonzeros[, PM_field := PM_field - PM_field[which(diff(TF)!=0)[1]+1], by = .(id, id_group, subbreak_index)]
tbl3_nonzeros[PM_field >= 0, PM_field := PM_field + 1L] # base 1 after the break
# create subbreaks for zero groups
tbl3_zeros[,subbreak_start := min(row_idx), by = .(id, id_group)]
# bring zero and non zero case together
tbl3_zeros <- tbl3_zeros[, .(id, id_group, subbreak_start,row_idx = row_idx, PM_field = 0L)]
tbl3_nonzeros <- tbl3_nonzeros[,.(id, id_group, subbreak_start, row_idx, PM_field)]
tbl3_tmp <- rbindlist(list(tbl3_zeros, tbl3_nonzeros))
# Create header
tbl3_tmp <- tbl3_tmp[order(subbreak_start, PM_field)]
tbl3_tmp[, PM_header := paste0("PM0",cumsum(c(1,diff(subbreak_start)!=0)),sep = ""), by = .(id)] # I would remove 0 in PM0 here (kept for identical check)- inefficient to check if this will be 1, 2, 3 etc digits This could also be solved with; `paste0("PM", sprintf("%02d", cumsum(c(1, diff(subbreak_start) != 0))))`
# long to wide
tbl3_tmp <- dcast(tbl3_tmp, row_idx ~ PM_header, value.var = "PM_field", fun.aggregate = sum, fill = NA)
# merge back to initial dataframe
tblPM_frombase <- merge(tbl3, tbl3_tmp, by = "row_idx", all.x = TRUE)[, row_idx := NULL]
identical(tblPM, tblPM_frombase)
[1] TRUE