在组内计算值变化前后的数量,为每个独特的转变生成新变量
counting after and before change in value, within groups, generating new variables for each unique shift
我正在计算我的组中唯一值的出现次数,id
。我正在查看 TF
。当 TF
发生变化时,我想从该点向前和向后计数。此计数应存储在新变量 PM#
中,以便 PM#
为 TF
中的每个唯一移位 保存正负 。根据我收集到的信息,我需要使用 rle
,但我有点卡住了。
我做了这个工作示例来说明我的问题。
我有这个数据
df <- structure(list(id = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L), TF = c(NA, 0L, NA, 0L, 0L, 1L, 1L, 1L, NA, 0L,
0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, NA, NA, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 1L, 1L, 1L)), .Names = c("id", "TF"), class = "data.frame", row.names = c(NA,
-30L))
这是我看到的数据
df[c(1:12,19:30),]
#> id TF
#> 1 0 NA
#> 2 0 0
#> 3 0 NA
#> 4 0 0
#> 5 0 0
#> 6 0 1
#> 7 0 1
#> 8 0 1
#> 9 0 NA
#> 10 0 0
#> 11 0 0
#> 12 1 NA
#> 19 1 NA
#> 20 7 NA
#> 21 7 0
#> 22 7 0
#> 23 7 1
#> 24 7 0
#> 25 7 0
#> 26 7 1
#> 27 7 0
#> 28 7 1
#> 29 7 1
#> 30 7 1
我已经开始使用 ave
、cumsum
和 rle
,但还没有这样解决。
df$PM01 <- with(df, ifelse(is.na(TF), NA, 1))
df$PM01 <- with(df, ave(PM01, TF, id, FUN=cumsum))
with(df, tapply(TF, rep(rle(id)[[2]], rle(id)[[1]]), count))
这就是我想要得到的,
dfa <- structure(list(id = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L), TF = c(NA, 0L, NA, 0L, 0L, 1L, 1L, 1L, NA, 0L,
0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, NA, NA, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 1L, 1L, 1L), PM1 = c(NA, -3L, NA, -2L, -1L, 1L, 2L, 3L, NA,
NA, NA, NA, -3L, -2L, -1L, 1L, 2L, 3L, NA, NA, -2L, -1L, 1L,
NA, NA, NA, NA, NA, NA, NA), PM2 = c(NA, NA, NA, NA, NA, -3L,
-2L, -1L, NA, 1L, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, -1L, 1L, 2L, NA, NA, NA, NA, NA), PM3 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, -2L, -1L, 1L, NA, NA, NA, NA), PM4 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, -1L, 1L, NA, NA, NA), PM5 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, -1L, 1L, 2L, 3L)), .Names = c("id",
"TF", "PM1", "PM2", "PM3", "PM4", "PM5"), class = "data.frame", row.names = c(NA,
-30L))
dfa[c(1:12,19:30),]
#> id TF PM1 PM2 PM3 PM4 PM5
#> 1 0 NA NA NA NA NA NA
#> 2 0 0 -3 NA NA NA NA
#> 3 0 NA NA NA NA NA NA
#> 4 0 0 -2 NA NA NA NA
#> 5 0 0 -1 NA NA NA NA
#> 6 0 1 1 -3 NA NA NA
#> 7 0 1 2 -2 NA NA NA
#> 8 0 1 3 -1 NA NA NA
#> 9 0 NA NA NA NA NA NA
#> 10 0 0 NA 1 NA NA NA
#> 11 0 0 NA 2 NA NA NA
#> 12 1 NA NA NA NA NA NA
#> 19 1 NA NA NA NA NA NA
#> 20 7 NA NA NA NA NA NA
#> 21 7 0 -2 NA NA NA NA
#> 22 7 0 -1 NA NA NA NA
#> 23 7 1 1 -1 NA NA NA
#> 24 7 0 NA 1 -2 NA NA
#> 25 7 0 NA 2 -1 NA NA
#> 26 7 1 NA NA 1 -1 NA
#> 27 7 0 NA NA NA 1 -1
#> 28 7 1 NA NA NA NA 1
#> 29 7 1 NA NA NA NA 2
#> 30 7 1 NA NA NA NA 3
这真是一个棘手的问题,我相信代码可以进一步改进。但是,我能够重现您的预期结果。请用您的生产数据尝试这种方法。如果可以,我会在后面补充说明。
library(data.table)
tmp <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]
res <- tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
rl == V1, PM := dn][rl == V1 + 1L, PM := up][
, dcast(.SD, id + TF + rn ~ paste0("PM", V1), value.var = "PM")][
df, on = .(rn, id, TF)][, -"rn"]
res
id TF PM1 PM2 PM3 PM4 PM5
1: 0 NA NA NA NA NA NA
2: 0 0 -3 NA NA NA NA
3: 0 NA NA NA NA NA NA
4: 0 0 -2 NA NA NA NA
5: 0 0 -1 NA NA NA NA
6: 0 1 1 -3 NA NA NA
7: 0 1 2 -2 NA NA NA
8: 0 1 3 -1 NA NA NA
9: 0 NA NA NA NA NA NA
10: 0 0 NA 1 NA NA NA
11: 0 0 NA 2 NA NA NA
12: 1 NA NA NA NA NA NA
13: 1 0 -3 NA NA NA NA
14: 1 0 -2 NA NA NA NA
15: 1 0 -1 NA NA NA NA
16: 1 1 1 NA NA NA NA
17: 1 1 2 NA NA NA NA
18: 1 1 3 NA NA NA NA
19: 1 NA NA NA NA NA NA
20: 7 NA NA NA NA NA NA
21: 7 0 -2 NA NA NA NA
22: 7 0 -1 NA NA NA NA
23: 7 1 1 -1 NA NA NA
24: 7 0 NA 1 -2 NA NA
25: 7 0 NA 2 -1 NA NA
26: 7 1 NA NA 1 -1 NA
27: 7 0 NA NA NA 1 -1
28: 7 1 NA NA NA NA 1
29: 7 1 NA NA NA NA 2
30: 7 1 NA NA NA NA 3
id TF PM1 PM2 PM3 PM4 PM5
# verify results are identical
identical(res, dfa)
[1] TRUE
如果每组更改超过 9 个,则应在对 dcast()
的调用中将 paste0("PM", V1)
替换为 sprintf("PM%02d",V1)
,以确保 PM
列的顺序正确。
说明
tmp <-
# coerce to data.table
setDT(df)[
# create row id column (required for final join to get NA rows back in)
, rn := .I][
# ignore NA rows
!is.na(TF)][
# number streaks of unique values within each group
, rl := rleid(TF), by = id][
# create ascending and descending counts for each streak
# this is done once to avoid repeatedly creation of counts for each PM
# (slight performance gain)
, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)]
tmp[]
id TF rn rl up dn
1: 0 0 2 1 1 -3
2: 0 0 4 1 2 -2
3: 0 0 5 1 3 -1
4: 0 1 6 2 1 -3
5: 0 1 7 2 2 -2
6: 0 1 8 2 3 -1
7: 0 0 10 3 1 -2
8: 0 0 11 3 2 -1
9: 1 0 13 1 1 -3
10: 1 0 14 1 2 -2
11: 1 0 15 1 3 -1
12: 1 1 16 2 1 -3
13: 1 1 17 2 2 -2
14: 1 1 18 2 3 -1
15: 7 0 21 1 1 -2
16: 7 0 22 1 2 -1
17: 7 1 23 2 1 -1
18: 7 0 24 3 1 -2
19: 7 0 25 3 2 -1
20: 7 1 26 4 1 -1
21: 7 0 27 5 1 -1
22: 7 1 28 6 1 -3
23: 7 1 29 6 2 -2
24: 7 1 30 6 3 -1
id TF rn rl up dn
对于下一步,我们需要每个组V1
中的变化计数
tmp[, seq_len(max(rl) - 1L), by = .(id)]
id V1
1: 0 1
2: 0 2
3: 1 1
4: 7 1
5: 7 2
6: 7 3
7: 7 4
8: 7 5
现在,我们为每个组的行创建所有可能更改的“笛卡尔连接”:
# right join with count of changes within each group
tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
# copy descending counts to rows before the switch
rl == V1, PM := dn][
# copy ascending counts to rows after the switch
rl == V1 + 1L, PM := up][]
id TF rn rl up dn V1 PM
1: 0 0 2 1 1 -3 1 -3
2: 0 0 4 1 2 -2 1 -2
3: 0 0 5 1 3 -1 1 -1
4: 0 1 6 2 1 -3 1 1
5: 0 1 7 2 2 -2 1 2
6: 0 1 8 2 3 -1 1 3
7: 0 0 10 3 1 -2 1 NA
8: 0 0 11 3 2 -1 1 NA
9: 0 0 2 1 1 -3 2 NA
10: 0 0 4 1 2 -2 2 NA
11: 0 0 5 1 3 -1 2 NA
12: 0 1 6 2 1 -3 2 -3
13: 0 1 7 2 2 -2 2 -2
14: 0 1 8 2 3 -1 2 -1
15: 0 0 10 3 1 -2 2 1
16: 0 0 11 3 2 -1 2 2
17: 1 0 13 1 1 -3 1 -3
18: 1 0 14 1 2 -2 1 -2
19: 1 0 15 1 3 -1 1 -1
20: 1 1 16 2 1 -3 1 1
21: 1 1 17 2 2 -2 1 2
22: 1 1 18 2 3 -1 1 3
23: 7 0 21 1 1 -2 1 -2
24: 7 0 22 1 2 -1 1 -1
25: 7 1 23 2 1 -1 1 1
26: 7 0 24 3 1 -2 1 NA
27: 7 0 25 3 2 -1 1 NA
28: 7 1 26 4 1 -1 1 NA
29: 7 0 27 5 1 -1 1 NA
30: 7 1 28 6 1 -3 1 NA
31: 7 1 29 6 2 -2 1 NA
32: 7 1 30 6 3 -1 1 NA
33: 7 0 21 1 1 -2 2 NA
34: 7 0 22 1 2 -1 2 NA
35: 7 1 23 2 1 -1 2 -1
36: 7 0 24 3 1 -2 2 1
37: 7 0 25 3 2 -1 2 2
38: 7 1 26 4 1 -1 2 NA
39: 7 0 27 5 1 -1 2 NA
40: 7 1 28 6 1 -3 2 NA
41: 7 1 29 6 2 -2 2 NA
42: 7 1 30 6 3 -1 2 NA
43: 7 0 21 1 1 -2 3 NA
44: 7 0 22 1 2 -1 3 NA
45: 7 1 23 2 1 -1 3 NA
46: 7 0 24 3 1 -2 3 -2
47: 7 0 25 3 2 -1 3 -1
48: 7 1 26 4 1 -1 3 1
49: 7 0 27 5 1 -1 3 NA
50: 7 1 28 6 1 -3 3 NA
51: 7 1 29 6 2 -2 3 NA
52: 7 1 30 6 3 -1 3 NA
53: 7 0 21 1 1 -2 4 NA
54: 7 0 22 1 2 -1 4 NA
55: 7 1 23 2 1 -1 4 NA
56: 7 0 24 3 1 -2 4 NA
57: 7 0 25 3 2 -1 4 NA
58: 7 1 26 4 1 -1 4 -1
59: 7 0 27 5 1 -1 4 1
60: 7 1 28 6 1 -3 4 NA
61: 7 1 29 6 2 -2 4 NA
62: 7 1 30 6 3 -1 4 NA
63: 7 0 21 1 1 -2 5 NA
64: 7 0 22 1 2 -1 5 NA
65: 7 1 23 2 1 -1 5 NA
66: 7 0 24 3 1 -2 5 NA
67: 7 0 25 3 2 -1 5 NA
68: 7 1 26 4 1 -1 5 NA
69: 7 0 27 5 1 -1 5 -1
70: 7 1 28 6 1 -3 5 1
71: 7 1 29 6 2 -2 5 2
72: 7 1 30 6 3 -1 5 3
id TF rn rl up dn V1 PM
最后,中间结果从长格式重塑为宽格式。
res <-
# create a "cartesian join" of all possible changes with the rows of each group
tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
# copy descending counts to rows before the switch
rl == V1, PM := dn][
# copy ascending counts to rows after the switch
rl == V1 + 1L, PM := up][
# reshape from wide to long with the change count as new columns
, dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
# join with original df to get NA rows back in
df, on = .(rn, id, TF)][
# omit helper column
, -"rn"]
我认为笛卡尔连接是不必要的:
library(data.table)
tmp <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
, `:=`(up = 1:.N, down = -.N:-1), by = .(id, rl)][
, `:=`(last = (rl == max(rl)) * (-down)), by = id]
up = dcast(tmp, rn ~ rl, value.var = 'up' , fill = 0)
down = dcast(tmp, rn ~ rl, value.var = 'down', fill = 0)
last = dcast(tmp, rn ~ rl, value.var = 'last', fill = 0)
rl.max = tmp[, max(rl)]
res = down[, 2:rl.max] + up[, 3:(rl.max+1)] + last[, 2:rl.max]
res[res == 0] = NA
res[, rn := up$rn]
setcolorder(res[df, on='rn'][,-'rn'], c('id','TF', 1:(rl.max-1)))[]
# id TF 1 2 3 4 5
# 1: 0 NA NA NA NA NA NA
# 2: 0 0 -3 NA NA NA NA
# 3: 0 NA NA NA NA NA NA
# 4: 0 0 -2 NA NA NA NA
# 5: 0 0 -1 NA NA NA NA
# 6: 0 1 1 -3 NA NA NA
# 7: 0 1 2 -2 NA NA NA
# 8: 0 1 3 -1 NA NA NA
# 9: 0 NA NA NA NA NA NA
#10: 0 0 NA 1 NA NA NA
#11: 0 0 NA 2 NA NA NA
#12: 1 NA NA NA NA NA NA
#13: 1 0 -3 NA NA NA NA
#14: 1 0 -2 NA NA NA NA
#15: 1 0 -1 NA NA NA NA
#16: 1 1 1 NA NA NA NA
#17: 1 1 2 NA NA NA NA
#18: 1 1 3 NA NA NA NA
#19: 1 NA NA NA NA NA NA
#20: 7 NA NA NA NA NA NA
#21: 7 0 -2 NA NA NA NA
#22: 7 0 -1 NA NA NA NA
#23: 7 1 1 -1 NA NA NA
#24: 7 0 NA 1 -2 NA NA
#25: 7 0 NA 2 -1 NA NA
#26: 7 1 NA NA 1 -1 NA
#27: 7 0 NA NA NA 1 -1
#28: 7 1 NA NA NA NA 1
#29: 7 1 NA NA NA NA 2
#30: 7 1 NA NA NA NA 3
# id TF 1 2 3 4 5
我正在计算我的组中唯一值的出现次数,id
。我正在查看 TF
。当 TF
发生变化时,我想从该点向前和向后计数。此计数应存储在新变量 PM#
中,以便 PM#
为 TF
中的每个唯一移位 保存正负 。根据我收集到的信息,我需要使用 rle
,但我有点卡住了。
我做了这个工作示例来说明我的问题。
我有这个数据
df <- structure(list(id = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L), TF = c(NA, 0L, NA, 0L, 0L, 1L, 1L, 1L, NA, 0L,
0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, NA, NA, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 1L, 1L, 1L)), .Names = c("id", "TF"), class = "data.frame", row.names = c(NA,
-30L))
这是我看到的数据
df[c(1:12,19:30),]
#> id TF
#> 1 0 NA
#> 2 0 0
#> 3 0 NA
#> 4 0 0
#> 5 0 0
#> 6 0 1
#> 7 0 1
#> 8 0 1
#> 9 0 NA
#> 10 0 0
#> 11 0 0
#> 12 1 NA
#> 19 1 NA
#> 20 7 NA
#> 21 7 0
#> 22 7 0
#> 23 7 1
#> 24 7 0
#> 25 7 0
#> 26 7 1
#> 27 7 0
#> 28 7 1
#> 29 7 1
#> 30 7 1
我已经开始使用 ave
、cumsum
和 rle
,但还没有这样解决。
df$PM01 <- with(df, ifelse(is.na(TF), NA, 1))
df$PM01 <- with(df, ave(PM01, TF, id, FUN=cumsum))
with(df, tapply(TF, rep(rle(id)[[2]], rle(id)[[1]]), count))
这就是我想要得到的,
dfa <- structure(list(id = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L), TF = c(NA, 0L, NA, 0L, 0L, 1L, 1L, 1L, NA, 0L,
0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, NA, NA, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 1L, 1L, 1L), PM1 = c(NA, -3L, NA, -2L, -1L, 1L, 2L, 3L, NA,
NA, NA, NA, -3L, -2L, -1L, 1L, 2L, 3L, NA, NA, -2L, -1L, 1L,
NA, NA, NA, NA, NA, NA, NA), PM2 = c(NA, NA, NA, NA, NA, -3L,
-2L, -1L, NA, 1L, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, -1L, 1L, 2L, NA, NA, NA, NA, NA), PM3 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, -2L, -1L, 1L, NA, NA, NA, NA), PM4 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, -1L, 1L, NA, NA, NA), PM5 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, -1L, 1L, 2L, 3L)), .Names = c("id",
"TF", "PM1", "PM2", "PM3", "PM4", "PM5"), class = "data.frame", row.names = c(NA,
-30L))
dfa[c(1:12,19:30),]
#> id TF PM1 PM2 PM3 PM4 PM5
#> 1 0 NA NA NA NA NA NA
#> 2 0 0 -3 NA NA NA NA
#> 3 0 NA NA NA NA NA NA
#> 4 0 0 -2 NA NA NA NA
#> 5 0 0 -1 NA NA NA NA
#> 6 0 1 1 -3 NA NA NA
#> 7 0 1 2 -2 NA NA NA
#> 8 0 1 3 -1 NA NA NA
#> 9 0 NA NA NA NA NA NA
#> 10 0 0 NA 1 NA NA NA
#> 11 0 0 NA 2 NA NA NA
#> 12 1 NA NA NA NA NA NA
#> 19 1 NA NA NA NA NA NA
#> 20 7 NA NA NA NA NA NA
#> 21 7 0 -2 NA NA NA NA
#> 22 7 0 -1 NA NA NA NA
#> 23 7 1 1 -1 NA NA NA
#> 24 7 0 NA 1 -2 NA NA
#> 25 7 0 NA 2 -1 NA NA
#> 26 7 1 NA NA 1 -1 NA
#> 27 7 0 NA NA NA 1 -1
#> 28 7 1 NA NA NA NA 1
#> 29 7 1 NA NA NA NA 2
#> 30 7 1 NA NA NA NA 3
这真是一个棘手的问题,我相信代码可以进一步改进。但是,我能够重现您的预期结果。请用您的生产数据尝试这种方法。如果可以,我会在后面补充说明。
library(data.table)
tmp <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]
res <- tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
rl == V1, PM := dn][rl == V1 + 1L, PM := up][
, dcast(.SD, id + TF + rn ~ paste0("PM", V1), value.var = "PM")][
df, on = .(rn, id, TF)][, -"rn"]
res
id TF PM1 PM2 PM3 PM4 PM5 1: 0 NA NA NA NA NA NA 2: 0 0 -3 NA NA NA NA 3: 0 NA NA NA NA NA NA 4: 0 0 -2 NA NA NA NA 5: 0 0 -1 NA NA NA NA 6: 0 1 1 -3 NA NA NA 7: 0 1 2 -2 NA NA NA 8: 0 1 3 -1 NA NA NA 9: 0 NA NA NA NA NA NA 10: 0 0 NA 1 NA NA NA 11: 0 0 NA 2 NA NA NA 12: 1 NA NA NA NA NA NA 13: 1 0 -3 NA NA NA NA 14: 1 0 -2 NA NA NA NA 15: 1 0 -1 NA NA NA NA 16: 1 1 1 NA NA NA NA 17: 1 1 2 NA NA NA NA 18: 1 1 3 NA NA NA NA 19: 1 NA NA NA NA NA NA 20: 7 NA NA NA NA NA NA 21: 7 0 -2 NA NA NA NA 22: 7 0 -1 NA NA NA NA 23: 7 1 1 -1 NA NA NA 24: 7 0 NA 1 -2 NA NA 25: 7 0 NA 2 -1 NA NA 26: 7 1 NA NA 1 -1 NA 27: 7 0 NA NA NA 1 -1 28: 7 1 NA NA NA NA 1 29: 7 1 NA NA NA NA 2 30: 7 1 NA NA NA NA 3 id TF PM1 PM2 PM3 PM4 PM5
# verify results are identical
identical(res, dfa)
[1] TRUE
如果每组更改超过 9 个,则应在对 dcast()
的调用中将 paste0("PM", V1)
替换为 sprintf("PM%02d",V1)
,以确保 PM
列的顺序正确。
说明
tmp <-
# coerce to data.table
setDT(df)[
# create row id column (required for final join to get NA rows back in)
, rn := .I][
# ignore NA rows
!is.na(TF)][
# number streaks of unique values within each group
, rl := rleid(TF), by = id][
# create ascending and descending counts for each streak
# this is done once to avoid repeatedly creation of counts for each PM
# (slight performance gain)
, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)]
tmp[]
id TF rn rl up dn 1: 0 0 2 1 1 -3 2: 0 0 4 1 2 -2 3: 0 0 5 1 3 -1 4: 0 1 6 2 1 -3 5: 0 1 7 2 2 -2 6: 0 1 8 2 3 -1 7: 0 0 10 3 1 -2 8: 0 0 11 3 2 -1 9: 1 0 13 1 1 -3 10: 1 0 14 1 2 -2 11: 1 0 15 1 3 -1 12: 1 1 16 2 1 -3 13: 1 1 17 2 2 -2 14: 1 1 18 2 3 -1 15: 7 0 21 1 1 -2 16: 7 0 22 1 2 -1 17: 7 1 23 2 1 -1 18: 7 0 24 3 1 -2 19: 7 0 25 3 2 -1 20: 7 1 26 4 1 -1 21: 7 0 27 5 1 -1 22: 7 1 28 6 1 -3 23: 7 1 29 6 2 -2 24: 7 1 30 6 3 -1 id TF rn rl up dn
对于下一步,我们需要每个组V1
中的变化计数
tmp[, seq_len(max(rl) - 1L), by = .(id)]
id V1 1: 0 1 2: 0 2 3: 1 1 4: 7 1 5: 7 2 6: 7 3 7: 7 4 8: 7 5
现在,我们为每个组的行创建所有可能更改的“笛卡尔连接”:
# right join with count of changes within each group
tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
# copy descending counts to rows before the switch
rl == V1, PM := dn][
# copy ascending counts to rows after the switch
rl == V1 + 1L, PM := up][]
id TF rn rl up dn V1 PM 1: 0 0 2 1 1 -3 1 -3 2: 0 0 4 1 2 -2 1 -2 3: 0 0 5 1 3 -1 1 -1 4: 0 1 6 2 1 -3 1 1 5: 0 1 7 2 2 -2 1 2 6: 0 1 8 2 3 -1 1 3 7: 0 0 10 3 1 -2 1 NA 8: 0 0 11 3 2 -1 1 NA 9: 0 0 2 1 1 -3 2 NA 10: 0 0 4 1 2 -2 2 NA 11: 0 0 5 1 3 -1 2 NA 12: 0 1 6 2 1 -3 2 -3 13: 0 1 7 2 2 -2 2 -2 14: 0 1 8 2 3 -1 2 -1 15: 0 0 10 3 1 -2 2 1 16: 0 0 11 3 2 -1 2 2 17: 1 0 13 1 1 -3 1 -3 18: 1 0 14 1 2 -2 1 -2 19: 1 0 15 1 3 -1 1 -1 20: 1 1 16 2 1 -3 1 1 21: 1 1 17 2 2 -2 1 2 22: 1 1 18 2 3 -1 1 3 23: 7 0 21 1 1 -2 1 -2 24: 7 0 22 1 2 -1 1 -1 25: 7 1 23 2 1 -1 1 1 26: 7 0 24 3 1 -2 1 NA 27: 7 0 25 3 2 -1 1 NA 28: 7 1 26 4 1 -1 1 NA 29: 7 0 27 5 1 -1 1 NA 30: 7 1 28 6 1 -3 1 NA 31: 7 1 29 6 2 -2 1 NA 32: 7 1 30 6 3 -1 1 NA 33: 7 0 21 1 1 -2 2 NA 34: 7 0 22 1 2 -1 2 NA 35: 7 1 23 2 1 -1 2 -1 36: 7 0 24 3 1 -2 2 1 37: 7 0 25 3 2 -1 2 2 38: 7 1 26 4 1 -1 2 NA 39: 7 0 27 5 1 -1 2 NA 40: 7 1 28 6 1 -3 2 NA 41: 7 1 29 6 2 -2 2 NA 42: 7 1 30 6 3 -1 2 NA 43: 7 0 21 1 1 -2 3 NA 44: 7 0 22 1 2 -1 3 NA 45: 7 1 23 2 1 -1 3 NA 46: 7 0 24 3 1 -2 3 -2 47: 7 0 25 3 2 -1 3 -1 48: 7 1 26 4 1 -1 3 1 49: 7 0 27 5 1 -1 3 NA 50: 7 1 28 6 1 -3 3 NA 51: 7 1 29 6 2 -2 3 NA 52: 7 1 30 6 3 -1 3 NA 53: 7 0 21 1 1 -2 4 NA 54: 7 0 22 1 2 -1 4 NA 55: 7 1 23 2 1 -1 4 NA 56: 7 0 24 3 1 -2 4 NA 57: 7 0 25 3 2 -1 4 NA 58: 7 1 26 4 1 -1 4 -1 59: 7 0 27 5 1 -1 4 1 60: 7 1 28 6 1 -3 4 NA 61: 7 1 29 6 2 -2 4 NA 62: 7 1 30 6 3 -1 4 NA 63: 7 0 21 1 1 -2 5 NA 64: 7 0 22 1 2 -1 5 NA 65: 7 1 23 2 1 -1 5 NA 66: 7 0 24 3 1 -2 5 NA 67: 7 0 25 3 2 -1 5 NA 68: 7 1 26 4 1 -1 5 NA 69: 7 0 27 5 1 -1 5 -1 70: 7 1 28 6 1 -3 5 1 71: 7 1 29 6 2 -2 5 2 72: 7 1 30 6 3 -1 5 3 id TF rn rl up dn V1 PM
最后,中间结果从长格式重塑为宽格式。
res <-
# create a "cartesian join" of all possible changes with the rows of each group
tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
# copy descending counts to rows before the switch
rl == V1, PM := dn][
# copy ascending counts to rows after the switch
rl == V1 + 1L, PM := up][
# reshape from wide to long with the change count as new columns
, dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
# join with original df to get NA rows back in
df, on = .(rn, id, TF)][
# omit helper column
, -"rn"]
我认为笛卡尔连接是不必要的:
library(data.table)
tmp <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
, `:=`(up = 1:.N, down = -.N:-1), by = .(id, rl)][
, `:=`(last = (rl == max(rl)) * (-down)), by = id]
up = dcast(tmp, rn ~ rl, value.var = 'up' , fill = 0)
down = dcast(tmp, rn ~ rl, value.var = 'down', fill = 0)
last = dcast(tmp, rn ~ rl, value.var = 'last', fill = 0)
rl.max = tmp[, max(rl)]
res = down[, 2:rl.max] + up[, 3:(rl.max+1)] + last[, 2:rl.max]
res[res == 0] = NA
res[, rn := up$rn]
setcolorder(res[df, on='rn'][,-'rn'], c('id','TF', 1:(rl.max-1)))[]
# id TF 1 2 3 4 5
# 1: 0 NA NA NA NA NA NA
# 2: 0 0 -3 NA NA NA NA
# 3: 0 NA NA NA NA NA NA
# 4: 0 0 -2 NA NA NA NA
# 5: 0 0 -1 NA NA NA NA
# 6: 0 1 1 -3 NA NA NA
# 7: 0 1 2 -2 NA NA NA
# 8: 0 1 3 -1 NA NA NA
# 9: 0 NA NA NA NA NA NA
#10: 0 0 NA 1 NA NA NA
#11: 0 0 NA 2 NA NA NA
#12: 1 NA NA NA NA NA NA
#13: 1 0 -3 NA NA NA NA
#14: 1 0 -2 NA NA NA NA
#15: 1 0 -1 NA NA NA NA
#16: 1 1 1 NA NA NA NA
#17: 1 1 2 NA NA NA NA
#18: 1 1 3 NA NA NA NA
#19: 1 NA NA NA NA NA NA
#20: 7 NA NA NA NA NA NA
#21: 7 0 -2 NA NA NA NA
#22: 7 0 -1 NA NA NA NA
#23: 7 1 1 -1 NA NA NA
#24: 7 0 NA 1 -2 NA NA
#25: 7 0 NA 2 -1 NA NA
#26: 7 1 NA NA 1 -1 NA
#27: 7 0 NA NA NA 1 -1
#28: 7 1 NA NA NA NA 1
#29: 7 1 NA NA NA NA 2
#30: 7 1 NA NA NA NA 3
# id TF 1 2 3 4 5