识别哪个代码属于哪个组
Identify which code belongs to which group
数据
# Example input: 24 rows, one (group, code) observation per row.
df <- data.frame(
  group = rep(
    c("a", "b", "c", "d", "e", "f", "g"),
    times = c(4, 5, 5, 2, 2, 2, 4)
  ),
  code = c(
    "G7", "G5", "G4", "K5",        # group a
    "H1", "H2", "K1", "H5", "J7",  # group b
    "K2", "HH", "K7", "JL", "K5",  # group c
    "K7", "H5",                    # group d
    "K5", "KO",                    # group e
    "SS", "KK",                    # group f
    "K2", "00", " 00", "J9"        # group g (" 00" has a leading space)
  )
)
# First list of codes of interest.
code1 <- c("K2", "K1", "K5", "K7")
# Second list of codes of interest ("JL" is listed twice, as in the original).
code2 <- c(
  "J2", "J1", "J5", "J7", "J9",
  "JH", "JP", "JL", "JJ", "JL"
)
期望的输出
group code indicator
1: a G7 0
2: a G5 0
3: a G4 0
4: a K5 0
5: b H1 0
6: b H2 0
7: b K1 1
8: b H5 0
9: b J7 0
10: c K2 1
11: c HH 0
12: c K7 1
13: c JL 0
14: c K5 0
15: d K7 0
16: d H5 0
17: e K5 0
18: e KO 0
19: f SS 0
20: f KK 0
21: g K2 1
22: g 00 0
23: g 00 0
24: g J9 0
我很想在 data.table 中找到解决方案,但其他实现也不错。我有一个大数据集。出于速度考虑,我宁愿避免循环。
这里有一个选项:
# Convert df to a data.table by reference so the := updates below modify it
# in place.
setDT(df)
# rn records each row's position in the table (.I); ind starts at 0 everywhere.
df[, c("rn", "ind") := .(.I, 0L)]
# Only rows whose code is in code1 are updated. Those rows (.SD) are joined
# against the code2 rows of the same group that sit further down the table
# (code2 row's rn > code1 row's rn), and .N with by = .EACHI counts the
# matches for each code1 row. mult = "first" presumably caps that count at one
# match per row; in the output shown for this snippet ind is 1 where a later
# code2 row exists in the group and 0 otherwise.
df[code %chin% code1, ind :=
df[code %chin% code2][.SD, on=.(group, rn>rn), mult="first", .N, by=.EACHI]$N
]
输出(注意 group=c 的第一行也应该是 1,因为后面有一个 JL)
group code indicator rn ind
1: a G7 0 1 0
2: a G5 0 2 0
3: a G4 0 3 0
4: a K5 0 4 0
5: b H1 0 5 0
6: b H2 0 6 0
7: b K1 1 7 1
8: b H5 0 8 0
9: b J7 0 9 0
10: c K2 0 10 1
11: c HH 0 11 0
12: c K7 1 12 1
13: c JL 0 13 0
14: c K5 0 14 0
15: d K7 0 15 0
16: d H5 0 16 0
17: e K5 0 17 0
18: e KO 0 18 0
19: f SS 0 19 0
20: f KK 0 20 0
21: g K2 1 21 1
22: g 00 0 22 0
23: g 00 0 23 0
24: g J9 0 24 0
group code indicator rn ind
一个可能的 data.table 方案:
# Flag a code1 row when the next code1/code2 row of the SAME group is a code2
# row. Everything is computed per group: back-filling across the whole table
# would let an unlisted row at the end of one group borrow a code2 row from
# the start of the next group and produce a false flag.
setDT(df)[, indicator2 := {
  # 1 = code is in code1, 2 = code is in code2, NA = in neither list.
  kind <- rep(1:2, c(length(code1), length(code2)))[
    match(code, c(code1, code2))
  ]
  # Rows in neither list take the kind of the next listed row of the group
  # (next observation carried backward); trailing unlisted rows stay NA.
  kind <- nafill(kind, type = "nocb")
  # A step of exactly +1 (1 -> 2) marks a code1 row directly ahead of a
  # code2 row. The last row of a group has no successor, hence the padded 0;
  # NA steps caused by trailing unlisted rows are turned into 0 as well.
  nafill(c(diff(kind) == 1, 0), fill = 0)
}, by = group][]
结果如下:
group code indicator indicator2
1: a G7 0 0
2: a G5 0 0
3: a G4 0 0
4: a K5 0 0
5: b H1 0 0
6: b H2 0 0
7: b K1 1 1
8: b H5 0 0
9: b J7 0 0
10: c K2 0 0
11: c HH 0 0
12: c K7 1 1
13: c JL 0 0
14: c K5 0 0
15: d K7 0 0
16: d H5 0 0
17: e K5 0 0
18: e KO 0 0
19: f SS 0 0
20: f KK 0 0
21: g K2 1 1
22: g 00 0 0
23: g 00 0 0
24: g J9 0 0
另一种方式是:
# The := calls below need a data.table; setDT() converts df by reference and
# leaves it untouched if it already is one.
setDT(df)
# Row-wise encoding: 0 = code in neither list, 1 = code in code1,
# 2 = code in code2. No grouping is needed for this step.
df[, ind := (code %in% code1) + 2 * (code %in% code2)]
# Keep ind only for groups that contain all three kinds of rows (0, 1 and 2).
# This test must run per group: evaluated on the whole table it is always
# TRUE for this data and never zeroes any group.
df[, ind := ind * (length(unique(ind)) == 3), by = group]
# Within a group cummax(ind) equals 1 from the first code1 row up to, but not
# including, the first code2 row, so this keeps exactly the code1 rows that
# come before the group's first code2 row.
df[, indicator := ind * (cummax(ind) == 1), by = group]
# Drop the helper column.
df[, ind := NULL]
df
group code indicator
1: a G7 0
2: a G5 0
3: a G4 0
4: a K5 0
5: b H1 0
6: b H2 0
7: b K1 1
8: b H5 0
9: b J7 0
10: c K2 1
11: c HH 0
12: c K7 1
13: c JL 0
14: c K5 0
15: d K7 0
16: d H5 0
17: e K5 0
18: e KO 0
19: f SS 0
20: f KK 0
21: g K2 1
22: g 00 0
23: g 00 0
24: g J9 0
group code indicator
或
# dplyr version of the same rule: within each group, flag the code1 rows that
# come before the group's first code2 row.
df %>%
  group_by(group) %>%
  mutate(
    # 0 = code in neither list, 1 = code in code1, 2 = code in code2.
    ind = (code %in% code1) + 2 * (code %in% code2),
    # Zero out groups that do not contain all three kinds of rows (0, 1, 2).
    ind = ind * (length(unique(ind)) == 3),
    # cummax(ind) equals 1 from the group's first code1 row up to, but not
    # including, its first code2 row.
    indicator = ind * (cummax(ind) == 1),
    # Drop the helper column.
    ind = NULL
  ) %>%
  # Remove the grouping so later verbs on the result are not silently grouped.
  ungroup()
数据
# Example input: 24 rows, one (group, code) observation per row.
df <- data.frame(
  group = rep(
    c("a", "b", "c", "d", "e", "f", "g"),
    times = c(4, 5, 5, 2, 2, 2, 4)
  ),
  code = c(
    "G7", "G5", "G4", "K5",        # group a
    "H1", "H2", "K1", "H5", "J7",  # group b
    "K2", "HH", "K7", "JL", "K5",  # group c
    "K7", "H5",                    # group d
    "K5", "KO",                    # group e
    "SS", "KK",                    # group f
    "K2", "00", " 00", "J9"        # group g (" 00" has a leading space)
  )
)
# First list of codes of interest.
code1 <- c("K2", "K1", "K5", "K7")
# Second list of codes of interest ("JL" is listed twice, as in the original).
code2 <- c(
  "J2", "J1", "J5", "J7", "J9",
  "JH", "JP", "JL", "JJ", "JL"
)
期望的输出
group code indicator
1: a G7 0
2: a G5 0
3: a G4 0
4: a K5 0
5: b H1 0
6: b H2 0
7: b K1 1
8: b H5 0
9: b J7 0
10: c K2 1
11: c HH 0
12: c K7 1
13: c JL 0
14: c K5 0
15: d K7 0
16: d H5 0
17: e K5 0
18: e KO 0
19: f SS 0
20: f KK 0
21: g K2 1
22: g 00 0
23: g 00 0
24: g J9 0
我很想在 data.table 中找到解决方案,但其他实现也不错。我有一个大数据集。出于速度考虑,我宁愿避免循环。
这里有一个选项:
# Convert df to a data.table by reference so the := updates below modify it
# in place.
setDT(df)
# rn records each row's position in the table (.I); ind starts at 0 everywhere.
df[, c("rn", "ind") := .(.I, 0L)]
# Only rows whose code is in code1 are updated. Those rows (.SD) are joined
# against the code2 rows of the same group that sit further down the table
# (code2 row's rn > code1 row's rn), and .N with by = .EACHI counts the
# matches for each code1 row. mult = "first" presumably caps that count at one
# match per row; in the output shown for this snippet ind is 1 where a later
# code2 row exists in the group and 0 otherwise.
df[code %chin% code1, ind :=
df[code %chin% code2][.SD, on=.(group, rn>rn), mult="first", .N, by=.EACHI]$N
]
输出(注意 group=c 的第一行也应该是 1,因为后面有一个 JL)
group code indicator rn ind
1: a G7 0 1 0
2: a G5 0 2 0
3: a G4 0 3 0
4: a K5 0 4 0
5: b H1 0 5 0
6: b H2 0 6 0
7: b K1 1 7 1
8: b H5 0 8 0
9: b J7 0 9 0
10: c K2 0 10 1
11: c HH 0 11 0
12: c K7 1 12 1
13: c JL 0 13 0
14: c K5 0 14 0
15: d K7 0 15 0
16: d H5 0 16 0
17: e K5 0 17 0
18: e KO 0 18 0
19: f SS 0 19 0
20: f KK 0 20 0
21: g K2 1 21 1
22: g 00 0 22 0
23: g 00 0 23 0
24: g J9 0 24 0
group code indicator rn ind
一个可能的 data.table 方案:
# Flag a code1 row when the next code1/code2 row of the SAME group is a code2
# row. Everything is computed per group: back-filling across the whole table
# would let an unlisted row at the end of one group borrow a code2 row from
# the start of the next group and produce a false flag.
setDT(df)[, indicator2 := {
  # 1 = code is in code1, 2 = code is in code2, NA = in neither list.
  kind <- rep(1:2, c(length(code1), length(code2)))[
    match(code, c(code1, code2))
  ]
  # Rows in neither list take the kind of the next listed row of the group
  # (next observation carried backward); trailing unlisted rows stay NA.
  kind <- nafill(kind, type = "nocb")
  # A step of exactly +1 (1 -> 2) marks a code1 row directly ahead of a
  # code2 row. The last row of a group has no successor, hence the padded 0;
  # NA steps caused by trailing unlisted rows are turned into 0 as well.
  nafill(c(diff(kind) == 1, 0), fill = 0)
}, by = group][]
结果如下:
group code indicator indicator2
1: a G7 0 0
2: a G5 0 0
3: a G4 0 0
4: a K5 0 0
5: b H1 0 0
6: b H2 0 0
7: b K1 1 1
8: b H5 0 0
9: b J7 0 0
10: c K2 0 0
11: c HH 0 0
12: c K7 1 1
13: c JL 0 0
14: c K5 0 0
15: d K7 0 0
16: d H5 0 0
17: e K5 0 0
18: e KO 0 0
19: f SS 0 0
20: f KK 0 0
21: g K2 1 1
22: g 00 0 0
23: g 00 0 0
24: g J9 0 0
另一种方式是:
# The := calls below need a data.table; setDT() converts df by reference and
# leaves it untouched if it already is one.
setDT(df)
# Row-wise encoding: 0 = code in neither list, 1 = code in code1,
# 2 = code in code2. No grouping is needed for this step.
df[, ind := (code %in% code1) + 2 * (code %in% code2)]
# Keep ind only for groups that contain all three kinds of rows (0, 1 and 2).
# This test must run per group: evaluated on the whole table it is always
# TRUE for this data and never zeroes any group.
df[, ind := ind * (length(unique(ind)) == 3), by = group]
# Within a group cummax(ind) equals 1 from the first code1 row up to, but not
# including, the first code2 row, so this keeps exactly the code1 rows that
# come before the group's first code2 row.
df[, indicator := ind * (cummax(ind) == 1), by = group]
# Drop the helper column.
df[, ind := NULL]
df
group code indicator
1: a G7 0
2: a G5 0
3: a G4 0
4: a K5 0
5: b H1 0
6: b H2 0
7: b K1 1
8: b H5 0
9: b J7 0
10: c K2 1
11: c HH 0
12: c K7 1
13: c JL 0
14: c K5 0
15: d K7 0
16: d H5 0
17: e K5 0
18: e KO 0
19: f SS 0
20: f KK 0
21: g K2 1
22: g 00 0
23: g 00 0
24: g J9 0
group code indicator
或
# dplyr version of the same rule: within each group, flag the code1 rows that
# come before the group's first code2 row.
df %>%
  group_by(group) %>%
  mutate(
    # 0 = code in neither list, 1 = code in code1, 2 = code in code2.
    ind = (code %in% code1) + 2 * (code %in% code2),
    # Zero out groups that do not contain all three kinds of rows (0, 1, 2).
    ind = ind * (length(unique(ind)) == 3),
    # cummax(ind) equals 1 from the group's first code1 row up to, but not
    # including, its first code2 row.
    indicator = ind * (cummax(ind) == 1),
    # Drop the helper column.
    ind = NULL
  ) %>%
  # Remove the grouping so later verbs on the result are not silently grouped.
  ungroup()