识别哪个代码属于哪个组

Identify which code belongs to which group

数据

# Sample data: one row per observation, ordered within each group.
# Note that row 23 holds " 00" (with a leading space), which is a different
# string from the "00" in row 22.
df <- data.frame(
  group = rep(
    c("a", "b", "c", "d", "e", "f", "g"),
    times = c(4, 5, 5, 2, 2, 2, 4)
  ),
  code = c(
    "G7", "G5", "G4", "K5",       # group a
    "H1", "H2", "K1", "H5", "J7", # group b
    "K2", "HH", "K7", "JL", "K5", # group c
    "K7", "H5",                   # group d
    "K5", "KO",                   # group e
    "SS", "KK",                   # group f
    "K2", "00", " 00", "J9"       # group g
  )
)

# First code set looked up by the snippets that follow.
code1 <- c("K2", "K1", "K5", "K7")

# Second code set; "JL" is listed twice, exactly as in the original data.
code2 <- c(
  "J2", "J1", "J5", "J7", "J9",
  "JH", "JP", "JL", "JJ", "JL"
)

期望的输出

    group code indicator
 1:     a   G7         0
 2:     a   G5         0
 3:     a   G4         0
 4:     a   K5         0
 5:     b   H1         0
 6:     b   H2         0
 7:     b   K1         1
 8:     b   H5         0
 9:     b   J7         0
10:     c   K2         1
11:     c   HH         0
12:     c   K7         1
13:     c   JL         0
14:     c   K5         0
15:     d   K7         0
16:     d   H5         0
17:     e   K5         0
18:     e   KO         0
19:     f   SS         0
20:     f   KK         0
21:     g   K2         1
22:     g   00         0
23:     g   00         0
24:     g   J9         0

我很想在 data.table 中找到解决方案,但其他实现也不错。我有一个大数据集。出于速度考虑,我宁愿避免循环。

这里有一个选项:

# Convert df to a data.table by reference so the := updates below work.
setDT(df)
# Add two columns: rn holds the row number (.I) and ind is an integer flag
# that starts at 0 for every row.
df[, c("rn", "ind") := .(.I, 0L)]
# Only rows whose code is in code1 are updated. Those rows (.SD) are joined
# against the rows whose code is in code2, matching on the same group and a
# strictly larger rn, i.e. code2 rows that appear later in the same group.
# by = .EACHI returns one .N per code1 row; with mult = "first" the printed
# output shows this is 1 when a later code2 row exists and 0 otherwise.
df[code %chin% code1, ind := 
    df[code %chin% code2][.SD, on=.(group, rn>rn), mult="first", .N, by=.EACHI]$N
]

输出(注意 group=c 的第一行也应该是 1,因为后面有一个 JL)

    group code indicator rn ind
 1:     a   G7         0  1   0
 2:     a   G5         0  2   0
 3:     a   G4         0  3   0
 4:     a   K5         0  4   0
 5:     b   H1         0  5   0
 6:     b   H2         0  6   0
 7:     b   K1         1  7   1
 8:     b   H5         0  8   0
 9:     b   J7         0  9   0
10:     c   K2         0 10   1
11:     c   HH         0 11   0
12:     c   K7         1 12   1
13:     c   JL         0 13   0
14:     c   K5         0 14   0
15:     d   K7         0 15   0
16:     d   H5         0 16   0
17:     e   K5         0 17   0
18:     e   KO         0 18   0
19:     f   SS         0 19   0
20:     f   KK         0 20   0
21:     g   K2         1 21   1
22:     g   00         0 22   0
23:     g   00         0 23   0
24:     g   J9         0 24   0
    group code indicator rn ind

一个可能的data.table选项

# Flag a code1 row with 1 when the next code1/code2 row in the SAME group is
# a code2 row; every other row gets 0.
#
# Step 1 (whole table, row-wise): classify each code as 1 (in code1),
#   2 (in code2) or NA (in neither). stack() builds a values/ind lookup and
#   as.integer() turns the factor levels "1"/"2" into the integers 1/2.
# Step 2 (per group): carry the next known class backwards ("nocb") and mark
#   the rows where the class steps from 1 to 2, i.e. diff == 1.
#
# The backward fill must run per group. Filling over the whole table would
# let a code2 row from a later group leak back across the group boundary and
# wrongly flag a code1 row that has no code2 after it in its own group.
setDT(df)[, indicator2 := as.integer(
  with(
    stack(list("1" = code1, "2" = code2)),
    ind[match(code, values)]
  )
)][, indicator2 := {
  next_class <- nafill(indicator2, type = "nocb")
  # The last row of a group has no successor, so it is padded with FALSE;
  # NA differences (no later code1/code2 row in the group) become 0.
  nafill(as.integer(c(diff(next_class) == 1L, FALSE)), fill = 0L)
}, by = group][]

结果为

    group code indicator indicator2
 1:     a   G7         0          0
 2:     a   G5         0          0
 3:     a   G4         0          0
 4:     a   K5         0          0
 5:     b   H1         0          0
 6:     b   H2         0          0
 7:     b   K1         1          1
 8:     b   H5         0          0
 9:     b   J7         0          0
10:     c   K2         0          0
11:     c   HH         0          0
12:     c   K7         1          1
13:     c   JL         0          0
14:     c   K5         0          0
15:     d   K7         0          0
16:     d   H5         0          0
17:     e   K5         0          0
18:     e   KO         0          0
19:     f   SS         0          0
20:     f   KK         0          0
21:     g   K2         1          1
22:     g   00         0          0
23:     g   00         0          0
24:     g   J9         0          0

另一种方式是:

# Classify each row: 1 if code is in code1, 2 if it is in code2, 0 otherwise.
# Then, within each group, flag a code1 row (ind == 1) with 1 only when
#   * no code2 row has been seen yet in the group (cummax(ind) != 2), and
#   * the group contains at least one code2 row (any(ind == 2)).
# The group-level test has to be evaluated per group: evaluated on the whole
# table it is always TRUE, which flags code1 rows in groups that contain no
# code2 at all. Testing any(ind == 2) rather than "three distinct classes"
# also keeps groups that consist only of code1/code2 rows working.
df[, ind := (code %in% code1) + 2 * (code %in% code2)][
  ,
  indicator := as.integer(ind == 1 & cummax(ind) != 2 & any(ind == 2)),
  by = group
][, ind := NULL]

df
    group code indicator
 1:     a   G7         0
 2:     a   G5         0
 3:     a   G4         0
 4:     a   K5         0
 5:     b   H1         0
 6:     b   H2         0
 7:     b   K1         1
 8:     b   H5         0
 9:     b   J7         0
10:     c   K2         1
11:     c   HH         0
12:     c   K7         1
13:     c   JL         0
14:     c   K5         0
15:     d   K7         0
16:     d   H5         0
17:     e   K5         0
18:     e   KO         0
19:     f   SS         0
20:     f   KK         0
21:     g   K2         1
22:     g   00         0
23:     g   00         0
24:     g   J9         0
    group code indicator

# dplyr version: within each group, flag a code1 row with 1 when it comes
# before the first code2 row of that group, and 0 otherwise.
#   ind: 1 if code is in code1, 2 if it is in code2, 0 otherwise.
#   cummax(ind) != 2: no code2 row has been seen yet in the group.
#   any(ind == 2): the group contains a code2 row at all. This replaces a
#     "three distinct classes" test, which wrongly zeroed groups made up
#     only of code1/code2 rows.
# ungroup() is added so the result does not stay grouped by accident.
df %>%
  group_by(group) %>%
  mutate(
    ind = (code %in% code1) + 2 * (code %in% code2),
    indicator = as.integer(ind == 1 & cummax(ind) != 2 & any(ind == 2)),
    ind = NULL
  ) %>%
  ungroup()