对行进行分组直到满足特定条件

Grouping rows until a certain condition is met

我对将行组合在一起直到满足特定条件有疑问。这是我的数据框。

| Gene     | directon       |intergenic_distance|
| -------- | -------------- |-------------------|
| fixA     | 11             |NA                 |
| fixB     | 11             |15                 |
| fixC     | 11             |51                 |
| fixX     | 11             |-3                 |
| kefF     | 11             |108                |
| kefC     | 11             |-7                 |
| apaH     | 12             |NA                 |
| apaG     | 12             |7                  |

我想在同一个 directon 内对行进行分组:每当 intergenic_distance > 50 时,从该行开始一个新的分组(operon),如下所示。

| Gene     | directon       |intergenic_distance|operon|
| -------- | -------------- |-------------------|------|
| fixA     | 11             |NA                 |1     |
| fixB     | 11             |15                 |1     |
| fixC     | 11             |51                 |2     |
| fixX     | 11             |-3                 |2     |
| kefF     | 11             |108                |3     |
| kefC     | 11             |-7                 |3     |
| apaH     | 12             |NA                 |4     |
| apaG     | 12             |7                  |4     |

我正在考虑使用 with, rle, rep, seq_along 但我不知道该怎么做。提前致谢!

dput(head(e_coli_operon,10))
structure(list(name = c("thrA", "thrB", "thrC", "yaaW", "yaaI", 
"mokC", "hokC", "insB", "insA", "yaaY"), start = c(337, 2801, 
3734, 10643, 11382, 16751, 16751, 19811, 20233, 21181), end = c(2799, 
3733, 5020, 11356, 11786, 16960, 16903, 20314, 20508, 21399), 
    strand = c(1, 1, 1, -1, -1, -1, -1, -1, -1, 1), length = c(820L, 
    310L, 428L, 237L, 134L, 69L, 50L, 167L, 91L, 72L), pid = c(16127996L, 
    16127997L, 16127998L, 16128005L, 16128007L, 16128012L, 49175991L, 
    16128015L, 16128016L, 16128018L), gene = c("thrA", "thrB", 
    "thrC", "yaaW", "yaaI", "mokC", "hokC", "insB", "insA", "yaaY"
    ), synonym = c("b0002", "b0003", "b0004", "b0011", "b0013", 
    "b0018", "b4412", "b0021", "b0022", "b0024"), code = c("-", 
    "-", "-", "-", "-", "-", "-", "-", "-", "-"), cog = c("COG0527E", 
    "COG0083E", "COG0498E", "COG4735S", "-", "-", "-", "COG1662L", 
    "COG3677L", "-"), product = c("fused aspartokinase I and homoserine dehydrogenase I", 
    "homoserine kinase", "threonine synthase", "conserved protein, UPF0174 family", 
    "conserved protein, UPF0412 family", "regulatory protein for HokC, overlaps CDS of hokC", 
    "toxic membrane protein, small", "IS1 transposase B", "IS1 repressor TnpA", 
    "predicted protein"), col = c("blue", "blue", "blue", "blue", 
    "blue", "blue", "blue", "blue", "blue", "blue"), fill = c("blue", 
    "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", 
    "blue"), lty = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), lwd = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), pch = c(8, 8, 8, 8, 8, 8, 8, 
    8, 8, 8), cex = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), gene_type = c("arrows", 
    "arrows", "arrows", "arrows", "arrows", "arrows", "arrows", 
    "arrows", "arrows", "arrows"), directon = c("1", "1", "1", 
    "4", "4", "6", "6", "8", "8", "9"), intergenic_distance = c(82, 
    2, 1, 149, 26, NA, -209, NA, -81, NA)), row.names = c(NA, 
-10L), groups = structure(list(directon = c("1", "4", "6", "8", 
"9"), .rows = structure(list(1:3, 4:5, 6:7, 8:9, 10L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, 5L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

您可以在以下情况下将 operon 值递增 1:

  • intergenic_distance 大于 50 并且不是 NA
  • 当前行的 directon 值与上一行的 directon 值不同。
library(dplyr)

# Number the operons: a new operon starts on every row where either
#   * intergenic_distance is not NA and is greater than 50, or
#   * directon differs from the previous row's directon.
# cumsum() over that logical flag counts the breaks seen so far, and + 1 makes
# the numbering start at 1 when the first row is not itself a break.
#
# ungroup() is required for data that is a grouped_df (the dput() output in
# the question is grouped by directon): within groups, lag() and cumsum()
# restart for every directon, so a directon change is never detected and the
# counter resets in each group. On a plain data.frame ungroup() is a no-op.
df %>%
  ungroup() %>%
  mutate(
    operon = cumsum(
      # Check is.na() first so a missing distance never counts as a break.
      (!is.na(intergenic_distance) & intergenic_distance > 50) |
        # default = first(directon) keeps row 1 from comparing against NA.
        directon != lag(directon, default = first(directon))
    ) + 1
  )

#  Gene directon intergenic_distance operon
#1 fixA       11                  NA      1
#2 fixB       11                  15      1
#3 fixC       11                  51      2
#4 fixX       11                  -3      2
#5 kefF       11                 108      3
#6 kefC       11                  -7      3
#7 apaH       12                  NA      4
#8 apaG       12                   7      4

数据

# Example data used by the answer: a plain data.frame with eight rows and the
# columns Gene (character), directon (integer) and intergenic_distance
# (integer, with NA in rows 1 and 7).
df <- data.frame(
  Gene = c("fixA", "fixB", "fixC", "fixX", "kefF", "kefC", "apaH", "apaG"),
  directon = rep(c(11L, 12L), times = c(6L, 2L)),
  intergenic_distance = c(
    NA_integer_, 15L, 51L, -3L, 108L, -7L, NA_integer_, 7L
  ),
  stringsAsFactors = FALSE
)