对行进行分组直到满足特定条件
Grouping rows until certain conditions are met
我对将行组合在一起直到满足特定条件有疑问。这是我的数据框。
| Gene | directon |intergenic_distance|
| -------- | -------------- |-------------------|
| fixA | 11 |NA |
| fixB | 11 |15 |
| fixC | 11 |51 |
| fixX | 11 |-3 |
| kefF | 11 |108 |
| kefC | 11 |-7 |
| apaH | 12 |NA |
| apaG | 12 |7 |
我想在同一个 directon 内对行进行分组,并且每当 intergenic_distance > 50 时就开始一个新的分组,如下所示。
| Gene | directon |intergenic_distance|operon|
| -------- | -------------- |-------------------|------|
| fixA | 11 |NA |1 |
| fixB | 11 |15 |1 |
| fixC | 11 |51 |2 |
| fixX | 11 |-3 |2 |
| kefF | 11 |108 |3 |
| kefC | 11 |-7 |3 |
| apaH | 12 |NA |4 |
| apaG | 12 |7 |4 |
我正在考虑使用 with, rle, rep, seq_along 但我不知道该怎么做。提前致谢!
# Reproducible sample of the asker's real data: the first 10 rows of
# e_coli_operon, serialised with dput().
dput(head(e_coli_operon,10))
# Output of the call above. Per its class attribute this is a grouped_df
# (tibble) grouped by `directon`; `directon` is stored as character and
# `intergenic_distance` contains NA values.
structure(list(name = c("thrA", "thrB", "thrC", "yaaW", "yaaI",
"mokC", "hokC", "insB", "insA", "yaaY"), start = c(337, 2801,
3734, 10643, 11382, 16751, 16751, 19811, 20233, 21181), end = c(2799,
3733, 5020, 11356, 11786, 16960, 16903, 20314, 20508, 21399),
strand = c(1, 1, 1, -1, -1, -1, -1, -1, -1, 1), length = c(820L,
310L, 428L, 237L, 134L, 69L, 50L, 167L, 91L, 72L), pid = c(16127996L,
16127997L, 16127998L, 16128005L, 16128007L, 16128012L, 49175991L,
16128015L, 16128016L, 16128018L), gene = c("thrA", "thrB",
"thrC", "yaaW", "yaaI", "mokC", "hokC", "insB", "insA", "yaaY"
), synonym = c("b0002", "b0003", "b0004", "b0011", "b0013",
"b0018", "b4412", "b0021", "b0022", "b0024"), code = c("-",
"-", "-", "-", "-", "-", "-", "-", "-", "-"), cog = c("COG0527E",
"COG0083E", "COG0498E", "COG4735S", "-", "-", "-", "COG1662L",
"COG3677L", "-"), product = c("fused aspartokinase I and homoserine dehydrogenase I",
"homoserine kinase", "threonine synthase", "conserved protein, UPF0174 family",
"conserved protein, UPF0412 family", "regulatory protein for HokC, overlaps CDS of hokC",
"toxic membrane protein, small", "IS1 transposase B", "IS1 repressor TnpA",
"predicted protein"), col = c("blue", "blue", "blue", "blue",
"blue", "blue", "blue", "blue", "blue", "blue"), fill = c("blue",
"blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue",
"blue"), lty = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), lwd = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1), pch = c(8, 8, 8, 8, 8, 8, 8,
8, 8, 8), cex = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), gene_type = c("arrows",
"arrows", "arrows", "arrows", "arrows", "arrows", "arrows",
"arrows", "arrows", "arrows"), directon = c("1", "1", "1",
"4", "4", "6", "6", "8", "8", "9"), intergenic_distance = c(82,
2, 1, 149, 26, NA, -209, NA, -81, NA)), row.names = c(NA,
-10L), groups = structure(list(directon = c("1", "4", "6", "8",
"9"), .rows = structure(list(1:3, 4:5, 6:7, 8:9, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 5L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
当满足以下任一条件时,将 operon 值递增 1:
- intergenic_distance 大于 50 且不是 NA;或
- 当前行的 directon 值与上一行的 directon 值不同。
library(dplyr)

# Assign a running operon id: a new operon starts on every row whose
# intergenic_distance is a non-NA value above 50, or whose directon differs
# from the previous row's directon.
# Note: ids start at 1, unless the very first row already exceeds the
# threshold, in which case the first id is 2.
df %>%
  # Drop any existing grouping first. On a grouped_df (e.g. grouped by
  # directon) lag() and cumsum() would run per group and the ids would
  # restart in every group. This is a no-op on a plain data frame.
  ungroup() %>%
  mutate(
    operon = cumsum(
      # NA distances never open a new operon.
      (!is.na(intergenic_distance) & intergenic_distance > 50) |
        # default = first(directon) keeps the first row from counting as a
        # directon change.
        directon != lag(directon, default = first(directon))
    ) + 1
  )
# Gene directon intergenic_distance operon
#1 fixA 11 NA 1
#2 fixB 11 15 1
#3 fixC 11 51 2
#4 fixX 11 -3 2
#5 kefF 11 108 3
#6 kefC 11 -7 3
#7 apaH 12 NA 4
#8 apaG 12 7 4
数据
# Example data from the question: 8 genes spread over two directons (11, 12).
df <- data.frame(
  Gene = c("fixA", "fixB", "fixC", "fixX", "kefF", "kefC", "apaH", "apaG"),
  directon = rep(c(11L, 12L), times = c(6L, 2L)),
  intergenic_distance = c(NA, 15L, 51L, -3L, 108L, -7L, NA, 7L),
  stringsAsFactors = FALSE
)
我对将行组合在一起直到满足特定条件有疑问。这是我的数据框。
| Gene | directon |intergenic_distance|
| -------- | -------------- |-------------------|
| fixA | 11 |NA |
| fixB | 11 |15 |
| fixC | 11 |51 |
| fixX | 11 |-3 |
| kefF | 11 |108 |
| kefC | 11 |-7 |
| apaH | 12 |NA |
| apaG | 12 |7 |
我想在同一个 directon 内对行进行分组,并且每当 intergenic_distance > 50 时就开始一个新的分组,如下所示。
| Gene | directon |intergenic_distance|operon|
| -------- | -------------- |-------------------|------|
| fixA | 11 |NA |1 |
| fixB | 11 |15 |1 |
| fixC | 11 |51 |2 |
| fixX | 11 |-3 |2 |
| kefF | 11 |108 |3 |
| kefC | 11 |-7 |3 |
| apaH | 12 |NA |4 |
| apaG | 12 |7 |4 |
我正在考虑使用 with, rle, rep, seq_along 但我不知道该怎么做。提前致谢!
# Reproducible sample of the asker's real data: the first 10 rows of
# e_coli_operon, serialised with dput().
dput(head(e_coli_operon,10))
# Output of the call above. Per its class attribute this is a grouped_df
# (tibble) grouped by `directon`; `directon` is stored as character and
# `intergenic_distance` contains NA values.
structure(list(name = c("thrA", "thrB", "thrC", "yaaW", "yaaI",
"mokC", "hokC", "insB", "insA", "yaaY"), start = c(337, 2801,
3734, 10643, 11382, 16751, 16751, 19811, 20233, 21181), end = c(2799,
3733, 5020, 11356, 11786, 16960, 16903, 20314, 20508, 21399),
strand = c(1, 1, 1, -1, -1, -1, -1, -1, -1, 1), length = c(820L,
310L, 428L, 237L, 134L, 69L, 50L, 167L, 91L, 72L), pid = c(16127996L,
16127997L, 16127998L, 16128005L, 16128007L, 16128012L, 49175991L,
16128015L, 16128016L, 16128018L), gene = c("thrA", "thrB",
"thrC", "yaaW", "yaaI", "mokC", "hokC", "insB", "insA", "yaaY"
), synonym = c("b0002", "b0003", "b0004", "b0011", "b0013",
"b0018", "b4412", "b0021", "b0022", "b0024"), code = c("-",
"-", "-", "-", "-", "-", "-", "-", "-", "-"), cog = c("COG0527E",
"COG0083E", "COG0498E", "COG4735S", "-", "-", "-", "COG1662L",
"COG3677L", "-"), product = c("fused aspartokinase I and homoserine dehydrogenase I",
"homoserine kinase", "threonine synthase", "conserved protein, UPF0174 family",
"conserved protein, UPF0412 family", "regulatory protein for HokC, overlaps CDS of hokC",
"toxic membrane protein, small", "IS1 transposase B", "IS1 repressor TnpA",
"predicted protein"), col = c("blue", "blue", "blue", "blue",
"blue", "blue", "blue", "blue", "blue", "blue"), fill = c("blue",
"blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue",
"blue"), lty = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), lwd = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1), pch = c(8, 8, 8, 8, 8, 8, 8,
8, 8, 8), cex = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), gene_type = c("arrows",
"arrows", "arrows", "arrows", "arrows", "arrows", "arrows",
"arrows", "arrows", "arrows"), directon = c("1", "1", "1",
"4", "4", "6", "6", "8", "8", "9"), intergenic_distance = c(82,
2, 1, 149, 26, NA, -209, NA, -81, NA)), row.names = c(NA,
-10L), groups = structure(list(directon = c("1", "4", "6", "8",
"9"), .rows = structure(list(1:3, 4:5, 6:7, 8:9, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 5L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
当满足以下任一条件时,将 operon 值递增 1:
- intergenic_distance 大于 50 且不是 NA;或
- 当前行的 directon 值与上一行的 directon 值不同。
library(dplyr)

# Assign a running operon id: a new operon starts on every row whose
# intergenic_distance is a non-NA value above 50, or whose directon differs
# from the previous row's directon.
# Note: ids start at 1, unless the very first row already exceeds the
# threshold, in which case the first id is 2.
df %>%
  # Drop any existing grouping first. On a grouped_df (e.g. grouped by
  # directon) lag() and cumsum() would run per group and the ids would
  # restart in every group. This is a no-op on a plain data frame.
  ungroup() %>%
  mutate(
    operon = cumsum(
      # NA distances never open a new operon.
      (!is.na(intergenic_distance) & intergenic_distance > 50) |
        # default = first(directon) keeps the first row from counting as a
        # directon change.
        directon != lag(directon, default = first(directon))
    ) + 1
  )
# Gene directon intergenic_distance operon
#1 fixA 11 NA 1
#2 fixB 11 15 1
#3 fixC 11 51 2
#4 fixX 11 -3 2
#5 kefF 11 108 3
#6 kefC 11 -7 3
#7 apaH 12 NA 4
#8 apaG 12 7 4
数据
# Example data from the question: 8 genes spread over two directons (11, 12).
df <- data.frame(
  Gene = c("fixA", "fixB", "fixC", "fixX", "kefF", "kefC", "apaH", "apaG"),
  directon = rep(c(11L, 12L), times = c(6L, 2L)),
  intergenic_distance = c(NA, 15L, 51L, -3L, 108L, -7L, NA, 7L),
  stringsAsFactors = FALSE
)