根据关联的数字对字符串中唯一出现的事件求和
Sum unique occurences in a string based on associated number
我使用单序列读取分类,并希望根据分类质量进行过滤。但是,需要更改输出格式才能执行此操作。我有一个分类统计数据(分数),如下所示,代表 ["taxonomy":"kmers assigned to that taxonomy" "taxonomy":"kmers assigned to that taxonomy" etc.],每个分类法可以出现多个次。
classification_stats<-c("3:1 7:4 0:34 3:7 0:27",
"0:110 561:19 0:37",
"0:3 562:5 0:7 543:55 0:47")
read_ID<-c("read1", "read2", "read3")
df<-data.frame(read_ID, classification_stats)
> df
read_ID classification_stats
1 read1 3:1 7:4 0:34 3:7 0:27
2 read2 0:110 561:19 0:37
3 read3 0:3 562:5 0:7 543:55 0:47
对于每次读取(每行),我想计算分配给分类法的 kmers 总数(在 classification_stats 中),但由于每个分类法不连续出现多次,这变得更加困难。这意味着例如read1 分类法 3 有 1+7 个 kmers,分类法 7 有 4 个 kmers,分类法 0 有 34 + 27 个 kmers。
我想要的输出看起来像这样,最好排序以便 tax1 是具有最多 kmers 的分类法。
read_ID classification_stats tax1 kmer1 tax2 kmer2 tax3 kmer3
read1 3:1 7:4 0:34 3:7 0:27 0 61 3 8 7 4
read2 0:110 561:19 0:37 0 147 561 19 NA NA
read3 0:3 562:5 0:7 543:55 0:47 0 57 543 55 562 5
R 或 bash 解决方案都很有趣。
library(tidyverse)
classification_stats <- c(
"3:1 7:4 0:34 3:7 0:27",
"0:110 561:19 0:37",
"0:3 562:5 0:7 543:55 0:47"
)
read_ID <- c("read1", "read2", "read3")
df <- tibble(read_ID, classification_stats)
df %>%
separate_rows(classification_stats, sep = " ") %>%
separate(classification_stats, into = c("tax", "kmer")) %>%
type_convert() %>%
arrange(-kmer) %>%
nest(tax) %>%
mutate(id = row_number()) %>%
unnest(data) %>%
pivot_wider(names_from = id, values_from = c(kmer, tax))
#> Warning: All elements of `...` must be named.
#> Did you want `data = tax`?
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> read_ID = col_character(),
#> tax = col_double(),
#> kmer = col_double()
#> )
#> # A tibble: 3 × 27
#> read_ID kmer_1 kmer_2 kmer_3 kmer_4 kmer_5 kmer_6 kmer_7 kmer_8 kmer_9 kmer_10
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 read2 110 NA NA 37 NA NA 19 NA NA NA
#> 2 read3 NA 55 47 NA NA NA NA NA 7 5
#> 3 read1 NA NA NA NA 34 27 NA 7 NA NA
#> # … with 16 more variables: kmer_11 <dbl>, kmer_12 <dbl>, kmer_13 <dbl>,
#> # tax_1 <dbl>, tax_2 <dbl>, tax_3 <dbl>, tax_4 <dbl>, tax_5 <dbl>,
#> # tax_6 <dbl>, tax_7 <dbl>, tax_8 <dbl>, tax_9 <dbl>, tax_10 <dbl>,
#> # tax_11 <dbl>, tax_12 <dbl>, tax_13 <dbl>
由 reprex package (v2.0.0)
于 2022 年 3 月 10 日创建
这里计算的是每次读取的 kmer 或分类单元出现的频率:
df %>%
separate_rows(classification_stats, sep = " ") %>%
separate(classification_stats, into = c("tax", "kmer")) %>%
type_convert() %>%
arrange(-kmer) %>%
nest(-tax) %>%
mutate(id = row_number()) %>%
unnest(data) %>%
pivot_wider(names_from = id, values_from = c(kmer, tax), values_fn = length)
#> Warning: All elements of `...` must be named.
#> Did you want `data = -tax`?
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> read_ID = col_character(),
#> tax = col_double(),
#> kmer = col_double()
#> )
#> # A tibble: 3 × 13
#> read_ID kmer_1 kmer_2 kmer_3 kmer_4 kmer_5 kmer_6 tax_1 tax_2 tax_3 tax_4
#> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 read2 2 NA 1 NA NA NA 2 NA 1 NA
#> 2 read3 3 1 NA NA 1 NA 3 1 NA NA
#> 3 read1 2 NA NA 2 NA 1 2 NA NA 2
#> # … with 2 more variables: tax_5 <int>, tax_6 <int>
这是 R 中的另一个解决方案。请注意,我在 read 3
中做了一些更改以测试我的代码(543:55
到 543:80
)
输入
df
read_ID classification_stats
1 read1 3:1 7:4 0:34 3:7 0:27
2 read2 0:110 561:19 0:37
3 read3 0:3 562:5 0:7 543:80 0:47
代码和输出
- 将
classification_stats
列分成单独的行,以白色 space 作为分隔符。
- 将结果列一分为二,用冒号“:”分隔。
- 然后创建一个
unique_tax
列,其中包含每个read_ID
中的tax
个数。这将用作 pivot_wider
中列名的一部分(告诉我们应该生成多少对 kmer
和 tax
)。
group_by(read_ID, tax)
以便一切都在该级别上运行
summarise
kmer
列汇总相同 read_ID
和 tax
下的所有值
- 先按
read_ID
排列数据,再按kmer
排列数据,这样我们就可以用row_number()
生成正确的索引对
- 一个
pivot_wider
在这个版本的代码中就足够了
left_join
合并 classification_stats
列
- 将列重新排序到您想要的位置
library(tidyverse)
left_join(
df %>%
separate_rows(!read_ID, sep = " ") %>%
separate(classification_stats, into = c("tax", "kmer")) %>%
group_by(read_ID, tax) %>%
summarize(kmer = sum(as.numeric(kmer))) %>%
arrange(read_ID, desc(kmer)) %>%
mutate(unique_tax = row_number()) %>%
pivot_wider(everything(), names_from = "unique_tax", values_from = c(tax, kmer)),
df,
by = "read_ID"
) %>%
select(read_ID, classification_stats, tax_1, kmer_1, tax_2, kmer_2, tax_3, kmer_3)
# A tibble: 3 × 8
# Groups: read_ID [3]
read_ID classification_stats tax_1 kmer_1 tax_2 kmer_2 tax_3 kmer_3
<chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 read1 3:1 7:4 0:34 3:7 0:27 0 61 3 8 7 4
2 read2 0:110 561:19 0:37 0 147 561 19 NA NA
3 read3 0:3 562:5 0:7 543:80 … 543 80 0 57 562 5
解决方案使用data.table
library(data.table)
setDT(df) # make the tibble a data.table
dt <- df[, .(tax_kmer = unlist(str_split(classification_stats, " "))), by = read_ID]
dt[, c("tax", "kmer") := tstrsplit(tax_kmer, ":")]
dt <- dt[, .(kmer = sum(as.numeric(kmer))), by = .(read_ID, tax)]
dt[, order := frankv(kmer, ties.method = "first", order = c(-1L)), by = read_ID]
dt <- dcast(dt, read_ID ~ order, value.var = c("kmer", "tax"), sep = "")
setcolorder(dt, c("read_ID", paste0(c("tax", "kmer"), rep(1:((ncol(dt)-1)/2), each = 2))))
结果
dt
# read_ID tax1 kmer1 tax2 kmer2 tax3 kmer3
# 1: read1 0 61 3 8 7 4
# 2: read2 0 147 561 19 <NA> NA
# 3: read3 0 57 543 55 562 5
数据
classification_stats <- c(
"3:1 7:4 0:34 3:7 0:27",
"0:110 561:19 0:37",
"0:3 562:5 0:7 543:55 0:47"
)
read_ID <- c("read1", "read2", "read3")
df <- tibble(read_ID, classification_stats)
分解代码
dt <- df[, .(tax_kmer = unlist(str_split(classification_stats, " "))), by = read_ID]
我们在这里为您的输出创建一个新的 table,并确保每一行都包含您的每个 ID 组的一对“tax:kmer”。
dt[, c("tax", "kmer") := tstrsplit(tax_kmer, ":")]
在这里,我们将您的 tax_kmer 对分成两个不同的列,以保存税值和 kmer 值。
dt <- dt[, .(kmer = sum(as.numeric(kmer))), by = .(read_ID, tax)]
总结每个 tax-ID 组合的 kmer 值。
dt[, order := frankv(kmer, ties.method = "first", order = c(-1L)), by = read_ID]
确定什么税成为 tax1、tax2 等的顺序的最安全方法是使用我存储在列“顺序”中的 kmer 总和(-1 从高到低的排名) “
dt <- dcast(dt, read_ID ~ order, value.var = c("kmer", "tax"), sep = "")
dcast 是“pivot_wider”的 data.table 变体,注意我们使用 order 进行 dcast 并使用来自 kmer 和 tax 的值。
setcolorder(dt, c("read_ID", paste0(c("tax", "kmer"), rep(1:((ncol(dt)-1)/2), each = 2))))
由于我们的 dcast 程序使用 id 对您的 table 列进行排序,然后是所有 tax 列,然后是所有 kmer 列,我们根据您的喜好动态设置顺序。我们查看列的总数(我们事先不知道我们找到了多少税?)并为 ID 减去一个,根据定义我们将它除以 2,因为它是 tax/kmer 的交替对。这将创建此向量 [1] "read_ID" "tax1" "kmer1" "tax2" "kmer2" "tax3" "kmer3"
,这正是我们要设置的列的顺序。
我使用单序列读取分类,并希望根据分类质量进行过滤。但是,需要更改输出格式才能执行此操作。我有一个分类统计数据(分数),如下所示,代表 ["taxonomy":"kmers assigned to that taxonomy" "taxonomy":"kmers assigned to that taxonomy" etc.],每个分类法可以出现多个次。
classification_stats<-c("3:1 7:4 0:34 3:7 0:27",
"0:110 561:19 0:37",
"0:3 562:5 0:7 543:55 0:47")
read_ID<-c("read1", "read2", "read3")
df<-data.frame(read_ID, classification_stats)
> df
read_ID classification_stats
1 read1 3:1 7:4 0:34 3:7 0:27
2 read2 0:110 561:19 0:37
3 read3 0:3 562:5 0:7 543:55 0:47
对于每次读取(每行),我想计算分配给分类法的 kmers 总数(在 classification_stats 中),但由于每个分类法不连续出现多次,这变得更加困难。这意味着例如read1 分类法 3 有 1+7 个 kmers,分类法 7 有 4 个 kmers,分类法 0 有 34 + 27 个 kmers。
我想要的输出看起来像这样,最好排序以便 tax1 是具有最多 kmers 的分类法。
read_ID classification_stats tax1 kmer1 tax2 kmer2 tax3 kmer3
read1 3:1 7:4 0:34 3:7 0:27 0 61 3 8 7 4
read2 0:110 561:19 0:37 0 147 561 19 NA NA
read3 0:3 562:5 0:7 543:55 0:47 0 57 543 55 562 5
R 或 bash 解决方案都很有趣。
library(tidyverse)
classification_stats <- c(
"3:1 7:4 0:34 3:7 0:27",
"0:110 561:19 0:37",
"0:3 562:5 0:7 543:55 0:47"
)
read_ID <- c("read1", "read2", "read3")
df <- tibble(read_ID, classification_stats)
df %>%
separate_rows(classification_stats, sep = " ") %>%
separate(classification_stats, into = c("tax", "kmer")) %>%
type_convert() %>%
arrange(-kmer) %>%
nest(tax) %>%
mutate(id = row_number()) %>%
unnest(data) %>%
pivot_wider(names_from = id, values_from = c(kmer, tax))
#> Warning: All elements of `...` must be named.
#> Did you want `data = tax`?
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> read_ID = col_character(),
#> tax = col_double(),
#> kmer = col_double()
#> )
#> # A tibble: 3 × 27
#> read_ID kmer_1 kmer_2 kmer_3 kmer_4 kmer_5 kmer_6 kmer_7 kmer_8 kmer_9 kmer_10
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 read2 110 NA NA 37 NA NA 19 NA NA NA
#> 2 read3 NA 55 47 NA NA NA NA NA 7 5
#> 3 read1 NA NA NA NA 34 27 NA 7 NA NA
#> # … with 16 more variables: kmer_11 <dbl>, kmer_12 <dbl>, kmer_13 <dbl>,
#> # tax_1 <dbl>, tax_2 <dbl>, tax_3 <dbl>, tax_4 <dbl>, tax_5 <dbl>,
#> # tax_6 <dbl>, tax_7 <dbl>, tax_8 <dbl>, tax_9 <dbl>, tax_10 <dbl>,
#> # tax_11 <dbl>, tax_12 <dbl>, tax_13 <dbl>
由 reprex package (v2.0.0)
于 2022 年 3 月 10 日创建这里计算的是每次读取的 kmer 或分类单元出现的频率:
df %>%
separate_rows(classification_stats, sep = " ") %>%
separate(classification_stats, into = c("tax", "kmer")) %>%
type_convert() %>%
arrange(-kmer) %>%
nest(-tax) %>%
mutate(id = row_number()) %>%
unnest(data) %>%
pivot_wider(names_from = id, values_from = c(kmer, tax), values_fn = length)
#> Warning: All elements of `...` must be named.
#> Did you want `data = -tax`?
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> read_ID = col_character(),
#> tax = col_double(),
#> kmer = col_double()
#> )
#> # A tibble: 3 × 13
#> read_ID kmer_1 kmer_2 kmer_3 kmer_4 kmer_5 kmer_6 tax_1 tax_2 tax_3 tax_4
#> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 read2 2 NA 1 NA NA NA 2 NA 1 NA
#> 2 read3 3 1 NA NA 1 NA 3 1 NA NA
#> 3 read1 2 NA NA 2 NA 1 2 NA NA 2
#> # … with 2 more variables: tax_5 <int>, tax_6 <int>
这是 R 中的另一个解决方案。请注意,我在 read 3
中做了一些更改以测试我的代码(543:55
到 543:80
)
输入
df
read_ID classification_stats
1 read1 3:1 7:4 0:34 3:7 0:27
2 read2 0:110 561:19 0:37
3 read3 0:3 562:5 0:7 543:80 0:47
代码和输出
- 将
classification_stats
列分成单独的行,以白色 space 作为分隔符。 - 将结果列一分为二,用冒号“:”分隔。
- 然后创建一个
unique_tax
列,其中包含每个read_ID
中的tax
个数。这将用作pivot_wider
中列名的一部分(告诉我们应该生成多少对kmer
和tax
)。 group_by(read_ID, tax)
以便一切都在该级别上运行summarise
kmer
列汇总相同read_ID
和tax
下的所有值
- 先按
read_ID
排列数据,再按kmer
排列数据,这样我们就可以用row_number()
生成正确的索引对 - 一个
pivot_wider
在这个版本的代码中就足够了 left_join
合并classification_stats
列- 将列重新排序到您想要的位置
library(tidyverse)
left_join(
df %>%
separate_rows(!read_ID, sep = " ") %>%
separate(classification_stats, into = c("tax", "kmer")) %>%
group_by(read_ID, tax) %>%
summarize(kmer = sum(as.numeric(kmer))) %>%
arrange(read_ID, desc(kmer)) %>%
mutate(unique_tax = row_number()) %>%
pivot_wider(everything(), names_from = "unique_tax", values_from = c(tax, kmer)),
df,
by = "read_ID"
) %>%
select(read_ID, classification_stats, tax_1, kmer_1, tax_2, kmer_2, tax_3, kmer_3)
# A tibble: 3 × 8
# Groups: read_ID [3]
read_ID classification_stats tax_1 kmer_1 tax_2 kmer_2 tax_3 kmer_3
<chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 read1 3:1 7:4 0:34 3:7 0:27 0 61 3 8 7 4
2 read2 0:110 561:19 0:37 0 147 561 19 NA NA
3 read3 0:3 562:5 0:7 543:80 … 543 80 0 57 562 5
解决方案使用data.table
library(data.table)
setDT(df) # make the tibble a data.table
dt <- df[, .(tax_kmer = unlist(str_split(classification_stats, " "))), by = read_ID]
dt[, c("tax", "kmer") := tstrsplit(tax_kmer, ":")]
dt <- dt[, .(kmer = sum(as.numeric(kmer))), by = .(read_ID, tax)]
dt[, order := frankv(kmer, ties.method = "first", order = c(-1L)), by = read_ID]
dt <- dcast(dt, read_ID ~ order, value.var = c("kmer", "tax"), sep = "")
setcolorder(dt, c("read_ID", paste0(c("tax", "kmer"), rep(1:((ncol(dt)-1)/2), each = 2))))
结果
dt
# read_ID tax1 kmer1 tax2 kmer2 tax3 kmer3
# 1: read1 0 61 3 8 7 4
# 2: read2 0 147 561 19 <NA> NA
# 3: read3 0 57 543 55 562 5
数据
classification_stats <- c(
"3:1 7:4 0:34 3:7 0:27",
"0:110 561:19 0:37",
"0:3 562:5 0:7 543:55 0:47"
)
read_ID <- c("read1", "read2", "read3")
df <- tibble(read_ID, classification_stats)
分解代码
dt <- df[, .(tax_kmer = unlist(str_split(classification_stats, " "))), by = read_ID]
我们在这里为您的输出创建一个新的 table,并确保每一行都包含您的每个 ID 组的一对“tax:kmer”。
dt[, c("tax", "kmer") := tstrsplit(tax_kmer, ":")]
在这里,我们将您的 tax_kmer 对分成两个不同的列,以保存税值和 kmer 值。
dt <- dt[, .(kmer = sum(as.numeric(kmer))), by = .(read_ID, tax)]
总结每个 tax-ID 组合的 kmer 值。
dt[, order := frankv(kmer, ties.method = "first", order = c(-1L)), by = read_ID]
确定什么税成为 tax1、tax2 等的顺序的最安全方法是使用我存储在列“顺序”中的 kmer 总和(-1 从高到低的排名) “
dt <- dcast(dt, read_ID ~ order, value.var = c("kmer", "tax"), sep = "")
dcast 是“pivot_wider”的 data.table 变体,注意我们使用 order 进行 dcast 并使用来自 kmer 和 tax 的值。
setcolorder(dt, c("read_ID", paste0(c("tax", "kmer"), rep(1:((ncol(dt)-1)/2), each = 2))))
由于我们的 dcast 程序使用 id 对您的 table 列进行排序,然后是所有 tax 列,然后是所有 kmer 列,我们根据您的喜好动态设置顺序。我们查看列的总数(我们事先不知道我们找到了多少税?)并为 ID 减去一个,根据定义我们将它除以 2,因为它是 tax/kmer 的交替对。这将创建此向量 [1] "read_ID" "tax1" "kmer1" "tax2" "kmer2" "tax3" "kmer3"
,这正是我们要设置的列的顺序。