创建一个 table 记录R中不为零的行对数

Create a table that records the number of row pairs that are not zero in R

如果标题令人困惑,我深表歉意;下面是我想要实现的目标。假设我有一个如下所示的表。

# Example data: one row per patient, one numeric column per gene.
# A value of 0 means the gene has no signal for that patient.
df <- data.frame(
  patient = paste0("patient", seq_len(6)),
  gene_1  = c(10, 5, 0, 0, 1, 0),
  gene_2  = c(0, 26, 4, 5, 6, 1),
  gene_3  = c(1, 3, 5, 12, 44, 1)
)
patient gene_1 gene_2 gene_3
patient1 10 0 1
patient2 5 26 3
patient3 0 4 5
patient4 0 5 12
patient5 1 6 44
patient6 0 1 1

我想要的是另一个表:对每一对基因,统计两个值都不为零的行(患者)数。该表看起来像这样:

col1 col2 number-of-pairs
gene1 gene2 2
gene1 gene3 3
gene2 gene3 5

感谢任何帮助。谢谢。

我们可以通过将您的数据转换为长格式,执行 self-join,然后过滤来实现此目的:

library(tidyr)
library(dplyr)
## Long format: one row per (patient, gene), keeping only the non-zero values
long_data <- pivot_longer(df, -patient) %>%
  filter(value != 0) %>%
  select(-value)

## Self-join on patient so that every pair of non-zero genes belonging to the
## same patient ends up on one row.
## The join is many-to-many by design; declaring it explicitly avoids the
## warning raised by dplyr >= 1.1.0 (the `relationship` argument needs that
## version).
## `name.x < name.y` drops self-pairs (a gene cannot pair with itself) and
## keeps each unordered pair exactly once, so no double counting.
long_data %>%
  left_join(long_data, by = "patient", relationship = "many-to-many") %>%
  filter(name.x < name.y) %>%
  count(name.x, name.y)
# # A tibble: 3 × 3
#   name.x name.y     n
#   <chr>  <chr>  <int>
# 1 gene_1 gene_2     2
# 2 gene_1 gene_3     3
# 3 gene_2 gene_3     5

下面的做法可以得到您需要的结果,但由于每一对基因都需要手动写一遍,我不确定它是否适合您的实际情况。

# For each hard-coded gene pair, count the rows where both genes are non-zero.
# Each result is a 1 x 1 data frame with a single column `number_of_pairs`.
gene1_gene2 <- df %>%
  filter(gene_1 != 0 & gene_2 != 0) %>%
  count() %>%
  rename(number_of_pairs = n)

gene1_gene3 <- df %>%
  filter(gene_1 != 0 & gene_3 != 0) %>%
  count() %>%
  rename(number_of_pairs = n)

gene2_gene3 <- df %>%
  filter(gene_2 != 0 & gene_3 != 0) %>%
  count() %>%
  rename(number_of_pairs = n)

# Stack the three counts in the same order as the labels below.
number_of_pairs <- rbind(gene1_gene2, gene1_gene3, gene2_gene3)

new_df <- data.frame(
  col1 = c("gene1", "gene1", "gene2"),
  col2 = c("gene2", "gene3", "gene3")
)

# Store the counts as a plain integer column. Assigning the whole
# `number_of_pairs` data frame would nest a data frame inside `new_df`.
new_df$number_of_pairs <- number_of_pairs$number_of_pairs

new_df
  col1  col2 number_of_pairs
1 gene1 gene2               2
2 gene1 gene3               3
3 gene2 gene3               5

您可以使用 combn,在一条不间断的管道中完成此操作:
library(tidyverse) 

# Count, for every pair of genes, the patients in which both are non-zero.
df %>%
  pivot_longer(-patient) %>%
  # "Non-zero" is the stated requirement, so test != 0 rather than > 0.
  filter(value != 0) %>%
  group_by(patient) %>%
  # combn(x, 2) errors when fewer than two genes remain for a patient.
  filter(n() >= 2) %>%
  # Keep one row per patient (list column) and expand afterwards; returning
  # several rows per group from summarise() is deprecated in dplyr >= 1.1.0.
  summarise(pairs = list(combn(name, 2, FUN = paste, collapse = " ")),
            .groups = "drop") %>%
  unnest(cols = pairs) %>%
  separate(pairs, sep = " ", into = c("col1", "col2")) %>%
  count(col1, col2)
#>  # A tibble: 3 x 3
#>    col1   col2       n
#>    <chr>  <chr>  <int>
#>  1 gene_1 gene_2     2
#>  2 gene_1 gene_3     3
#>  3 gene_2 gene_3     5

您可以做一个简单的 for 循环,在其中访问 df 的每一列,将这些列强制转换为 > 0 的逻辑向量,然后使用 & 运算符在两者中找到所有 >0 的位置。如果您不知道,可以在逻辑向量上使用 sum 来计算有多少个 TRUE 值。

# Example data: one row per patient, one numeric column per gene.
df <- data.frame(
  patient = paste0("patient", seq_len(6)),
  gene_1  = c(10, 5, 0, 0, 1, 0),
  gene_2  = c(0, 26, 4, 5, 6, 1),
  gene_3  = c(1, 3, 5, 12, 44, 1)
)

# Every column except the patient identifier is treated as a gene.
gene_cols <- setdiff(colnames(df), "patient")

# One row per unordered gene pair; the columns are named V1 and V2.
out <- as.data.frame(t(combn(gene_cols, 2)))

# Logical matrix (rows = patients, columns = genes): TRUE where the value > 0.
is_positive <- df[gene_cols] > 0

# For each pair, count the patients where both genes are > 0.
# sum() on a logical vector counts its TRUE entries.
pairs <- integer(nrow(out))
for (i in seq_along(pairs)) {
  pairs[i] <- sum(is_positive[, out$V1[i]] & is_positive[, out$V2[i]])
}

out$n_pairs <- pairs
out
#>       V1     V2 n_pairs
#> 1 gene_1 gene_2       2
#> 2 gene_1 gene_3       3
#> 3 gene_2 gene_3       5

由 reprex 包(v2.0.1)于 2022-04-07 创建

一种只用一条语句的基础 R 写法:

# For each row, list every pair of genes that are both non-zero, then tabulate
# the pairs over all rows. Rows with fewer than two non-zero genes contribute
# nothing: without the length guard combn() would stop with "n < m".
# which() also keeps NA values out of the non-zero set.
table(unlist(apply(df[-1], 1, \(x) {
  expressed <- names(x)[which(x != 0)]
  if (length(expressed) >= 2) combn(expressed, m = 2, FUN = toString)
})))

# gene_1, gene_2 gene_1, gene_3 gene_2, gene_3 
#              2              3              5 

您可以使用它来获得预期的输出:

# Same idea as the base R version, reshaped into a col1 / col2 / n table.
# Rows with fewer than two non-zero genes are skipped: without the length
# guard combn() would stop with "n < m". which() also keeps NA values out of
# the non-zero set.
tibble(col = unlist(apply(df[-1], 1, \(x) {
  expressed <- names(x)[which(x != 0)]
  if (length(expressed) >= 2) combn(expressed, m = 2, FUN = toString)
}))) %>%
  # toString() joins the two gene names with ", ", so split on that.
  separate(col, into = c("col1", "col2"), sep = ", ") %>%
  count(col1, col2)

# A tibble: 3 x 3
  col1   col2       n
  <chr>  <chr>  <int>
1 gene_1 gene_2     2
2 gene_1 gene_3     3
3 gene_2 gene_3     5

这是另一种基本的 R 方法。虽然看起来不太优雅,但实际上是目前为止效率最高的答案。

首先创建一个包含所有基因对的矩阵 combn_gene。然后使用 sapply 遍历每一对基因,检查两列之和是否等于其中任一列的原始值(如果一列为 0,则两列之和与另一列的原始值相同)。最后统计两列之和与两列原始值都不相同的行数,即两列均为非零值的行数。

# Matrix with one row per unordered gene pair (every column of `df` except
# the first is treated as a gene).
combn_gene <- t(combn(colnames(df)[-1], 2))

# Label the pairs as col1 / col2 and attach the per-pair count.
# A row counts for a pair when the sum of the two values differs from both
# individual values, i.e. neither value is zero.
# NOTE(review): this relies on exact `!=` comparisons of sums, so it presumably
# assumes values for which x + y != x whenever y != 0 - confirm for non-integer
# data.
cbind(
  setNames(as.data.frame(combn_gene), c("col1", "col2")),
  "number-of-pairs" = sapply(seq_len(nrow(combn_gene)), function(k) {
    first_gene <- df[combn_gene[k, 1]]
    second_gene <- df[combn_gene[k, 2]]
    pair_total <- first_gene + second_gene
    colSums(first_gene != pair_total & second_gene != pair_total)
  })
)

    col1   col2 number-of-pairs
1 gene_1 gene_2               2
2 gene_1 gene_3               3
3 gene_2 gene_3               5

另一个tidyverse方法可能是:

# Build one summary row per pair of gene columns and row-bind the results.
map_dfr(
  .x = combn(names(select(df, starts_with("gene"))), 2, simplify = FALSE),
  .f = \(pair) {
    summarise(
      df,
      col1 = pair[1],
      col2 = pair[2],
      # A row counts when both selected columns are non-zero, i.e. the
      # row-wise count of non-zero values equals 2.
      number = sum(rowSums(across(all_of(pair)) != 0) == 2)
    )
  }
)

    col1   col2 number
1 gene_1 gene_2      2
2 gene_1 gene_3      3
3 gene_2 gene_3      5