R：将分组数据转换为无向的源-目标（source/target）格式以用于网络可视化

Question

我有这个数据集：

# Example data: one row per (cluster.id, letter) observation together with the
# category (`cat`) it belongs to. The unnamed first column of the text becomes
# the row names because the header has one field fewer than the data rows.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
test.set <- read.table(text = "
                            cluster.id   letters  cat
                       1          5       A       pink
                       2          4       B       blue
                       3          4       B       blue
                       4          3       A       pink
                       5          3       E       pink
                       6          3       D       pink
                       7          3       C       pink
                       8          2       A       blue
                       9          2       E       blue
                       10         1       A       green
                       11         0       A       pink
                       12         0       E       pink", header = TRUE, stringsAsFactors = FALSE)

我想知道哪些字母最终落在哪个簇中，同时保留它们所属的 cat 信息。

只有包含超过 1 个不同字母的簇才相关（例如，簇 4 仅包含字母 B，因此不相关）。让我们先筛选出所有至少包含两个不同字母的簇：

 # Keep only the clusters that hold at least two distinct letters.
 # `letter.count` stores the number of distinct letters found in each cluster.
 x <- test.set %>%
   group_by(cluster.id) %>%
   mutate(letter.count = n_distinct(letters)) %>%
   filter(letter.count > 1) %>%
   ungroup()



        cluster.id letters   cat letter.count
       <int>   <chr> <chr>        <int>
1          3       A  pink            4
2          3       E  pink            4
3          3       D  pink            4
4          3       C  pink            4
5          2       A  blue            2
6          2       E  blue            2
7          0       A  pink            2
8          0       E  pink            2

然后我想据此构建一个源-目标（source/target）矩阵或数据框。生成的网络是无向的，所以同一对字母只保留一个方向：例如只要 A - E，而不再包含 E - A。这应当得到如下 source/target 矩阵（如果我手工算得没错 ;)）：

source  target  weight  cat

A   E   2   pink
A   E   1   blue
A   D   1   pink
A   C   1   pink
E   D   1   pink
E   C   1   pink
D   C   1   pink

我确定有某个库（或某个非常简单的技巧）可以做到这一点，但我没能找出来。有没有什么巧妙或简单的方法可以实现？

半个伪代码示例：

# For each cluster in test_set:
# 1. build every ordered pair of letters and drop the (self, self) pairs
combinations <- expand.grid(letters, letters) %>%
  unique() %>%
  filter(.$Var1 != .$Var2)
# 2. keep one row per unordered pair: sorting the two members gives A-E and
#    E-A the same key, so the second occurrence is flagged as a duplicate
combinations <- combinations[
  !duplicated(
    apply(combinations, 1, function(pair) paste(sort(pair), collapse = ""))
  ),
]
# 3. check whether the letter combination + cat is already in the data frame
#    if not, add it with weight = 1 (e.g. source: A, target: E, weight: 1, cat: pink)
#    else increase the weight by 1 (e.g. source: A, target: E, weight + 1, cat: pink)

对评论的反应
@单簧管演奏家
也许是我没说清楚，但您的代码存在一个错误。下面两组输入只有最后一个簇的 id 不同（第一组是 10469，第二组是 10）：

# Example input 1 (tab-separated): both multi-letter clusters have
# five-digit ids (10463 and 10469).
txt <- paste(
  "cluster.id\tletters\tcat",
  "10283\tklebsiella pneumoniae\tprotein1",
  "10463\tescherichia coli\tundefined",
  "10463\tmycobacterium tuberculosis\tundefined",
  "10469\tescherichia coli\tundefined",
  "10469\tmycobacterium tuberculosis\tundefined",
  sep = "\n"
)

# Example input 2 (tab-separated): same rows as input 1, except that the
# last cluster id is 10 instead of 10469.
txt <- paste(
  "cluster.id\tletters\tcat",
  "10283\tklebsiella pneumoniae\tprotein1",
  "10463\tescherichia coli\tundefined",
  "10463\tmycobacterium tuberculosis\tundefined",
  "10\tescherichia coli\tundefined",
  "10\tmycobacterium tuberculosis\tundefined",
  sep = "\n"
)

第一个工作正常：

# A tibble: 1 x 4
            source                     target           cat weight
            <fctr>                     <fctr>         <chr>  <dbl>
1 escherichia coli mycobacterium tuberculosis 1046undefined      2

（不过 cat 列里多出来的数字 1046 仍然很奇怪）

第二个应该产生相同的结果（我只是将 id 从 10469 更改为 10），但是它产生了错误的结果：

# A tibble: 2 x 4
            source                     target           cat weight
            <fctr>                     <fctr>         <chr>  <dbl>
1 escherichia coli mycobacterium tuberculosis 1046undefined      1
2 escherichia coli mycobacterium tuberculosis    1undefined      1

问题似乎出在 cat 列前面残留的数字上：这些数字是簇 id 的一部分（10463/10469 剩下 1046，10 剩下 1），导致同一个类别被拆成了两组。

Answer 1

可能还有更好的方法。

这是我的尝试：

library(dplyr)
library(gtools)
library(tidyr)

# Example data: one row per (cluster.id, letter) observation together with the
# category (`cat`) it belongs to. The unnamed first column of the text becomes
# the row names because the header has one field fewer than the data rows.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
test.set <- read.table(text = "
                            cluster.id   letters  cat
                       1          5       A       pink
                       2          4       B       blue
                       3          4       B       blue
                       4          3       A       pink
                       5          3       E       pink
                       6          3       D       pink
                       7          3       C       pink
                       8          2       A       blue
                       9          2       E       blue
                       10         1       A       green
                       11         0       A       pink
                       12         0       E       pink", header = TRUE, stringsAsFactors = FALSE)

# Reshape the data into a wide presence table: one row per "cluster.id cat"
# group, one column per letter, holding 1 where the letter occurs in the group
# and NA where it does not.
x <- test.set %>%
  # Drop repeated rows first: a letter listed twice inside a multi-letter
  # cluster would otherwise make spread() abort on duplicate identifiers.
  distinct() %>%
  # Only clusters with at least two different letters can produce a pair.
  group_by(cluster.id) %>%
  filter(n_distinct(letters) > 1) %>%
  ungroup() %>%
  mutate(id = paste(cluster.id, cat),
         ind = 1) %>%
  select(-cluster.id, -cat) %>%
  spread(key = letters, value = ind) %>%
  as.data.frame()

# Move the group label into the row names so that every remaining column
# is a letter.
row.names(x) <- x$id
x$id <- NULL

# Build the undirected edge list. Each row of `x` is one "cluster.id cat"
# group; every unordered pair of letters present in that group becomes one
# edge row, and the rows are then summed into a weight per
# (source, target, cat).
edge_rows <- lapply(seq_len(nrow(x)), function(i) {
  cluster <- x[i, , drop = FALSE]
  present <- names(cluster)[which(!is.na(cluster))]
  # A group with fewer than two letters has no pair to contribute, and
  # combinations() cannot draw 2 elements from it.
  if (length(present) < 2) {
    return(NULL)
  }
  # combinations() returns each unordered pair once, so an edge appears in a
  # single orientation (A-E, never also E-A).
  comb <- combinations(n = length(present), r = 2, v = present) %>%
    as.data.frame() %>%
    rename(source = V1, target = V2)
  # Row names have the form "<cluster.id> <cat>". Remove everything up to and
  # including the FIRST space. The earlier pattern ". " removed a single
  # character only, which left id digits glued to the category for
  # multi-digit ids (e.g. "1046undefined"). Assumes the id itself contains
  # no space.
  comb$cat <- sub("^[^ ]+ ", "", row.names(cluster))
  # Weight of a pair inside one group = the smaller of the two presence
  # indicators stored in `x` for its letters.
  indicator <- unlist(cluster)
  comb$weight <- unname(pmin(indicator[as.character(comb$source)],
                             indicator[as.character(comb$target)]))
  comb
})

# Bind all groups in one step instead of growing `final` inside a loop.
final <- do.call(rbind, edge_rows)
rm(edge_rows)

if (is.null(final)) {
  # No group produced a pair: keep the expected columns so the aggregation
  # below still returns an (empty) edge list.
  final <- data.frame(source = character(0), target = character(0),
                      cat = character(0), weight = numeric(0),
                      stringsAsFactors = FALSE)
}

final <- final %>%
  group_by(source, target, cat) %>%
  summarize(weight = sum(weight)) %>%
  ungroup()

> final
# A tibble: 7 x 4
  source target   cat weight
  <fctr> <fctr> <chr>  <dbl>
1      A      E  blue      1
2      A      E  pink      2
3      A      C  pink      1
4      A      D  pink      1
5      C      E  pink      1
6      C      D  pink      1
7      D      E  pink      1

使用更新后的数据集（将cat <- sub(". ", "", dimnames(cluster)[[1]])更改为cat <- sub(".* ", "", dimnames(cluster)[[1]])）：

    # Updated example input (tab-separated): the last cluster has the short
    # id 10, the other multi-letter cluster the five-digit id 10463.
    txt <- paste(
      "cluster.id\tletters\tcat",
      "10283\tklebsiella pneumoniae\tprotein1",
      "10463\tescherichia coli\tundefined",
      "10463\tmycobacterium tuberculosis\tundefined",
      "10\tescherichia coli\tundefined",
      "10\tmycobacterium tuberculosis\tundefined",
      sep = "\n"
    )

library(dplyr)
library(gtools)
library(tidyr)
library(stringr)

# Parse the tab-separated text into a data frame; strings stay character.
# read.table() already returns a data.frame, so no further conversion is
# needed. The raw text is removed once it has been parsed.
test.set <- read.table(
  text = txt,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
rm(txt)

# Reshape the data into a wide presence table: one row per "cluster.id cat"
# group, one column per letter, holding 1 where the letter occurs in the group
# and NA where it does not.
x <- test.set %>%
  # Drop repeated rows first: a letter listed twice inside a multi-letter
  # cluster would otherwise make spread() abort on duplicate identifiers.
  distinct() %>%
  # Only clusters with at least two different letters can produce a pair.
  group_by(cluster.id) %>%
  filter(n_distinct(letters) > 1) %>%
  ungroup() %>%
  mutate(id = paste(cluster.id, cat),
         ind = 1) %>%
  select(-cluster.id, -cat) %>%
  spread(key = letters, value = ind) %>%
  as.data.frame()

# Move the group label into the row names so that every remaining column
# is a letter.
row.names(x) <- x$id
x$id <- NULL

# Build the undirected edge list. Each row of `x` is one "cluster.id cat"
# group; every unordered pair of letters present in that group becomes one
# edge row, and the rows are then summed into a weight per
# (source, target, cat).
edge_rows <- lapply(seq_len(nrow(x)), function(i) {
  cluster <- x[i, , drop = FALSE]
  present <- names(cluster)[which(!is.na(cluster))]
  # A group with fewer than two letters has no pair to contribute, and
  # combinations() cannot draw 2 elements from it.
  if (length(present) < 2) {
    return(NULL)
  }
  # combinations() returns each unordered pair once, so an edge appears in a
  # single orientation (A-E, never also E-A).
  comb <- combinations(n = length(present), r = 2, v = present) %>%
    as.data.frame() %>%
    rename(source = V1, target = V2)
  # Row names have the form "<cluster.id> <cat>". Remove everything up to and
  # including the FIRST space. The greedy pattern ".* " would run to the LAST
  # space and so cut into a category that itself contains a space. Assumes
  # the id itself contains no space.
  comb$cat <- sub("^[^ ]+ ", "", row.names(cluster))
  # Weight of a pair inside one group = the smaller of the two presence
  # indicators stored in `x` for its letters.
  indicator <- unlist(cluster)
  comb$weight <- unname(pmin(indicator[as.character(comb$source)],
                             indicator[as.character(comb$target)]))
  comb
})

# Bind all groups in one step instead of growing `final` inside a loop.
final <- do.call(rbind, edge_rows)
rm(edge_rows)

if (is.null(final)) {
  # No group produced a pair: keep the expected columns so the aggregation
  # below still returns an (empty) edge list.
  final <- data.frame(source = character(0), target = character(0),
                      cat = character(0), weight = numeric(0),
                      stringsAsFactors = FALSE)
}

final <- final %>%
  group_by(source, target, cat) %>%
  summarize(weight = sum(weight)) %>%
  ungroup()

输出：

# A tibble: 1 x 4
            source                     target       cat weight
            <fctr>                     <fctr>     <chr>  <dbl>
1 escherichia coli mycobacterium tuberculosis undefined      2

R：将分组数据转换为无向的源-目标（source/target）格式以用于网络可视化

R: grouped data to undirected source target format for network visualization

networking

r

distance