R:将分组数据转换为无向的源-目标(source/target)格式,用于网络可视化
R: grouped data to undirected source target format for network visualization
我有这个数据集:
# Example input: one row per observation of a letter in a cluster, plus the
# category (`cat`) of that observation.
# The header names three columns while every data row has four fields, so
# read.table() uses the first field of each row as the row name.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
test.set <- read.table(text = "
cluster.id letters cat
1 5 A pink
2 4 B blue
3 4 B blue
4 3 A pink
5 3 E pink
6 3 D pink
7 3 C pink
8 2 A blue
9 2 E blue
10 1 A green
11 0 A pink
12 0 E pink", header = TRUE, stringsAsFactors = FALSE)
我想知道哪些字母最终落在了哪个簇(cluster)中,同时保留它们所属的 cat。
只有包含多于 1 个不同字母的簇才相关(例如,簇 4 只包含字母 B,因此不相关)。我们先筛选出(保留)所有至少含有两个不同字母的簇:
# Step 1: annotate every row with the number of distinct letters found in its
# cluster.
x <- test.set %>%
  group_by(cluster.id) %>%
  mutate(letter.count = n_distinct(letters))
# Step 2: keep only the clusters that hold at least two different letters,
# then drop the grouping so later steps see a plain table.
x <- x %>%
  filter(letter.count > 1) %>%
  ungroup()
cluster.id letters cat letter.count
<int> <chr> <chr> <int>
1 3 A pink 4
2 3 E pink 4
3 3 D pink 4
4 3 C pink 4
5 2 A blue 2
6 2 E blue 2
7 0 A pink 2
8 0 E pink 2
然后我想据此构建一个 source/target 矩阵(或 data frame)。生成的网络是无向的,所以像 A - E 这样的关系只保留一次(不再单独列出 E - A)。结果应该是下面这个 source/target 矩阵(如果我手工整理没有出错的话 ;)):
source target weight cat
A E 2 pink
A E 1 blue
A D 1 pink
A C 1 pink
E D 1 pink
E C 1 pink
D C 1 pink
我确信有某个库(或某个非常简单的技巧)可以做到这一点,但我一直没想出来。有没有聪明或简单的办法?
半个伪代码示例:
# For each cluster in test_set:
#   1. build every unordered pair of two different letters
combinations <- expand.grid(letters, letters) %>%
  unique() %>%
  filter(Var1 != Var2)  # drop self-pairs such as (A, A)
# (A, E) and (E, A) share the same sorted key "AE"; keep the first of each.
combinations <- combinations[
  !duplicated(
    apply(combinations, 1, function(pair) paste(sort(pair), collapse = ""))
  ),
]
#   2. check whether the letter pair + cat is already in the result data frame
#      - if not, add it with weight = 1
#        (e.g. source: A, target: E, weight: 1, cat: pink)
#      - otherwise increase its weight by 1
#        (e.g. source: A, target: E, weight + 1, cat: pink)
对评论的回复
@单簧管演奏家
也许是我没有说清楚,但您的代码里有一个错误。请看下面两个数据集:
# Dataset 1: the two clusters that contain two species both have five-digit
# ids (10463 and 10469). Fields are separated by tabs.
txt <- "cluster.id\tletters\tcat
10283\tklebsiella pneumoniae\tprotein1
10463\tescherichia coli\tundefined
10463\tmycobacterium tuberculosis\tundefined
10469\tescherichia coli\tundefined
10469\tmycobacterium tuberculosis\tundefined"
# Dataset 2: the same rows, except that cluster id 10469 is replaced by 10.
# Both assignments target `txt`, so evaluating them in sequence leaves only
# this second dataset; run the analysis with one of them at a time.
txt <- "cluster.id\tletters\tcat
10283\tklebsiella pneumoniae\tprotein1
10463\tescherichia coli\tundefined
10463\tmycobacterium tuberculosis\tundefined
10\tescherichia coli\tundefined
10\tmycobacterium tuberculosis\tundefined"
第一个数据集的结果基本正确:
# A tibble: 1 x 4
source target cat weight
<fctr> <fctr> <chr> <dbl>
1 escherichia coli mycobacterium tuberculosis 1046undefined 2
(不过 cat 前面多出来的数字 1046 很奇怪)
第二个应该产生相同的结果(我只是将 id 从 10469 更改为 10),但是它产生了错误的结果:
# A tibble: 2 x 4
source target cat weight
<fctr> <fctr> <chr> <dbl>
1 escherichia coli mycobacterium tuberculosis 1046undefined 1
2 escherichia coli mycobacterium tuberculosis 1undefined 1
问题似乎出在 cat 列前面多出来的那些数字上。
可能还有更好的方法。
这是我的尝试:
library(dplyr)
library(gtools)
library(tidyr)
# Example input: one row per observation of a letter in a cluster, plus the
# category (`cat`) of that observation.
# The header names three columns while every data row has four fields, so
# read.table() uses the first field of each row as the row name.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
test.set <- read.table(text = "
cluster.id letters cat
1 5 A pink
2 4 B blue
3 4 B blue
4 3 A pink
5 3 E pink
6 3 D pink
7 3 C pink
8 2 A blue
9 2 E blue
10 1 A green
11 0 A pink
12 0 E pink", header = TRUE, stringsAsFactors = FALSE)
# Build a wide presence table: one row per "<cluster.id> <cat>" id and one
# column per letter, holding 1 where the letter occurs and NA where it does not.
x <- test.set %>%
  group_by(cluster.id) %>%
  mutate(letter.count = n_distinct(letters)) %>%
  filter(letter.count > 1) %>%  # keep clusters with >= 2 distinct letters
  select(-letter.count) %>%
  ungroup() %>%
  mutate(id = paste(cluster.id, cat),
         ind = 1) %>%
  select(-cluster.id, -cat) %>%
  distinct() %>%  # spread() fails if an (id, letter) pair occurs twice
  spread(key = letters, value = ind) %>%
  as.data.frame()
row.names(x) <- x$id
x$id <- NULL

# Turn each row of x into its unordered letter pairs. combinations() returns
# every pair once, in sorted order, so A-E is listed but E-A is not. Each pair
# gets weight 1 for the cluster it was found in.
# The per-row results are collected in a list and bound once, instead of
# growing a data frame with rbind() inside a loop.
final <- lapply(seq_len(nrow(x)), function(i) {
  cluster <- x[i, , drop = FALSE]
  present <- names(cluster)[which(!is.na(cluster))]
  # A cluster split over several cats can leave an id with a single letter;
  # combinations() would stop with an error for n < r, so skip such rows.
  if (length(present) < 2L) {
    return(NULL)
  }
  comb <- combinations(n = length(present), r = 2, v = present) %>%
    as.data.frame() %>%
    rename(source = V1, target = V2)
  # Recover cat by removing the leading "<cluster.id> " from the row name.
  # The pattern is anchored and stops at the first space: the former ". "
  # removed only one character (leaving e.g. "1046undefined"), and a greedy
  # ".* " would also remove words from a cat that itself contains spaces.
  comb$cat <- sub("^[^ ]+ ", "", row.names(cluster))
  comb$weight <- 1
  comb
})
final <- do.call(rbind, final)

# Add up the per-cluster contributions so that every (source, target, cat)
# edge appears once with its total weight.
final <- final %>%
  group_by(source, target, cat) %>%
  summarize(weight = sum(weight)) %>%
  ungroup()
> final
# A tibble: 7 x 4
source target cat weight
<fctr> <fctr> <chr> <dbl>
1 A E blue 1
2 A E pink 2
3 A C pink 1
4 A D pink 1
5 C E pink 1
6 C D pink 1
7 D E pink 1
使用更新后的数据集时,需要把 cat <- sub(". ", "", dimnames(cluster)[[1]]) 改为 cat <- sub(".* ", "", dimnames(cluster)[[1]])(原来的 ". " 只会去掉空格前的一个字符,多位数的 cluster.id 会残留在 cat 里):
# Updated example data: tab-separated fields, multi-word `letters` values and
# cluster ids of different lengths (10463 and 10).
txt <- "cluster.id\tletters\tcat
10283\tklebsiella pneumoniae\tprotein1
10463\tescherichia coli\tundefined
10463\tmycobacterium tuberculosis\tundefined
10\tescherichia coli\tundefined
10\tmycobacterium tuberculosis\tundefined"
library(dplyr)
library(gtools)
library(tidyr)
library(stringr)
# Parse the tab-separated example. The `letters` values contain spaces, so the
# separator has to be given explicitly. read.table() already returns a data
# frame, so no further conversion is needed.
test.set <- read.table(text = txt, header = TRUE, sep = "\t",
                       stringsAsFactors = FALSE)
rm(txt)

# Build a wide presence table: one row per "<cluster.id> <cat>" id and one
# column per letter, holding 1 where the letter occurs and NA where it does not.
x <- test.set %>%
  group_by(cluster.id) %>%
  mutate(letter.count = n_distinct(letters)) %>%
  filter(letter.count > 1) %>%  # keep clusters with >= 2 distinct letters
  select(-letter.count) %>%
  ungroup() %>%
  mutate(id = paste(cluster.id, cat),
         ind = 1) %>%
  select(-cluster.id, -cat) %>%
  distinct() %>%  # spread() fails if an (id, letter) pair occurs twice
  spread(key = letters, value = ind) %>%
  as.data.frame()
row.names(x) <- x$id
x$id <- NULL

# Turn each row of x into its unordered letter pairs. combinations() returns
# every pair once, in sorted order. Each pair gets weight 1 for the cluster it
# was found in.
# The per-row results are collected in a list and bound once, instead of
# growing a data frame with rbind() inside a loop.
final <- lapply(seq_len(nrow(x)), function(i) {
  cluster <- x[i, , drop = FALSE]
  present <- names(cluster)[which(!is.na(cluster))]
  # A cluster split over several cats can leave an id with a single letter;
  # combinations() would stop with an error for n < r, so skip such rows.
  if (length(present) < 2L) {
    return(NULL)
  }
  comb <- combinations(n = length(present), r = 2, v = present) %>%
    as.data.frame() %>%
    rename(source = V1, target = V2)
  # Recover cat by removing the leading "<cluster.id> " from the row name.
  # The pattern is anchored and stops at the first space, so ids of any length
  # are removed completely, and a cat that itself contains spaces is kept
  # whole (a greedy ".* " would cut it down to its last word).
  comb$cat <- sub("^[^ ]+ ", "", row.names(cluster))
  comb$weight <- 1
  comb
})
final <- do.call(rbind, final)

# Add up the per-cluster contributions so that every (source, target, cat)
# edge appears once with its total weight.
final <- final %>%
  group_by(source, target, cat) %>%
  summarize(weight = sum(weight)) %>%
  ungroup()
输出:
# A tibble: 1 x 4
source target cat weight
<fctr> <fctr> <chr> <dbl>
1 escherichia coli mycobacterium tuberculosis undefined 2
我有这个数据集:
# Example input: one row per observation of a letter in a cluster, plus the
# category (`cat`) of that observation.
# The header names three columns while every data row has four fields, so
# read.table() uses the first field of each row as the row name.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
test.set <- read.table(text = "
cluster.id letters cat
1 5 A pink
2 4 B blue
3 4 B blue
4 3 A pink
5 3 E pink
6 3 D pink
7 3 C pink
8 2 A blue
9 2 E blue
10 1 A green
11 0 A pink
12 0 E pink", header = TRUE, stringsAsFactors = FALSE)
我想知道哪些字母最终落在了哪个簇(cluster)中,同时保留它们所属的 cat。
只有包含多于 1 个不同字母的簇才相关(例如,簇 4 只包含字母 B,因此不相关)。我们先筛选出(保留)所有至少含有两个不同字母的簇:
# Step 1: annotate every row with the number of distinct letters found in its
# cluster.
x <- test.set %>%
  group_by(cluster.id) %>%
  mutate(letter.count = n_distinct(letters))
# Step 2: keep only the clusters that hold at least two different letters,
# then drop the grouping so later steps see a plain table.
x <- x %>%
  filter(letter.count > 1) %>%
  ungroup()
cluster.id letters cat letter.count
<int> <chr> <chr> <int>
1 3 A pink 4
2 3 E pink 4
3 3 D pink 4
4 3 C pink 4
5 2 A blue 2
6 2 E blue 2
7 0 A pink 2
8 0 E pink 2
然后我想据此构建一个 source/target 矩阵(或 data frame)。生成的网络是无向的,所以像 A - E 这样的关系只保留一次(不再单独列出 E - A)。结果应该是下面这个 source/target 矩阵(如果我手工整理没有出错的话 ;)):
source target weight cat
A E 2 pink
A E 1 blue
A D 1 pink
A C 1 pink
E D 1 pink
E C 1 pink
D C 1 pink
我确信有某个库(或某个非常简单的技巧)可以做到这一点,但我一直没想出来。有没有聪明或简单的办法?
半个伪代码示例:
# For each cluster in test_set:
#   1. build every unordered pair of two different letters
combinations <- expand.grid(letters, letters) %>%
  unique() %>%
  filter(Var1 != Var2)  # drop self-pairs such as (A, A)
# (A, E) and (E, A) share the same sorted key "AE"; keep the first of each.
combinations <- combinations[
  !duplicated(
    apply(combinations, 1, function(pair) paste(sort(pair), collapse = ""))
  ),
]
#   2. check whether the letter pair + cat is already in the result data frame
#      - if not, add it with weight = 1
#        (e.g. source: A, target: E, weight: 1, cat: pink)
#      - otherwise increase its weight by 1
#        (e.g. source: A, target: E, weight + 1, cat: pink)
对评论的回复
@单簧管演奏家
也许是我没有说清楚,但您的代码里有一个错误。请看下面两个数据集:
# Dataset 1: the two clusters that contain two species both have five-digit
# ids (10463 and 10469). Fields are separated by tabs.
txt <- "cluster.id\tletters\tcat
10283\tklebsiella pneumoniae\tprotein1
10463\tescherichia coli\tundefined
10463\tmycobacterium tuberculosis\tundefined
10469\tescherichia coli\tundefined
10469\tmycobacterium tuberculosis\tundefined"
# Dataset 2: the same rows, except that cluster id 10469 is replaced by 10.
# Both assignments target `txt`, so evaluating them in sequence leaves only
# this second dataset; run the analysis with one of them at a time.
txt <- "cluster.id\tletters\tcat
10283\tklebsiella pneumoniae\tprotein1
10463\tescherichia coli\tundefined
10463\tmycobacterium tuberculosis\tundefined
10\tescherichia coli\tundefined
10\tmycobacterium tuberculosis\tundefined"
第一个数据集的结果基本正确:
# A tibble: 1 x 4
source target cat weight
<fctr> <fctr> <chr> <dbl>
1 escherichia coli mycobacterium tuberculosis 1046undefined 2
(不过 cat 前面多出来的数字 1046 很奇怪)
第二个应该产生相同的结果(我只是将 id 从 10469 更改为 10),但是它产生了错误的结果:
# A tibble: 2 x 4
source target cat weight
<fctr> <fctr> <chr> <dbl>
1 escherichia coli mycobacterium tuberculosis 1046undefined 1
2 escherichia coli mycobacterium tuberculosis 1undefined 1
问题似乎出在 cat 列前面多出来的那些数字上。
可能还有更好的方法。
这是我的尝试:
library(dplyr)
library(gtools)
library(tidyr)
# Example input: one row per observation of a letter in a cluster, plus the
# category (`cat`) of that observation.
# The header names three columns while every data row has four fields, so
# read.table() uses the first field of each row as the row name.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
test.set <- read.table(text = "
cluster.id letters cat
1 5 A pink
2 4 B blue
3 4 B blue
4 3 A pink
5 3 E pink
6 3 D pink
7 3 C pink
8 2 A blue
9 2 E blue
10 1 A green
11 0 A pink
12 0 E pink", header = TRUE, stringsAsFactors = FALSE)
# Build a wide presence table: one row per "<cluster.id> <cat>" id and one
# column per letter, holding 1 where the letter occurs and NA where it does not.
x <- test.set %>%
  group_by(cluster.id) %>%
  mutate(letter.count = n_distinct(letters)) %>%
  filter(letter.count > 1) %>%  # keep clusters with >= 2 distinct letters
  select(-letter.count) %>%
  ungroup() %>%
  mutate(id = paste(cluster.id, cat),
         ind = 1) %>%
  select(-cluster.id, -cat) %>%
  distinct() %>%  # spread() fails if an (id, letter) pair occurs twice
  spread(key = letters, value = ind) %>%
  as.data.frame()
row.names(x) <- x$id
x$id <- NULL

# Turn each row of x into its unordered letter pairs. combinations() returns
# every pair once, in sorted order, so A-E is listed but E-A is not. Each pair
# gets weight 1 for the cluster it was found in.
# The per-row results are collected in a list and bound once, instead of
# growing a data frame with rbind() inside a loop.
final <- lapply(seq_len(nrow(x)), function(i) {
  cluster <- x[i, , drop = FALSE]
  present <- names(cluster)[which(!is.na(cluster))]
  # A cluster split over several cats can leave an id with a single letter;
  # combinations() would stop with an error for n < r, so skip such rows.
  if (length(present) < 2L) {
    return(NULL)
  }
  comb <- combinations(n = length(present), r = 2, v = present) %>%
    as.data.frame() %>%
    rename(source = V1, target = V2)
  # Recover cat by removing the leading "<cluster.id> " from the row name.
  # The pattern is anchored and stops at the first space: the former ". "
  # removed only one character (leaving e.g. "1046undefined"), and a greedy
  # ".* " would also remove words from a cat that itself contains spaces.
  comb$cat <- sub("^[^ ]+ ", "", row.names(cluster))
  comb$weight <- 1
  comb
})
final <- do.call(rbind, final)

# Add up the per-cluster contributions so that every (source, target, cat)
# edge appears once with its total weight.
final <- final %>%
  group_by(source, target, cat) %>%
  summarize(weight = sum(weight)) %>%
  ungroup()
> final
# A tibble: 7 x 4
source target cat weight
<fctr> <fctr> <chr> <dbl>
1 A E blue 1
2 A E pink 2
3 A C pink 1
4 A D pink 1
5 C E pink 1
6 C D pink 1
7 D E pink 1
使用更新后的数据集时,需要把 cat <- sub(". ", "", dimnames(cluster)[[1]]) 改为 cat <- sub(".* ", "", dimnames(cluster)[[1]])(原来的 ". " 只会去掉空格前的一个字符,多位数的 cluster.id 会残留在 cat 里):
# Updated example data: tab-separated fields, multi-word `letters` values and
# cluster ids of different lengths (10463 and 10).
txt <- "cluster.id\tletters\tcat
10283\tklebsiella pneumoniae\tprotein1
10463\tescherichia coli\tundefined
10463\tmycobacterium tuberculosis\tundefined
10\tescherichia coli\tundefined
10\tmycobacterium tuberculosis\tundefined"
library(dplyr)
library(gtools)
library(tidyr)
library(stringr)
# Parse the tab-separated example. The `letters` values contain spaces, so the
# separator has to be given explicitly. read.table() already returns a data
# frame, so no further conversion is needed.
test.set <- read.table(text = txt, header = TRUE, sep = "\t",
                       stringsAsFactors = FALSE)
rm(txt)

# Build a wide presence table: one row per "<cluster.id> <cat>" id and one
# column per letter, holding 1 where the letter occurs and NA where it does not.
x <- test.set %>%
  group_by(cluster.id) %>%
  mutate(letter.count = n_distinct(letters)) %>%
  filter(letter.count > 1) %>%  # keep clusters with >= 2 distinct letters
  select(-letter.count) %>%
  ungroup() %>%
  mutate(id = paste(cluster.id, cat),
         ind = 1) %>%
  select(-cluster.id, -cat) %>%
  distinct() %>%  # spread() fails if an (id, letter) pair occurs twice
  spread(key = letters, value = ind) %>%
  as.data.frame()
row.names(x) <- x$id
x$id <- NULL

# Turn each row of x into its unordered letter pairs. combinations() returns
# every pair once, in sorted order. Each pair gets weight 1 for the cluster it
# was found in.
# The per-row results are collected in a list and bound once, instead of
# growing a data frame with rbind() inside a loop.
final <- lapply(seq_len(nrow(x)), function(i) {
  cluster <- x[i, , drop = FALSE]
  present <- names(cluster)[which(!is.na(cluster))]
  # A cluster split over several cats can leave an id with a single letter;
  # combinations() would stop with an error for n < r, so skip such rows.
  if (length(present) < 2L) {
    return(NULL)
  }
  comb <- combinations(n = length(present), r = 2, v = present) %>%
    as.data.frame() %>%
    rename(source = V1, target = V2)
  # Recover cat by removing the leading "<cluster.id> " from the row name.
  # The pattern is anchored and stops at the first space, so ids of any length
  # are removed completely, and a cat that itself contains spaces is kept
  # whole (a greedy ".* " would cut it down to its last word).
  comb$cat <- sub("^[^ ]+ ", "", row.names(cluster))
  comb$weight <- 1
  comb
})
final <- do.call(rbind, final)

# Add up the per-cluster contributions so that every (source, target, cat)
# edge appears once with its total weight.
final <- final %>%
  group_by(source, target, cat) %>%
  summarize(weight = sum(weight)) %>%
  ungroup()
输出:
# A tibble: 1 x 4
source target cat weight
<fctr> <fctr> <chr> <dbl>
1 escherichia coli mycobacterium tuberculosis undefined 2