在 R Studio 中将 Presence/Absence 矩阵转换为聚类分析
Turning a Presence/Absence Matrix into a Cluster Analysis in R Studio
好的,我有一个包含 6 个样本的 presence/absence(存在/缺失)矩阵,每个样本有 25 个 presence/absence 变量。
我已经能够用这些数据绘制聚类树状图,但我更希望把距离矩阵绘制成更美观、更易于分析的图(也许是聚类图或类似的图?)。
我真的很难弄清楚下一步该怎么做——我花了几天时间在本站和 Google 上反复搜索,但一无所获!
这是我为聚类树状图获得的代码:
# Hierarchical clustering of the samples, drawn as a dendrogram.
# Read the presence/absence table: one row per sample, with the sample label
# in the first column (Sample) and one 0/1 column per feature.
matrix <- read.csv("Horizontal.csv")
# Drop the Sample label column before computing distances: it is not numeric,
# so dist() would coerce it to NA (with a warning) and rescale every distance.
# For presence/absence data, dist(..., method = "binary") is an alternative
# to the default Euclidean distance used here.
distance <- dist(matrix[, -1])
hc.m <- hclust(distance)
# Label the leaves with the sample names instead of row numbers.
plot(
  hc.m,
  labels = as.character(matrix$Sample),
  main = "",
  cex.main = 0.8,
  cex.lab = 1.1
)
求助!
> dput(head(matrix,20))
structure(list(Sample = structure(1:6, .Label = c("CL1", "CL2",
"CL3", "COL1", "COL2", "COL3"), class = "factor"), X = c(0L,
0L, 0L, 1L, 1L, 1L), X.1 = c(1L, 0L, 0L, 1L, 1L, 1L), X.2 = c(1L,
1L, 1L, 0L, 0L, 0L), X.3 = c(1L, 1L, 1L, 1L, 1L, 1L), X.4 = c(1L,
1L, 1L, 0L, 0L, 0L), X.5 = c(0L, 0L, 0L, 1L, 1L, 0L), X.6 = c(1L,
1L, 1L, 1L, 1L, 1L), X.7 = c(1L, 1L, 1L, 1L, 1L, 1L), X.8 = c(0L,
0L, 0L, 1L, 1L, 1L), X.9 = c(0L, 0L, 0L, 1L, 1L, 1L), X.10 = c(1L,
1L, 1L, 1L, 1L, 1L), X.11 = c(1L, 1L, 1L, 1L, 1L, 1L), X.12 = c(1L,
1L, 1L, 1L, 1L, 1L), X.13 = c(1L, 0L, 0L, 0L, 0L, 0L), X.14 = c(0L,
0L, 0L, 1L, 1L, 1L), X.15 = c(0L, 0L, 0L, 1L, 1L, 1L), X.16 = c(1L,
1L, 1L, 1L, 0L, 0L), X.17 = c(1L, 1L, 1L, 1L, 1L, 1L), X.18 = c(1L,
1L, 1L, 1L, 1L, 1L), X.19 = c(1L, 1L, 1L, 1L, 1L, 1L), X.20 = c(1L,
1L, 1L, 1L, 1L, 1L), X.21 = c(1L, 1L, 1L, 1L, 0L, 0L), X.22 = c(0L,
0L, 0L, 0L, 1L, 1L), X.23 = c(1L, 1L, 1L, 1L, 1L, 1L), X.24 = c(0L,
1L, 1L, 1L, 1L, 1L)), .Names = c("Sample", "X", "X.1", "X.2",
"X.3", "X.4", "X.5", "X.6", "X.7", "X.8", "X.9", "X.10", "X.11",
"X.12", "X.13", "X.14", "X.15", "X.16", "X.17", "X.18", "X.19",
"X.20", "X.21", "X.22", "X.23", "X.24"), row.names = c(NA, 6L
), class = "data.frame")
可以使用此代码:
library(vegan)
library(ggplot2)
library(tidyverse)
library(MASS)
# Attempted script: read the table, compute a distance matrix, run classical
# MDS and plot the result. Several intermediate results below are computed
# but never assigned, so they are only printed and have no lasting effect.
# Set the working directory (path is specific to the author's machine).
setwd("~/Documents/Masters/BS707/Metagenomics")
# Read the presence/absence table from the CSV file.
cookie<-read.csv("Horizontal.csv")
# Builds a copy of cookie with the sample names as row names; the result is
# not assigned, so cookie itself is unchanged.
data.frame(cookie, row.names = c("CL1", "CL2", "CL3", "COL1", "COL2", "COL3"))
# subset() is called with no subset/select arguments, so df is a copy of
# cookie.
df = subset(cookie)
# Same as above: the result is not assigned, so df is unchanged.
data.frame(df, row.names = c("CL1", "CL2", "CL3", "COL1", "COL2", "COL3"))
# NOTE(review): df still contains the Sample label column at this point;
# presumably it should be dropped before computing distances -- confirm.
dm<- dist(df, method = "binary") #calculate the distance matrix
# Classical MDS in two dimensions; right-assignment stores the result in mds.
cmdscale(dm, eig = TRUE, k=2) -> mds
# Result is not assigned, so the coordinates are only printed.
as.tibble(mds$points) #mds coordinates
# Binds a Sample column onto df (not onto the MDS coordinates); the result
# is not assigned.
bind_cols(df, Sample = df$Sample) #bind sample names
# gsub() is applied to the literal string "Sample1", not to the Sample
# column, and the mutate() result is not assigned.
# NOTE(review): "\d" is not a valid escape in an R string literal; the
# pattern was presumably meant to be written "\\d$" -- confirm.
mutate(df,group = gsub("\d$", "", "Sample1"))#remove last digit from sample names to form groups
# The aesthetics are quoted strings, so they are constants rather than
# column references; no V1, V2 or group column is added to df above.
ggplot(df)+
geom_point (aes(x = "V1",y = "V2", color = "group")) #plot
# Plots the two MDS coordinates (V1, V2) directly, without any grouping
# colour.
as.tibble(mds$points) %>% ggplot() + geom_point (aes(x = V1, y = V2))
我得到了图,但每个组都被命名为 'Sample',而不是 CL1、CL2、CL3、COL1、COL2、COL3。我不得不删除 %>%,因为我的 R 无法将其识别为命令,每次都会报错(改成 + 或直接删除后就能正常运行)。
这是一种在二维中可视化数据的方法:
library(tidyverse)
# Classical MDS (also known as principal coordinates analysis) of the
# presence/absence data, plotted in two dimensions and coloured by group.
# The first column of df (Sample) holds labels, so it is dropped before the
# binary distance matrix is computed.
mds <- df %>%
  dplyr::select(-1) %>%
  dist(method = "binary") %>%
  cmdscale(eig = TRUE, k = 2)

# cmdscale() returns the coordinates without column names; name them
# explicitly instead of relying on as_tibble()'s legacy name repair.
mds_points <- mds$points
colnames(mds_points) <- c("V1", "V2")

# "\\d$" (backslash escaped inside the R string) strips the trailing digit
# from each sample name, so CL1..CL3 become "CL" and COL1..COL3 become "COL".
as_tibble(mds_points) %>%
  bind_cols(Sample = df$Sample) %>%
  mutate(group = gsub("\\d$", "", Sample)) %>%
  ggplot() +
  geom_point(aes(x = V1, y = V2, color = group))
或没有 tidyverse:
# The same MDS plot without the tidyverse pipeline (ggplot2 is still needed
# for the plot itself).
# Binary distance between samples, excluding the Sample label column.
df_dist <- dist(df[, -1], method = "binary")
mds <- cmdscale(df_dist, eig = TRUE, k = 2)
# data.frame() names the two unnamed coordinate columns X1 and X2.
# "\\d$" (backslash escaped inside the R string) strips the trailing digit
# from each sample name, so CL1..CL3 become "CL" and COL1..COL3 become "COL".
for_plot <- data.frame(mds$points, group = gsub("\\d$", "", df$Sample))
ggplot(for_plot) +
  geom_point(aes(x = X1, y = X2, color = group))
其他选项包括:使用 MASS 库中的 isoMDS 执行 Kruskal 非度量多维缩放(non-metric MDS);或使用 vegan 库中的 metaMDS,它同样执行非度量多维缩放,通过多次随机起点寻找稳定解,并对结果进行轴缩放、添加物种得分。
好的,我有一个包含 6 个样本的 presence/absence(存在/缺失)矩阵,每个样本有 25 个 presence/absence 变量。
我已经能够用这些数据绘制聚类树状图,但我更希望把距离矩阵绘制成更美观、更易于分析的图(也许是聚类图或类似的图?)。
我真的很难弄清楚下一步该怎么做——我花了几天时间在本站和 Google 上反复搜索,但一无所获!
这是我为聚类树状图获得的代码:
# Hierarchical clustering of the samples, drawn as a dendrogram.
# Read the presence/absence table: one row per sample, with the sample label
# in the first column (Sample) and one 0/1 column per feature.
matrix <- read.csv("Horizontal.csv")
# Drop the Sample label column before computing distances: it is not numeric,
# so dist() would coerce it to NA (with a warning) and rescale every distance.
# For presence/absence data, dist(..., method = "binary") is an alternative
# to the default Euclidean distance used here.
distance <- dist(matrix[, -1])
hc.m <- hclust(distance)
# Label the leaves with the sample names instead of row numbers.
plot(
  hc.m,
  labels = as.character(matrix$Sample),
  main = "",
  cex.main = 0.8,
  cex.lab = 1.1
)
求助!
> dput(head(matrix,20))
structure(list(Sample = structure(1:6, .Label = c("CL1", "CL2",
"CL3", "COL1", "COL2", "COL3"), class = "factor"), X = c(0L,
0L, 0L, 1L, 1L, 1L), X.1 = c(1L, 0L, 0L, 1L, 1L, 1L), X.2 = c(1L,
1L, 1L, 0L, 0L, 0L), X.3 = c(1L, 1L, 1L, 1L, 1L, 1L), X.4 = c(1L,
1L, 1L, 0L, 0L, 0L), X.5 = c(0L, 0L, 0L, 1L, 1L, 0L), X.6 = c(1L,
1L, 1L, 1L, 1L, 1L), X.7 = c(1L, 1L, 1L, 1L, 1L, 1L), X.8 = c(0L,
0L, 0L, 1L, 1L, 1L), X.9 = c(0L, 0L, 0L, 1L, 1L, 1L), X.10 = c(1L,
1L, 1L, 1L, 1L, 1L), X.11 = c(1L, 1L, 1L, 1L, 1L, 1L), X.12 = c(1L,
1L, 1L, 1L, 1L, 1L), X.13 = c(1L, 0L, 0L, 0L, 0L, 0L), X.14 = c(0L,
0L, 0L, 1L, 1L, 1L), X.15 = c(0L, 0L, 0L, 1L, 1L, 1L), X.16 = c(1L,
1L, 1L, 1L, 0L, 0L), X.17 = c(1L, 1L, 1L, 1L, 1L, 1L), X.18 = c(1L,
1L, 1L, 1L, 1L, 1L), X.19 = c(1L, 1L, 1L, 1L, 1L, 1L), X.20 = c(1L,
1L, 1L, 1L, 1L, 1L), X.21 = c(1L, 1L, 1L, 1L, 0L, 0L), X.22 = c(0L,
0L, 0L, 0L, 1L, 1L), X.23 = c(1L, 1L, 1L, 1L, 1L, 1L), X.24 = c(0L,
1L, 1L, 1L, 1L, 1L)), .Names = c("Sample", "X", "X.1", "X.2",
"X.3", "X.4", "X.5", "X.6", "X.7", "X.8", "X.9", "X.10", "X.11",
"X.12", "X.13", "X.14", "X.15", "X.16", "X.17", "X.18", "X.19",
"X.20", "X.21", "X.22", "X.23", "X.24"), row.names = c(NA, 6L
), class = "data.frame")
可以使用此代码:
library(vegan)
library(ggplot2)
library(tidyverse)
library(MASS)
# Attempted script: read the table, compute a distance matrix, run classical
# MDS and plot the result. Several intermediate results below are computed
# but never assigned, so they are only printed and have no lasting effect.
# Set the working directory (path is specific to the author's machine).
setwd("~/Documents/Masters/BS707/Metagenomics")
# Read the presence/absence table from the CSV file.
cookie<-read.csv("Horizontal.csv")
# Builds a copy of cookie with the sample names as row names; the result is
# not assigned, so cookie itself is unchanged.
data.frame(cookie, row.names = c("CL1", "CL2", "CL3", "COL1", "COL2", "COL3"))
# subset() is called with no subset/select arguments, so df is a copy of
# cookie.
df = subset(cookie)
# Same as above: the result is not assigned, so df is unchanged.
data.frame(df, row.names = c("CL1", "CL2", "CL3", "COL1", "COL2", "COL3"))
# NOTE(review): df still contains the Sample label column at this point;
# presumably it should be dropped before computing distances -- confirm.
dm<- dist(df, method = "binary") #calculate the distance matrix
# Classical MDS in two dimensions; right-assignment stores the result in mds.
cmdscale(dm, eig = TRUE, k=2) -> mds
# Result is not assigned, so the coordinates are only printed.
as.tibble(mds$points) #mds coordinates
# Binds a Sample column onto df (not onto the MDS coordinates); the result
# is not assigned.
bind_cols(df, Sample = df$Sample) #bind sample names
# gsub() is applied to the literal string "Sample1", not to the Sample
# column, and the mutate() result is not assigned.
# NOTE(review): "\d" is not a valid escape in an R string literal; the
# pattern was presumably meant to be written "\\d$" -- confirm.
mutate(df,group = gsub("\d$", "", "Sample1"))#remove last digit from sample names to form groups
# The aesthetics are quoted strings, so they are constants rather than
# column references; no V1, V2 or group column is added to df above.
ggplot(df)+
geom_point (aes(x = "V1",y = "V2", color = "group")) #plot
# Plots the two MDS coordinates (V1, V2) directly, without any grouping
# colour.
as.tibble(mds$points) %>% ggplot() + geom_point (aes(x = V1, y = V2))
我得到了图,但每个组都被命名为 'Sample',而不是 CL1、CL2、CL3、COL1、COL2、COL3。我不得不删除 %>%,因为我的 R 无法将其识别为命令,每次都会报错(改成 + 或直接删除后就能正常运行)。
这是一种在二维中可视化数据的方法:
library(tidyverse)
# Classical MDS (also known as principal coordinates analysis) of the
# presence/absence data, plotted in two dimensions and coloured by group.
# The first column of df (Sample) holds labels, so it is dropped before the
# binary distance matrix is computed.
mds <- df %>%
  dplyr::select(-1) %>%
  dist(method = "binary") %>%
  cmdscale(eig = TRUE, k = 2)

# cmdscale() returns the coordinates without column names; name them
# explicitly instead of relying on as_tibble()'s legacy name repair.
mds_points <- mds$points
colnames(mds_points) <- c("V1", "V2")

# "\\d$" (backslash escaped inside the R string) strips the trailing digit
# from each sample name, so CL1..CL3 become "CL" and COL1..COL3 become "COL".
as_tibble(mds_points) %>%
  bind_cols(Sample = df$Sample) %>%
  mutate(group = gsub("\\d$", "", Sample)) %>%
  ggplot() +
  geom_point(aes(x = V1, y = V2, color = group))
或没有 tidyverse:
# The same MDS plot without the tidyverse pipeline (ggplot2 is still needed
# for the plot itself).
# Binary distance between samples, excluding the Sample label column.
df_dist <- dist(df[, -1], method = "binary")
mds <- cmdscale(df_dist, eig = TRUE, k = 2)
# data.frame() names the two unnamed coordinate columns X1 and X2.
# "\\d$" (backslash escaped inside the R string) strips the trailing digit
# from each sample name, so CL1..CL3 become "CL" and COL1..COL3 become "COL".
for_plot <- data.frame(mds$points, group = gsub("\\d$", "", df$Sample))
ggplot(for_plot) +
  geom_point(aes(x = X1, y = X2, color = group))
其他选项包括:使用 MASS 库中的 isoMDS 执行 Kruskal 非度量多维缩放(non-metric MDS);或使用 vegan 库中的 metaMDS,它同样执行非度量多维缩放,通过多次随机起点寻找稳定解,并对结果进行轴缩放、添加物种得分。