在 R Studio 中将 Presence/Absence 矩阵转换为聚类分析

Question

我有一个包含 6 个样本的 presence/absence（存在/缺失）矩阵，每个样本有 25 个取值为 0 或 1 的变量。

我已经能够用这些数据绘制聚类树状图，但我更希望把它绘制成更美观、更易于分析的距离矩阵图（也许是聚类图或类似的图？）。

我实在弄不清楚下一步该怎么做——我花了几天时间在本站和 Google 上反复搜索，但一无所获！

这是我用来绘制聚类树状图的代码：

# Read the presence/absence table: one row per sample, a Sample column
# followed by the 0/1 columns.
matrix <- read.csv("Horizontal.csv")

# Pairwise distances between the rows, using dist()'s default method.
sample_dist <- dist(matrix)

# Hierarchical clustering of the samples, using hclust()'s default linkage.
hc_tree <- hclust(sample_dist)

# Draw the dendrogram with the leaves labelled by sample name and no title.
plot(
  hc_tree,
  labels = matrix$Sample,
  main = "",
  cex.main = 0.8,
  cex.lab = 1.1
)

求助！

> dput(head(matrix,20))
structure(list(Sample = structure(1:6, .Label = c("CL1", "CL2", 
"CL3", "COL1", "COL2", "COL3"), class = "factor"), X = c(0L, 
0L, 0L, 1L, 1L, 1L), X.1 = c(1L, 0L, 0L, 1L, 1L, 1L), X.2 = c(1L, 
1L, 1L, 0L, 0L, 0L), X.3 = c(1L, 1L, 1L, 1L, 1L, 1L), X.4 = c(1L, 
1L, 1L, 0L, 0L, 0L), X.5 = c(0L, 0L, 0L, 1L, 1L, 0L), X.6 = c(1L, 
1L, 1L, 1L, 1L, 1L), X.7 = c(1L, 1L, 1L, 1L, 1L, 1L), X.8 = c(0L, 
0L, 0L, 1L, 1L, 1L), X.9 = c(0L, 0L, 0L, 1L, 1L, 1L), X.10 = c(1L, 
1L, 1L, 1L, 1L, 1L), X.11 = c(1L, 1L, 1L, 1L, 1L, 1L), X.12 = c(1L, 
1L, 1L, 1L, 1L, 1L), X.13 = c(1L, 0L, 0L, 0L, 0L, 0L), X.14 = c(0L, 
0L, 0L, 1L, 1L, 1L), X.15 = c(0L, 0L, 0L, 1L, 1L, 1L), X.16 = c(1L, 
1L, 1L, 1L, 0L, 0L), X.17 = c(1L, 1L, 1L, 1L, 1L, 1L), X.18 = c(1L, 
1L, 1L, 1L, 1L, 1L), X.19 = c(1L, 1L, 1L, 1L, 1L, 1L), X.20 = c(1L, 
1L, 1L, 1L, 1L, 1L), X.21 = c(1L, 1L, 1L, 1L, 0L, 0L), X.22 = c(0L, 
0L, 0L, 0L, 1L, 1L), X.23 = c(1L, 1L, 1L, 1L, 1L, 1L), X.24 = c(0L, 
1L, 1L, 1L, 1L, 1L)), .Names = c("Sample", "X", "X.1", "X.2", 
"X.3", "X.4", "X.5", "X.6", "X.7", "X.8", "X.9", "X.10", "X.11", 
"X.12", "X.13", "X.14", "X.15", "X.16", "X.17", "X.18", "X.19", 
"X.20", "X.21", "X.22", "X.23", "X.24"), row.names = c(NA, 6L
), class = "data.frame")

可以使用此代码：

library(vegan)
library(ggplot2)
library(tidyverse)
library(MASS)
# Asker's attempt at an MDS plot of the presence/absence data.
# Most intermediate results below are printed but never assigned, so `df`
# is never extended with the MDS coordinates or a group column.
#set working directory
setwd("~/Documents/Masters/BS707/Metagenomics")
#read csv file
cookie<-read.csv("Horizontal.csv")
# The result of data.frame() is not assigned, so `cookie` keeps its
# original row names.
data.frame(cookie, row.names = c("CL1", "CL2", "CL3", "COL1", "COL2", "COL3"))
# subset() is called without a condition or column selection, so `df` has
# the same rows and columns as `cookie`.
df = subset(cookie)
# Again not assigned: `df` is left unchanged.
data.frame(df, row.names = c("CL1", "CL2", "CL3", "COL1", "COL2", "COL3"))
# NOTE(review): `df` still contains the non-numeric Sample column here.
dm<- dist(df, method = "binary")  #calculate the distance matrix
# Classical MDS in two dimensions; the result is stored in `mds`.
cmdscale(dm, eig = TRUE, k=2) -> mds
# Printed only; the coordinates are not stored anywhere.
as.tibble(mds$points)  #mds coordinates
# Printed only; binds columns of `df` itself rather than the MDS coordinates.
bind_cols(df, Sample = df$Sample)  #bind sample names  
# gsub() is applied to the literal string "Sample1", not to the Sample column,
# so every row would get the same value "Sample"; the result is not assigned.
# NOTE(review): "\d" is not a valid escape in an R string literal; the pattern
# was presumably "\\d$".
mutate(df,group = gsub("\d$", "", "Sample1"))#remove last digit from   sample names to form groups
# The aesthetics are quoted strings, i.e. constants rather than column
# references, and `df` was never given V1, V2 or group columns.
ggplot(df)+
 geom_point (aes(x = "V1",y = "V2", color = "group")) #plot
# Plots the MDS coordinates alone, with no sample or group information.
as.tibble(mds$points) %>% ggplot() + geom_point (aes(x = V1, y = V2))

我得到了图，但每个组都被命名为 'Sample'，而不是 CL1、CL2、CL3、COL1、COL2、COL3。我不得不删除 %>%，因为我的 R 无法识别它，每次都会报错（换成 + 或直接删除后就能正常运行）。

Answer 1

这是一种在二维中可视化数据的方法：

library(tidyverse)

# Classical multidimensional scaling (also known as principal coordinates
# analysis) of the samples, based on the "binary" distance between their
# presence/absence profiles.
mds <- df %>%
  dplyr::select(-1) %>% # drop the first column (Sample); keep the 0/1 columns
  dist(method = "binary") %>% # pairwise distance matrix between samples
  cmdscale(eig = TRUE, k = 2) # two MDS dimensions

# cmdscale() returns the coordinates as an unnamed matrix; name the columns
# explicitly instead of relying on tibble's compatibility name repair.
mds_points <- mds$points
colnames(mds_points) <- c("V1", "V2")

as_tibble(mds_points) %>% # MDS coordinates, one row per sample
  bind_cols(Sample = df$Sample) %>% # attach the sample names
  # Strip the trailing digit from each sample name to form the groups
  # (CL1 -> CL, COL2 -> COL). The backslash must be doubled inside an R
  # string literal: "\d" on its own is a parse error.
  mutate(group = gsub("\\d$", "", Sample)) %>%
  ggplot() +
  geom_point(aes(x = V1, y = V2, color = group)) # one point per sample

或者不使用 tidyverse 的管道写法：

# Same analysis without the tidyverse pipeline (ggplot2 must still be loaded
# for the plot itself).

# "binary" distance between samples, computed on every column except the
# first one (Sample).
df_dist <- dist(df[, -1], method = "binary")

# Classical MDS in two dimensions.
mds <- cmdscale(df_dist, eig = TRUE, k = 2)

# Name the coordinate columns explicitly so the plot does not depend on
# data.frame()'s automatic X1/X2 naming of an unnamed matrix.
mds_points <- mds$points
colnames(mds_points) <- c("X1", "X2")

# Strip the trailing digit from each sample name to form the groups
# (CL1 -> CL, COL2 -> COL). The backslash must be doubled inside an R string
# literal: "\d" on its own is a parse error.
for_plot <- data.frame(mds_points, group = gsub("\\d$", "", df$Sample))

ggplot(for_plot) +
  geom_point(aes(x = X1, y = X2, color = group))

其他选项包括：使用 MASS 库中的 isoMDS 执行 Kruskal 非度量多维缩放；或使用 vegan 库中的 metaMDS 执行非度量多维缩放，它通过多次随机起始来寻找稳定解，并提供轴缩放和物种得分。

在 R Studio 中将 Presence/Absence 矩阵转换为聚类分析

Turning a Presence/Absence Matrix into a Cluster Analysis in R Studio

r

cluster-analysis

dendrogram