如何在执行 PCA 后在 Python 中显示散点图?
How do I show a scatter plot in Python after doing PCA?
我自己生成了一份随机数据:一个 18 行 5 列的文本文件,所有条目均为整数。
我成功地完成了 PCA,但现在卡住了。我无法绘制散点图。这是我的代码:
from sklearn.decomposition import PCA
import pylab as pl
from itertools import cycle

# Open the file once inside a context manager so the handle is always closed
# (the earlier bare open() call left its handle open).
with open(r'<path>mydata.txt') as f:
    print(f.read())  # reading from a file
    f.seek(0)  # rewind so the same handle can be parsed line by line
    # One list of ints per non-blank line; blank lines are skipped.
    emp = [
        [int(tok) for tok in fields]
        for fields in (line.split() for line in f)
        if fields
    ]

X = emp
# Fit a whitened 3-component PCA, then project the rows onto those components.
pca = PCA(n_components=3, whiten=True).fit(X)
X_pca = pca.transform(X)  # regular PCA
现在,PCA 完成并且我的方差已知,我该如何绘图?
我的数据集中的示例数据如下所示:
2 1 2 3 0
2 3 2 3 0
1 3 1 1 0
1 5 2 1 0
2 3 1 1 0
3 3 0 1 0
7 1 1 1 1
7 2 2 1 1
1 1 1 4 1
3 2 3 2 1
2 2 2 2 1
1 3 2 3 1
2 3 2 1 2
2 2 1 1 2
7 5 3 2 2
3 4 2 4 2
2 1 1 1 2
7 1 3 3 2
这是你要的吗?
import numpy as np
from matplotlib import pyplot as plt

# Three synthetic point clouds, each stored as [x_values, y_values] with
# 10 normally distributed samples per axis.
data1 = [np.random.normal(0, 0.1, 10), np.random.normal(0, 0.1, 10)]
data2 = [np.random.normal(1, 0.2, 10), np.random.normal(2, 0.3, 10)]
data3 = [np.random.normal(-2, 0.1, 10), np.random.normal(1, 0.5, 10)]

# One scatter call per cloud so each gets its own default colour.
for xs, ys in (data1, data2, data3):
    plt.scatter(xs, ys)

plt.show()
三个不同数据集的结果如下所示:
编辑:
希望我现在能更好地理解您的问题。这里是新代码:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
import pylab as pl
from itertools import cycle

# Parse the whitespace-separated integer table, skipping blank lines.
with open(r'mydata.txt') as f:
    emp = []
    for line in f:
        fields = line.split()
        if fields:
            emp.append([int(i) for i in fields])

X = emp
pca = PCA(n_components=3, whiten=True).fit(X)
X_pca = pca.transform(X)  # regular PCA

jobs = ['A', 'B', 'C']
# The fifth value of every row is used as the job ID (0, 1 or 2).
job_id = np.array([e[4] for e in emp])

fig, axes = plt.subplots(3, 3, figsize=(5, 5))

for row in range(axes.shape[0]):
    for col in range(axes.shape[1]):
        ax = axes[row, col]
        if row == col:
            # Diagonal cells only carry a text label, so hide every tick and
            # tick label. tick_params expects booleans here; the string 'off'
            # is not a valid "hide" value in current Matplotlib.
            ax.tick_params(
                axis='both', which='both',
                bottom=False, top=False,
                labelbottom=False,
                left=False, right=False,
                labelleft=False,
            )
            ax.text(0.5, 0.5, jobs[row], horizontalalignment='center')
        else:
            # Off-diagonal cells: component `row` against component `col`,
            # one colour per job ID (0 -> red, 1 -> green, 2 -> blue).
            for job, colour in enumerate('rgb'):
                mask = job_id == job
                ax.scatter(X_pca[mask, row], X_pca[mask, col], c=colour)

fig.tight_layout()
plt.show()
我分别用 ID 0、1 和 2 表示作业 'A'、'B' 和 'C'。我从 emp 的最后一列创建了一个 numpy 数组来保存这些 ID。在关键的绘图命令中,我按作业 ID 对数据进行了掩码筛选。希望这有帮助。
结果图如下所示:
编辑 2:
如果您只想要一个将 X_pca 的第一列和第二列相互关联的图,则代码变得更加简单:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
import pylab as pl
from itertools import cycle

# Parse the whitespace-separated integer table, skipping blank lines.
emp = []
with open(r'mydata.txt') as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue
        emp.append([int(i) for i in fields])

X = emp
pca = PCA(n_components=3, whiten=True).fit(X)
X_pca = pca.transform(X)  # regular PCA

jobs = ['A', 'B', 'C']
# The fifth value of every row is used as the job ID (0, 1 or 2).
job_id = np.array([e[4] for e in emp])

# Plot the first projected column against the second.
row = 0
col = 1

# One scatter call per job ID: 0 -> red, 1 -> green, 2 -> blue.
for job, colour in zip((0, 1, 2), ('r', 'g', 'b')):
    plt.scatter(X_pca[:, row][job_id == job], X_pca[:, col][job_id == job], c=colour)

plt.show()
结果如下所示:
我强烈建议您阅读这些示例中使用的函数的文档。
根据您在评论中提到想要得到这样的图 (https://imgur.com/a/NJAzU),以下是使用 sklearn 库的实现方法:
在这个例子中我使用的是鸢尾花(Iris)数据集:
第 1 部分:仅绘制散点图
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from numpy import linalg as LA
import pandas as pd
from scipy import stats

# Load the iris measurements (X) and their class labels (y).
iris = datasets.load_iris()
X, y = iris.data, iris.target

# In general a good idea is to scale the data
X = stats.zscore(X)

# Keep all components; x_new holds the data projected onto them.
pca = PCA()
x_new = pca.fit_transform(X)

# Scatter of the first two projected columns, coloured by class label.
pc1, pc2 = x_new[:, 0], x_new[:, 1]
plt.scatter(pc1, pc2, c=y)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
结果 1
第 2 部分:如果您想绘制著名的双标图
# Create the biplot function
def biplot(score, coeff, labels=None, colors=None):
    """Draw a PCA biplot on the current matplotlib axes.

    The sample scores are rescaled by the reciprocal of their range on each
    axis and drawn as a scatter plot; every row of ``coeff`` is drawn as a red
    arrow from the origin with a green text label at 1.15x the arrow tip.

    Args:
        score: 2-D array; columns 0 and 1 are the sample coordinates.
        coeff: 2-D array with one row per variable; columns 0 and 1 are the
            arrow end points.
        labels: Optional sequence of variable names, indexed like the rows of
            ``coeff``. Defaults to "Var1", "Var2", ...
        colors: Optional per-sample colour values forwarded to
            ``plt.scatter(c=...)``. Passed explicitly so the function does not
            depend on a module-level ``y`` being defined.
    """
    xs = score[:, 0]
    ys = score[:, 1]
    n = coeff.shape[0]
    # Rescale by the data range so the points fit the fixed [-1, 1] limits.
    scalex = 1.0 / (xs.max() - xs.min())
    scaley = 1.0 / (ys.max() - ys.min())
    plt.scatter(xs * scalex, ys * scaley, c=colors)
    for i in range(n):
        plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
        name = "Var" + str(i + 1) if labels is None else labels[i]
        plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, name,
                 color='g', ha='center', va='center')
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()


# Call the function. Use only the 2 PCs.
biplot(x_new[:, 0:2], np.transpose(pca.components_[0:2, :]), colors=y)
plt.show()
结果 2
我自己生成了一份随机数据:一个 18 行 5 列的文本文件,所有条目均为整数。
我成功地完成了 PCA,但现在卡住了。我无法绘制散点图。这是我的代码:
from sklearn.decomposition import PCA
import pylab as pl
from itertools import cycle

# Open the file once inside a context manager so the handle is always closed
# (the earlier bare open() call left its handle open).
with open(r'<path>mydata.txt') as f:
    print(f.read())  # reading from a file
    f.seek(0)  # rewind so the same handle can be parsed line by line
    # One list of ints per non-blank line; blank lines are skipped.
    emp = [
        [int(tok) for tok in fields]
        for fields in (line.split() for line in f)
        if fields
    ]

X = emp
# Fit a whitened 3-component PCA, then project the rows onto those components.
pca = PCA(n_components=3, whiten=True).fit(X)
X_pca = pca.transform(X)  # regular PCA
现在,PCA 完成并且我的方差已知,我该如何绘图?
我的数据集中的示例数据如下所示:
2 1 2 3 0
2 3 2 3 0
1 3 1 1 0
1 5 2 1 0
2 3 1 1 0
3 3 0 1 0
7 1 1 1 1
7 2 2 1 1
1 1 1 4 1
3 2 3 2 1
2 2 2 2 1
1 3 2 3 1
2 3 2 1 2
2 2 1 1 2
7 5 3 2 2
3 4 2 4 2
2 1 1 1 2
7 1 3 3 2
这是你要的吗?
import numpy as np
from matplotlib import pyplot as plt

# Three synthetic point clouds, each stored as [x_values, y_values] with
# 10 normally distributed samples per axis.
data1 = [np.random.normal(0, 0.1, 10), np.random.normal(0, 0.1, 10)]
data2 = [np.random.normal(1, 0.2, 10), np.random.normal(2, 0.3, 10)]
data3 = [np.random.normal(-2, 0.1, 10), np.random.normal(1, 0.5, 10)]

# One scatter call per cloud so each gets its own default colour.
for xs, ys in (data1, data2, data3):
    plt.scatter(xs, ys)

plt.show()
三个不同数据集的结果如下所示:
编辑:
希望我现在能更好地理解您的问题。这里是新代码:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
import pylab as pl
from itertools import cycle

# Parse the whitespace-separated integer table, skipping blank lines.
with open(r'mydata.txt') as f:
    emp = []
    for line in f:
        fields = line.split()
        if fields:
            emp.append([int(i) for i in fields])

X = emp
pca = PCA(n_components=3, whiten=True).fit(X)
X_pca = pca.transform(X)  # regular PCA

jobs = ['A', 'B', 'C']
# The fifth value of every row is used as the job ID (0, 1 or 2).
job_id = np.array([e[4] for e in emp])

fig, axes = plt.subplots(3, 3, figsize=(5, 5))

for row in range(axes.shape[0]):
    for col in range(axes.shape[1]):
        ax = axes[row, col]
        if row == col:
            # Diagonal cells only carry a text label, so hide every tick and
            # tick label. tick_params expects booleans here; the string 'off'
            # is not a valid "hide" value in current Matplotlib.
            ax.tick_params(
                axis='both', which='both',
                bottom=False, top=False,
                labelbottom=False,
                left=False, right=False,
                labelleft=False,
            )
            ax.text(0.5, 0.5, jobs[row], horizontalalignment='center')
        else:
            # Off-diagonal cells: component `row` against component `col`,
            # one colour per job ID (0 -> red, 1 -> green, 2 -> blue).
            for job, colour in enumerate('rgb'):
                mask = job_id == job
                ax.scatter(X_pca[mask, row], X_pca[mask, col], c=colour)

fig.tight_layout()
plt.show()
我分别用 ID 0、1 和 2 表示作业 'A'、'B' 和 'C'。我从 emp 的最后一列创建了一个 numpy 数组来保存这些 ID。在关键的绘图命令中,我按作业 ID 对数据进行了掩码筛选。希望这有帮助。
结果图如下所示:
编辑 2:
如果您只想要一个将 X_pca 的第一列和第二列相互关联的图,则代码变得更加简单:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
import pylab as pl
from itertools import cycle

# Parse the whitespace-separated integer table, skipping blank lines.
emp = []
with open(r'mydata.txt') as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue
        emp.append([int(i) for i in fields])

X = emp
pca = PCA(n_components=3, whiten=True).fit(X)
X_pca = pca.transform(X)  # regular PCA

jobs = ['A', 'B', 'C']
# The fifth value of every row is used as the job ID (0, 1 or 2).
job_id = np.array([e[4] for e in emp])

# Plot the first projected column against the second.
row = 0
col = 1

# One scatter call per job ID: 0 -> red, 1 -> green, 2 -> blue.
for job, colour in zip((0, 1, 2), ('r', 'g', 'b')):
    plt.scatter(X_pca[:, row][job_id == job], X_pca[:, col][job_id == job], c=colour)

plt.show()
结果如下所示:
我强烈建议您阅读这些示例中使用的函数的文档。
根据您在评论中提到想要得到这样的图 (https://imgur.com/a/NJAzU),以下是使用 sklearn 库的实现方法:
在这个例子中我使用的是鸢尾花(Iris)数据集:
第 1 部分:仅绘制散点图
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from numpy import linalg as LA
import pandas as pd
from scipy import stats

# Load the iris measurements (X) and their class labels (y).
iris = datasets.load_iris()
X, y = iris.data, iris.target

# In general a good idea is to scale the data
X = stats.zscore(X)

# Keep all components; x_new holds the data projected onto them.
pca = PCA()
x_new = pca.fit_transform(X)

# Scatter of the first two projected columns, coloured by class label.
pc1, pc2 = x_new[:, 0], x_new[:, 1]
plt.scatter(pc1, pc2, c=y)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
结果 1
第 2 部分:如果您想绘制著名的双标图
# Create the biplot function
def biplot(score, coeff, labels=None, colors=None):
    """Draw a PCA biplot on the current matplotlib axes.

    The sample scores are rescaled by the reciprocal of their range on each
    axis and drawn as a scatter plot; every row of ``coeff`` is drawn as a red
    arrow from the origin with a green text label at 1.15x the arrow tip.

    Args:
        score: 2-D array; columns 0 and 1 are the sample coordinates.
        coeff: 2-D array with one row per variable; columns 0 and 1 are the
            arrow end points.
        labels: Optional sequence of variable names, indexed like the rows of
            ``coeff``. Defaults to "Var1", "Var2", ...
        colors: Optional per-sample colour values forwarded to
            ``plt.scatter(c=...)``. Passed explicitly so the function does not
            depend on a module-level ``y`` being defined.
    """
    xs = score[:, 0]
    ys = score[:, 1]
    n = coeff.shape[0]
    # Rescale by the data range so the points fit the fixed [-1, 1] limits.
    scalex = 1.0 / (xs.max() - xs.min())
    scaley = 1.0 / (ys.max() - ys.min())
    plt.scatter(xs * scalex, ys * scaley, c=colors)
    for i in range(n):
        plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
        name = "Var" + str(i + 1) if labels is None else labels[i]
        plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, name,
                 color='g', ha='center', va='center')
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()


# Call the function. Use only the 2 PCs.
biplot(x_new[:, 0:2], np.transpose(pca.components_[0:2, :]), colors=y)
plt.show()
结果 2