绘制聚类矩阵

Plot cluster matrix

我想基于以下 pandas 数据框,用 scikit-learn 的 K-means 聚类结果绘制一个聚类矩阵:

import pandas as pd
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()  # toy dataset
# Pass the feature names directly: wrapping them in a list makes pandas
# build a one-level MultiIndex instead of plain column labels.
data = pd.DataFrame(cancer.data, columns=cancer.feature_names)
# Copy the slice so renaming its columns works on an independent frame.
df = data.iloc[:, 4:8].copy()  # select subset
df.columns = ['smoothness', 'compactness', 'concavity', 'concave points']
df

+----+--------------+---------------+-------------+------------------+
|    |   smoothness |   compactness |   concavity |   concave points |
|----+--------------+---------------+-------------+------------------|
|  0 |      0.1184  |       0.2776  |      0.3001 |          0.1471  |
|  1 |      0.08474 |       0.07864 |      0.0869 |          0.07017 |
|  2 |      0.1096  |       0.1599  |      0.1974 |          0.1279  |
|  3 |      0.1425  |       0.2839  |      0.2414 |          0.1052  |
|  4 |      0.1003  |       0.1328  |      0.198  |          0.1043  |
+----+--------------+---------------+-------------+------------------+

您可以使用:

def kmeans_scatterplot(df, n_clusters):
    """Draw a grid of pairwise scatter plots coloured by K-means cluster.

    For every ordered pair of distinct columns, K-means is fitted on just
    those two columns and the points are coloured by the resulting labels.
    Cells on the diagonal receive no scatter.

    Args:
        df: DataFrame whose columns are the features to compare.
        n_clusters: Number of clusters requested from each K-means fit.
    """
    axs_length = len(df.columns)
    # squeeze=False keeps axs two-dimensional even for a single-column
    # frame, so the axs[i][j] indexing below always works.
    _, axs = plt.subplots(axs_length, axs_length, figsize=(20, 20), squeeze=False)

    for i, column_i in enumerate(df):
        for j, column_j in enumerate(df):
            ax = axs[i][j]

            # create plot
            if column_i != column_j:
                df_temp = df[[column_i, column_j]]
                km = KMeans(init='k-means++', n_clusters=n_clusters)
                km_clustering = km.fit(df_temp)
                # Grid column j supplies x and grid row i supplies y, so the
                # plotted data agrees with the axis labels set below.
                ax.scatter(df_temp[column_j], df_temp[column_i], c=km_clustering.labels_, cmap='rainbow', alpha=0.7, edgecolors='b')

            # only show left and bottom labels
            if i == axs_length - 1:
                ax.set_xlabel(column_j)
            if j == 0:
                ax.set_ylabel(column_i)

# Plot the pairwise cluster grid for df using two clusters.
kmeans_scatterplot(df, 2)

结果:

如果我理解正确(IIUC),您可以用 seaborn.pairplot 来简化,并将 KMeans 的 labels_ 作为 hue 参数传入。例如:

import seaborn as sns
from sklearn.cluster import KMeans

def kmeans_scatterplot(df, n_clusters):
    """Fit K-means on all columns of ``df`` and draw a pair plot coloured by cluster.

    Args:
        df: DataFrame passed whole to the K-means fit.
        n_clusters: Number of clusters requested from K-means.
    """
    model = KMeans(init='k-means++', n_clusters=n_clusters).fit(df)
    # A temporary 'hue' column carries the labels so pairplot can colour by it.
    labelled = df.assign(hue=model.labels_)
    sns.pairplot(labelled, hue='hue')

# Draw the seaborn pair plot for df using two clusters.
kmeans_scatterplot(df, 2)

[输出]