如何绘制聚类中心?

How to plot the cluster centers?

使用下面这个从头实现的 k-means 聚类,我如何在 Iris 数据集的散点图(见图片)上绘制 k=3 时的初始随机聚类中心?

import numpy as np
from scipy.spatial.distance import cdist 
 
def kmeans(x, k, no_of_iterations):
    """Cluster the rows of ``x`` into ``k`` groups with Lloyd's k-means.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Data points, one per row.
    k : int
        Number of clusters. The initial centroids are ``k`` distinct rows of
        ``x`` drawn with ``np.random.choice``, which raises ``ValueError``
        when ``k`` exceeds the number of rows.
    no_of_iterations : int
        Number of update/reassign rounds to run after the initial assignment.

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        Index (``0 .. k-1``) of the nearest centroid for every row of ``x``.
    """
    x = np.asarray(x)

    # Randomly choose k distinct rows as the starting centroids.
    seed_rows = np.random.choice(len(x), k, replace=False)
    centroids = x[seed_rows, :]

    # Initial assignment: every point goes to its nearest centroid.
    points = cdist(x, centroids, 'euclidean').argmin(axis=1)

    for _ in range(no_of_iterations):
        updated = []
        for label in range(k):
            members = x[points == label]
            if len(members):
                # Move the centroid to the mean of the points it owns.
                updated.append(members.mean(axis=0))
            else:
                # An empty cluster has no mean (it would be NaN, and argmin
                # would then pull every point into it), so keep the old
                # centroid instead.
                updated.append(centroids[label])
        centroids = np.vstack(updated)

        points = cdist(x, centroids, 'euclidean').argmin(axis=1)

    return points

您可以通过 matplotlib 的 scatter 函数绘制数据点和聚类中心。颜色可以根据 kmeans 计算出的分组来分配。

这是一个示例(kmeans 函数现在还会返回质心)。

import matplotlib.pyplot as plt
import seaborn as sns  # for the iris dataset
import numpy as np
from scipy.spatial.distance import cdist

def kmeans(x, k, no_of_iterations=100):
    """Cluster the rows of ``x`` into ``k`` groups with Lloyd's k-means.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Data points, one per row.
    k : int
        Number of clusters. The initial centroids are ``k`` distinct rows of
        ``x`` drawn with ``np.random.choice``, which raises ``ValueError``
        when ``k`` exceeds the number of rows.
    no_of_iterations : int, default 100
        Number of update/reassign rounds to run after the initial assignment.

    Returns
    -------
    points : numpy.ndarray, shape (n_samples,)
        Index (``0 .. k-1``) of the nearest centroid for every row of ``x``.
    centroids : numpy.ndarray, shape (k, n_features)
        Final centroid coordinates; row ``i`` belongs to cluster ``i``.
    """
    x = np.asarray(x)

    # Randomly choose k distinct rows as the starting centroids.
    seed_rows = np.random.choice(len(x), k, replace=False)
    centroids = x[seed_rows, :]

    # Initial assignment: every point goes to its nearest centroid.
    points = cdist(x, centroids, 'euclidean').argmin(axis=1)

    for _ in range(no_of_iterations):
        updated = []
        for label in range(k):
            members = x[points == label]
            if len(members):
                # Move the centroid to the mean of the points it owns.
                updated.append(members.mean(axis=0))
            else:
                # An empty cluster has no mean (it would be NaN, and argmin
                # would then pull every point into it), so keep the old
                # centroid instead.
                updated.append(centroids[label])
        centroids = np.vstack(updated)
        points = cdist(x, centroids, 'euclidean').argmin(axis=1)
    return points, centroids

# Two sepal measurements of the iris data give a 2-D point cloud to cluster.
iris = sns.load_dataset('iris')
x = iris[['sepal_length', 'sepal_width']].to_numpy()

k = 3
points, centroids = kmeans(x, k)

# A cluster's centroid ring and its member dots share one palette colour.
colors = plt.cm.Set2.colors

# Hollow rings mark the centroids; they are drawn in their own pass so that
# all centroid entries are created before any point-set entry.
for val, color in enumerate(colors[:k]):
    ring_x, ring_y = centroids[val]
    plt.scatter(ring_x, ring_y, s=100, lw=3, facecolor='none', edgecolor=color,
                label=f'centroid {val}')

# Filled dots: the samples assigned to each cluster.
for val, color in enumerate(colors[:k]):
    members = x[points == val]
    plt.scatter(members[:, 0], members[:, 1], color=color, label=f'set {val}')

plt.legend(ncol=2)
plt.show()

下面尝试将给定的目标类别与 kmeans 的近似结果一起显示。请注意,kmeans 簇编号的顺序是随机的。较大的背景圆圈表示给定的目标类别;较小的圆圈来自 kmeans,它们很好地聚集在各自的质心周围。

from sklearn.datasets import load_iris

# Reference labelling: the species shipped with the dataset, restricted to
# the first two feature columns so everything fits on one 2-D plot.
iris_data = load_iris()
x = iris_data.data[:, :2]

# Large translucent discs in the background show each sample's given species.
color_givens = ['magenta', 'gold', 'cyan']
for val, (name, color) in enumerate(zip(iris_data.target_names, color_givens)):
    given = x[iris_data.target == val]
    plt.scatter(given[:, 0], given[:, 1], s=150, alpha=0.6, color=color,
                label=f'given {name}')

# The k-means result is drawn on top with a different palette.
k = 3
points, centroids = kmeans(x, k)
colors_kmeans = plt.cm.Set1.colors

# Hollow rings mark the centroids found by k-means.
for val, color in enumerate(colors_kmeans[:k]):
    ring_x, ring_y = centroids[val]
    plt.scatter(ring_x, ring_y, s=150, lw=3, facecolor='none', edgecolor=color,
                label=f'centroid {val}')

# Small filled dots: the samples assigned to each k-means cluster.
for val, color in enumerate(colors_kmeans[:k]):
    members = x[points == val]
    plt.scatter(members[:, 0], members[:, 1], color=color, label=f'set {val}')

# Axis captions come from the dataset's names for the two plotted columns.
x_caption, y_caption = iris_data.feature_names[:2]
plt.xlabel(x_caption)
plt.ylabel(y_caption)
plt.legend(ncol=3)
plt.show()