按方向和大小的集群组 - Python

Question

我希望使用 python 根据方向和大小对向量进行聚类。我找到了少量使用 R 的示例，但没有找到使用 python 的示例。不要与散点的标准 k 均值聚类混淆，我实际上是想对整个向量进行聚类。

下面取两组xy点生成一个向量。然后我希望根据长度和方向对这些向量进行聚类。

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

# Demo data: 100 rows of random integers in [0, 20). Each row is treated as one
# vector running from the start point (A, B) to the end point (C, D).
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'

fig,ax = plt.subplots()
ax.set_xlim(-5, 25)
ax.set_ylim(-5, 25)

# Start coordinates of every vector.
A = df['A']
B = df['B']

# End coordinates of every vector.
C = df['C']
D = df['D']

# Draw each vector as an arrow anchored at (A, B) with components (C-A, D-B),
# expressed in data units (angles='xy', scale_units='xy', scale=1).
ax.quiver(A, B, (C-A), (D-B), angles = 'xy', scale_units = 'xy', scale = 1, alpha = 0.5) 

# Feature matrix for clustering: the four raw coordinates of each vector.
X_1 = np.array(df[['A','B','C','D']])

# K-means with a fixed number of 20 clusters on the raw coordinates.
model = KMeans(n_clusters = 20)
model.fit(X_1)

# Store the cluster index of every vector next to its coordinates.
cluster_labels = model.predict(X_1)
df['n_cluster'] = cluster_labels
# Each centroid has four components, labelled here as start/end coordinates.
centroids_1 = pd.DataFrame(data = model.cluster_centers_, columns = ['start_x', 'start_y', 'end_x', 'end_y'])
cc = model.cluster_centers_

# Split the centroid matrix into its four coordinate columns.
a = cc[:, 0]
b = cc[:, 1]
c = cc[:, 2]
d = cc[:, 3]

# Overlay the centroids as arrows on the same axes, drawn more opaque than the data.
lc1 = ax.quiver(a, b, (c-a), (d-b), angles = 'xy', scale_units = 'xy', scale = 1, alpha = 0.8)

下图显示示例

Answer 1

这个怎么样:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import hdbscan

# Demo data: 100 rows of random integers in [0, 20). Each row is one vector
# running from the start point (A, B) to the arrival point (C, D).
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'

A = df['A'] #X start
B = df['B'] #Y start
C = df['C'] #X arrive
D = df['D'] #Y arrive

clusterer = hdbscan.HDBSCAN()

# Clustering features: Euclidean length and direction (in degrees) of each vector.
df['LENGTH'] = np.sqrt(np.square(df.C-df.A) + np.square(df.D-df.B))
df['DIRECTION'] = np.degrees(np.arctan2(df.D-df.B, df.C-df.A))


coords = df[['LENGTH', 'DIRECTION']].values
clusterer.fit_predict(coords)

# One label per input row, in the same order as `coords` (and therefore `df`).
cluster_labels = clusterer.labels_
# Number of distinct label values (this includes the noise label -1 when present).
num_clusters = len(set(cluster_labels))
# Per-label summary: the member points and how many there are.
clusters = pd.DataFrame(
        [(coords[cluster_labels==n], len(coords[cluster_labels==n])) for n in range(num_clusters)],
        columns=["points", "weight"]
        )

colors = {0:"green", 1:"blue", 2:"red", 3:"yellow", 4:"pink"}
# Assign labels positionally instead of merging the cluster members back on
# (LENGTH, DIRECTION): a left merge on feature values multiplies rows whenever
# several vectors share the same features, silently growing `df`.
# HDBSCAN marks noise points with -1; they are stored as NaN.
df['CLUSTER'] = np.where(cluster_labels >= 0, cluster_labels, np.nan)
# Noise points and clusters beyond the five named colors are drawn in black.
df['COLOR'] = df['CLUSTER'].map(colors).fillna('black')

fig,ax = plt.subplots()
ax.set_xlim(-5, 25)
ax.set_ylim(-5, 25)

# One arrow per vector in data units, colored by cluster.
ax.quiver(df.A, df.B, (df.C-df.A), (df.D-df.B), angles='xy', scale_units='xy', scale=1, alpha=0.5, color=df.COLOR)

这将使用基于长度和方向的聚类（方向被转换为度数，弧度的小范围与我第一次尝试时的模型不太匹配）。

我不认为这是一个非常“笛卡尔”的解决方案，因为模型所分析的两个值（长度和角度）并不使用相同的度量单位......但视觉效果还不错......

我确实尝试了另一种基于 4 个坐标的匹配，这种方式更加严谨。但它（完全符合预期）是按空间的子区域（如果存在的话）对向量进行聚类的：

# Alternative feature set: cluster on the four raw coordinates of each vector.
coords = df[['A', 'B', 'C', 'D']].values
clusterer.fit_predict(coords)

# One label per input row, in the same order as `coords` (and therefore `df`).
cluster_labels = clusterer.labels_
# Number of distinct label values (this includes the noise label -1 when present).
num_clusters = len(set(cluster_labels))
# Per-label summary: the member points and how many there are.
clusters = pd.DataFrame(
        [(coords[cluster_labels==n], len(coords[cluster_labels==n])) for n in range(num_clusters)],
        columns=["points", "weight"]
        )

colors = {0:"green", 1:"blue", 2:"red", 3:"yellow", 4:"pink"}
# Assign labels positionally instead of merging the cluster members back on
# (A, B, C, D): a left merge on coordinate values multiplies rows whenever
# identical vectors occur, silently growing `df`.
# HDBSCAN marks noise points with -1; they are stored as NaN.
df['CLUSTER'] = np.where(cluster_labels >= 0, cluster_labels, np.nan)
# Noise points and clusters beyond the five named colors are drawn in black.
df['COLOR'] = df['CLUSTER'].map(colors).fillna('black')

编辑

我又试了一次，基于（非常好的）建议：由于 0/2π 附近存在不连续性，角度本身不是一个好的变量；所以我选择同时使用正弦和余弦。我还对长度进行了缩放（使 3 个变量具有相近的尺度）：

所以结果将是：

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import robust_scale
import hdbscan

# Demo data: 100 rows of random integers in [0, 20). Each row is one vector
# running from the start point (A, B) to the arrival point (C, D).
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'


A = df['A'] #X start
B = df['B'] #Y start
C = df['C'] #X arrive
D = df['D'] #Y arrive
clusterer = hdbscan.HDBSCAN()


# Clustering features:
#   LENGTH    - Euclidean length of the vector, passed through robust_scale
#   DIRECTION - angle of the vector in radians, as returned by arctan2
#   COS / SIN - cosine and sine of that angle
df['LENGTH'] = robust_scale(np.sqrt(np.square(df.C-df.A) + np.square(df.D-df.B)))
df['DIRECTION'] = np.arctan2(df.D-df.B, df.C-df.A)
df['COS'] = np.cos(df['DIRECTION'])
df['SIN'] = np.sin(df['DIRECTION'])


# Only these three columns are fed to the clusterer (DIRECTION itself is not).
columns = ['LENGTH', 'COS', 'SIN']

# NOTE(review): this replaces the HDBSCAN instance created above; the earlier
# one is never fitted.
clusterer = hdbscan.HDBSCAN()
values = df[columns].values
clusterer.fit_predict(values)

# Labels are read back from the fitted clusterer rather than from the return
# value of fit_predict.
cluster_labels = clusterer.labels_
# Number of distinct label values.
# NOTE(review): presumably one of these is HDBSCAN's noise label; confirm.
num_clusters = len(set(cluster_labels))
# Per-label summary: the member points and how many there are, for labels
# 0 .. num_clusters-1.
clusters = pd.DataFrame(
        [(values[cluster_labels==n], len(values[cluster_labels==n])) for n in range(num_clusters)],
        columns=["points", "weight"]
        )


def get_cmap(n, name='hsv'):
    '''
    Returns a function that maps each index in 0, 1, ..., n-1 to a distinct 
    RGB color; the keyword argument name must be a standard mpl colormap name.
    
    Credits to @Ali
    
    '''
    # matplotlib.cm.get_cmap (reached here as plt.cm.get_cmap) was deprecated in
    # matplotlib 3.7 and removed in 3.9. pyplot.get_cmap accepts the same
    # (name, lut) arguments and is available in both old and new releases.
    return plt.get_cmap(name, n)

# One color per label value; the colormap is sampled with one spare entry.
cmap = get_cmap(num_clusters+1)
colors = {x:cmap(x) for x in range(num_clusters)}

# HDBSCAN marks noise points with -1; real clusters are numbered from 0.
num_found = len(set(cluster_labels) - {-1})

# Assign labels positionally instead of merging the cluster members back on the
# feature columns: a left merge on feature values multiplies rows whenever
# several vectors share the same features, silently growing `df`.
# Noise points are filed under the last index (num_clusters-1) so that they get
# a panel of their own after the real clusters.
df['CLUSTER'] = np.where(cluster_labels >= 0, cluster_labels, num_clusters-1).astype(float)
df['COLOR'] = df['CLUSTER'].map(colors)
# Report real clusters only; this stays correct when no point is noise.
print("Number of clusters : ", num_found)

# Two panels per row, enough rows for one panel per label value.
nrows = (num_clusters + 1) // 2
# squeeze=False keeps `axes` two-dimensional even when nrows == 1; without it
# a single row comes back as a 1-D array and cannot be flattened row by row.
fig,axes = plt.subplots(nrows=nrows, ncols=2, squeeze=False)
axes = list(axes.ravel())
for k,ax in enumerate(axes):

    ax.set_xlim(-5, 25)
    ax.set_ylim(-5, 25)
    ax.set_aspect('equal', adjustable='box')
    # Only panels that show a real cluster get a title; the noise panel and any
    # spare panel stay untitled.
    if k < num_found:
        ax.set_title(f"CLUSTER #{k+1}", fontsize=10)
    this_df = df[df.CLUSTER==k]
    ax.quiver(
        this_df.A, #X
        this_df.B, #Y
        (this_df.C-this_df.A), #X component of vector
        (this_df.D-this_df.B), #Y component of vector
        angles = 'xy', 
        scale_units = 'xy', 
        scale = 1, 
        color=this_df.COLOR
        )

结果要好得多（尽管它很大程度上取决于输入数据集）；最后一个子图是指未发现在簇内的向量：

编辑#2

如果“方向”是指 [0, π) 区间内的角度（即无向向量），则在计算余弦/正弦之前需要加入以下代码：

# Select the rows whose angle is negative and shift those angles by pi.
is_negative = df["DIRECTION"] < 0
ix = df.index[is_negative.to_numpy()]
df.loc[ix, "DIRECTION"] += np.pi

Answer 2

也许您还可以通过使用此函数将归一化向量投影到两个单位向量 (1,0) 和 (0,1) 上来对角度（除向量范数之外）进行聚类。直接处理投影（本质上是角度），我们不会因余弦函数的周期性而陷入困境

def get_norm_and_angle(e1):
    """Return the norm and unit-direction components of each row vector.

    Parameters
    ----------
    e1 : array-like of shape (n, 2)
        One 2-D vector per row.

    Returns
    -------
    numpy.ndarray of shape (n, 3)
        Columns are: Euclidean norm, projection of the normalized vector on
        (1, 0), projection of the normalized vector on (0, 1).
        A zero-length vector has no direction; its two projections are 0
        instead of NaN so the result can be fed to a clustering model.
    """
    e1 = np.asarray(e1, dtype=float)
    e1_norm = np.linalg.norm(e1,axis=1)
    # Divide by 1 where the norm is 0 to avoid 0/0 -> NaN; the row is all
    # zeros there, so the normalized row stays all zeros.
    safe_norm = np.where(e1_norm > 0, e1_norm, 1.0)
    e1 = e1 / safe_norm[:,None]
    e2 = np.array([1,0])
    e3 = np.array([0,1])
    
    return np.stack((e1_norm,e1@e2,e1@e3),axis=1)

基于此函数，这是一种可能的解决方案，它对我们要查找的聚类数量没有限制。在下面的脚本中，五个特征用于聚类

向量范数
x 轴和 y 轴上的向量投影
矢量起点

具备这五个特点

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
from sklearn.cluster import KMeans


def get_norm_and_angle(e1):
    """Return the norm and unit-direction components of each row vector.

    Parameters
    ----------
    e1 : array-like of shape (n, 2)
        One 2-D vector per row.

    Returns
    -------
    numpy.ndarray of shape (n, 3)
        Columns are: Euclidean norm, projection of the normalized vector on
        (1, 0), projection of the normalized vector on (0, 1).
        A zero-length vector has no direction; its two projections are 0
        instead of NaN so the result can be fed to a clustering model.
    """
    e1 = np.asarray(e1, dtype=float)
    e1_norm = np.linalg.norm(e1,axis=1)
    # Divide by 1 where the norm is 0 to avoid 0/0 -> NaN; the row is all
    # zeros there, so the normalized row stays all zeros.
    safe_norm = np.where(e1_norm > 0, e1_norm, 1.0)
    e1 = e1 / safe_norm[:,None]
    e2 = np.array([1,0])
    e3 = np.array([0,1])
    
    return np.stack((e1_norm,e1@e2,e1@e3),axis=1)


# Demo data: cumulative sums (down the rows) of random integers in [0, 10), so
# every coordinate column is non-decreasing from one row to the next.
data = np.cumsum(np.random.randint(0,10,size=(50, 4)),axis=0)
df = pd.DataFrame(data, columns=list('ABCD'))

# (A, B) is used as the start point and (C, D) as the end point of each vector.
A = df['A'];B = df['B']
C = df['C'];D = df['D']

starting_points = np.stack((A,B),axis=1)
# Displacement of each vector: end point minus start point.
vectors = np.stack((C,D),axis=1) - np.stack((A,B),axis=1)
# Feature matrix with five columns per vector:
# norm, x projection, y projection, start x, start y.
different_view = get_norm_and_angle(vectors)
different_view = np.hstack((different_view,starting_points))

num_clusters = 8
model = KMeans(n_clusters=num_clusters)
model.fit(different_view)

# Store the cluster index of every vector next to its coordinates.
cluster_labels = model.predict(different_view)
df['n_cluster'] = cluster_labels
cluster_centers = model.cluster_centers_
# Representative displacement per cluster: centroid norm (column 0) times the
# centroid's x/y projections (columns 1 and 2).
cluster_offsets = cluster_centers[:,0][:,None] * cluster_centers[:,1:3]
# Representative start per cluster: mean start point of the cluster's members.
cluster_starts = np.vstack([np.mean(starting_points[cluster_labels==ind],axis=0) for ind in range(num_clusters)])
# One row per cluster: start x, start y, end x, end y of the representative arrow.
main_streams = np.hstack((cluster_starts,cluster_starts+cluster_offsets))
a,b,c,d = main_streams.T

fig,ax = plt.subplots(figsize=(8,8))
# Axis limits span the data range with a 10% margin of the maximum on each side.
ax.set_xlim(-np.max(data)*.1,np.max(data)*1.1)
ax.set_ylim(-np.max(data)*.1,np.max(data)*1.1)

colors = sns.color_palette(n_colors=num_clusters)
# Representative arrows, one color per cluster, drawn on top (zorder=100).
lc1 = ax.quiver(a, b, (c-a), (d-b), angles = 'xy', scale_units = 'xy', color = colors, scale = 1, alpha = 0.8, zorder=100)
# Original vectors, faint in the background.
# NOTE(review): scale=.6 here differs from scale=1 above, so these arrows are
# not drawn at the same scale as the representative ones; confirm this is intended.
lc2 = ax.quiver(A, B, (C-A), (D-B), angles = 'xy', scale_units = 'xy', scale = .6, alpha = 0.2)

# Mark each start point with the color of the cluster it was assigned to.
start_colors = [colors[ind] for ind in cluster_labels]
ax.scatter(starting_points[:,0],starting_points[:,1],c=start_colors)

plt.show()

样本输出是

如图所示，起点相近的向量聚类到同一组。

按方向和大小的集群组 - Python

Cluster groups by direction and magnitude - Python

python

cluster-analysis

scipy