如何在 python 中实施 EM-GMM?

How can implement EM-GMM in python?

我实施EM algorithm for GMM using this post GMMs and Maximum Likelihood Optimization Using NumPy失败如下:

import numpy as np

def PDF(data, means, variances):
    return 1/(np.sqrt(2 * np.pi * variances) + eps) * np.exp(-1/2 * (np.square(data - means) / (variances + eps)))

def EM_GMM(data, k, iterations):
    weights = np.ones((k, 1)) / k # shape=(k, 1)
    means = np.random.choice(data, k)[:, np.newaxis] # shape=(k, 1)
    variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1)

    data = np.repeat(data[np.newaxis, :], k, 0) # shape=(k, n)

    for step in range(iterations):
        # Expectation step
        likelihood = PDF(data, means, np.sqrt(variances)) # shape=(k, n)

        # Maximization step
        b = likelihood * weights # shape=(k, n)
        b /= np.sum(b, axis=1)[:, np.newaxis] + eps

        # updage means, variances, and weights
        means = np.sum(b * data, axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps)
        variances = np.sum(b * np.square(data - means), axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps)
        weights = np.mean(b, axis=1)[:, np.newaxis]
        
    return means, variances

当我 运行 一维时间序列数据集上的算法时,k 等于 3,它 returns 输出如下:

array([[0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    3.05053810e-003, 2.36989898e-025, 2.36989898e-025,
    1.32797395e-136, 6.91134950e-031, 5.47347807e-001,
    1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
    1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
    1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
    1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
    1.44637007e+000, 2.25849208e-064, 0.00000000e+000,
    1.61228562e-303, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 3.94387272e-242,
    1.13078186e+000, 2.53108878e-001, 5.33548114e-001,
    9.14920432e-001, 2.07015697e-013, 4.45250680e-038,
    1.43000602e+000, 1.28781615e+000, 1.44821615e+000,
    1.18186109e+000, 3.21610659e-002, 3.21610659e-002,
    3.21610659e-002, 3.21610659e-002, 3.21610659e-002,
    2.47382844e-039, 0.00000000e+000, 2.09150855e-200,
    0.00000000e+000, 0.00000000e+000],
   [5.93203066e-002, 1.01647068e+000, 5.99299162e-001,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 2.14690238e-010,
    2.49337135e-191, 5.10499986e-001, 9.32658804e-001,
    1.21148135e+000, 1.13315278e+000, 2.50324069e-237,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 1.73966953e-125, 2.53559290e-275,
    1.42960975e-065, 7.57552338e-001],
   [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
    3.05053810e-003, 2.36989898e-025, 2.36989898e-025,
    1.32797395e-136, 6.91134950e-031, 5.47347807e-001,
    1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
    1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
    1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
    1.44637007e+000, 1.44637007e+000, 1.44637007e+000,
    1.44637007e+000, 2.25849208e-064, 0.00000000e+000,
    1.61228562e-303, 0.00000000e+000, 0.00000000e+000,
    0.00000000e+000, 0.00000000e+000, 3.94387272e-242,
    1.13078186e+000, 2.53108878e-001, 5.33548114e-001,
    9.14920432e-001, 2.07015697e-013, 4.45250680e-038,
    1.43000602e+000, 1.28781615e+000, 1.44821615e+000,
    1.18186109e+000, 3.21610659e-002, 3.21610659e-002,
    3.21610659e-002, 3.21610659e-002, 3.21610659e-002,
    2.47382844e-039, 0.00000000e+000, 2.09150855e-200,
    0.00000000e+000, 0.00000000e+000]])

我认为这是错误的,因为输出是两个向量,其中一个代表 means 值,另一个代表 variances 值。使我对实现产生怀疑的模糊点是它 returns 返回 0.00000000e+000 大多数输出​​,因为它可以看到并且不需要真正可视化这些输出。顺便说一句,输入数据是 时间序列 数据。我已经检查了所有内容并进行了多次跟踪,但没有发现错误。

这是我的输入数据:

[25.31      , 24.31      , 24.12      , 43.46      , 41.48666667,
   41.48666667, 37.54      , 41.175     , 44.81      , 44.44571429,
   44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
   44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
   44.44571429, 44.44571429, 39.71      , 26.69      , 34.15      ,
   24.94      , 24.75      , 24.56      , 24.38      , 35.25      ,
   44.62      , 44.94      , 44.815     , 44.69      , 42.31      ,
   40.81      , 44.38      , 44.56      , 44.44      , 44.25      ,
   43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,
   40.75      , 32.31      , 36.08      , 30.135     , 24.19      ]

我想知道是否有一种优雅的方法可以通过 numpySciKit-learn 来实现它。任何帮助将不胜感激。

更新 以下是当前输出和预期输出:

正如我在评论中提到的,我看到的关键点是 means 初始化。按照 sklearn Gaussian Mixture 的默认实现,我没有随机初始化,而是切换到 KMeans。

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn')

eps=1e-8 

def PDF(data, means, variances):
    return 1/(np.sqrt(2 * np.pi * variances) + eps) * np.exp(-1/2 * (np.square(data - means) / (variances + eps)))

def EM_GMM(data, k=3, iterations=100, init_strategy='kmeans'):
    weights = np.ones((k, 1)) / k # shape=(k, 1)
    
    if init_strategy=='kmeans':
        from sklearn.cluster import KMeans
        
        km = KMeans(k).fit(data[:, None])
        means = km.cluster_centers_ # shape=(k, 1)
        
    else: # init_strategy=='random'
        means = np.random.choice(data, k)[:, np.newaxis] # shape=(k, 1)
    
    variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1)

    data = np.repeat(data[np.newaxis, :], k, 0) # shape=(k, n)

    for step in range(iterations):
        # Expectation step
        likelihood = PDF(data, means, np.sqrt(variances)) # shape=(k, n)

        # Maximization step
        b = likelihood * weights # shape=(k, n)
        b /= np.sum(b, axis=1)[:, np.newaxis] + eps

        # updage means, variances, and weights
        means = np.sum(b * data, axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps)
        variances = np.sum(b * np.square(data - means), axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps)
        weights = np.mean(b, axis=1)[:, np.newaxis]
        
    return means, variances

这似乎更一致地产生了所需的输出:

s = np.array([25.31      , 24.31      , 24.12      , 43.46      , 41.48666667,
              41.48666667, 37.54      , 41.175     , 44.81      , 44.44571429,
              44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
              44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
              44.44571429, 44.44571429, 39.71      , 26.69      , 34.15      ,
              24.94      , 24.75      , 24.56      , 24.38      , 35.25      ,
              44.62      , 44.94      , 44.815     , 44.69      , 42.31      ,
              40.81      , 44.38      , 44.56      , 44.44      , 44.25      ,
              43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,
              40.75      , 32.31      , 36.08      , 30.135     , 24.19      ])
k=3
n_iter=100

means, variances = EM_GMM(s, k, n_iter)
print(means,variances)
[[44.42596231]
 [24.509301  ]
 [35.4137508 ]] 
[[0.07568723]
 [0.10583743]
 [0.52125856]]

# Plotting the results
colors = ['green', 'red', 'blue', 'yellow']
bins = np.linspace(np.min(s)-2, np.max(s)+2, 100)

plt.figure(figsize=(10,7))
plt.xlabel('$x$')
plt.ylabel('pdf')

sns.scatterplot(s, [0.05] * len(s), color='navy', s=40, marker=2, label='Series data')

for i, (m, v) in enumerate(zip(means, variances)):
    sns.lineplot(bins, PDF(bins, m, v), color=colors[i], label=f'Cluster {i+1}')

plt.legend()
plt.plot()

最后我们可以看到纯随机初始化产生了不同的结果;让我们看看结果 means:

for _ in range(5):
    print(EM_GMM(s, k, n_iter, init_strategy='random')[0], '\n')

[[44.42596231]
 [44.42596231]
 [44.42596231]]

[[44.42596231]
 [24.509301  ]
 [30.1349997 ]]

[[44.42596231]
 [35.4137508 ]
 [44.42596231]]

[[44.42596231]
 [30.1349997 ]
 [44.42596231]]

[[44.42596231]
 [44.42596231]
 [44.42596231]]

可以看出这些结果有多么不同,在某些情况下,结果均值是恒定的,这意味着初始化选择了 3 个相似的值并且在迭代时没有太大变化。在 EM_GMM 中添加一些打印语句将澄清这一点。

# Expectation step
likelihood = PDF(data, means, np.sqrt(variances))
  • 为什么我们要超过 sqrtvariances? pdf 函数接受差异。所以这应该是 PDF(data, means, variances).

另一个问题,

# Maximization step
b = likelihood * weights # shape=(k, n)
b /= np.sum(b, axis=1)[:, np.newaxis] + eps
  • 上面第二行应该是b /= np.sum(b, axis=0)[:, np.newaxis] + eps

也在variances,

的初始化
variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1)
  • 为什么我们要随机初始化方差?我们有 datameans,为什么不像 vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1) 那样计算当前估计方差?

经过这些更改,这是我的实现,

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn')

eps=1e-8


def pdf(data, means, vars):
    denom = np.sqrt(2 * np.pi * vars) + eps
    numer = np.exp(-0.5 * np.square(data - means) / (vars + eps))
    return numer /denom


def em_gmm(data, k, n_iter, init_strategy='k_means'):
    weights = np.ones((k, 1), dtype=np.float32) / k
    if init_strategy == 'k_means':
        from sklearn.cluster import KMeans
        km = KMeans(k).fit(data[:, None])
        means = km.cluster_centers_
    else:
        means = np.random.choice(data, k)[:, np.newaxis]
    data = np.repeat(data[np.newaxis, :], k, 0)
    vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1)
    for step in range(n_iter):
        p = pdf(data, means, vars)
        b = p * weights
        denom = np.expand_dims(np.sum(b, axis=0), 0) + eps
        b = b / denom
        means_n = np.sum(b * data, axis=1)
        means_d = np.sum(b, axis=1) + eps
        means = np.expand_dims(means_n / means_d, -1)
        vars = np.sum(b * np.square(data - means), axis=1) / means_d
        vars = np.expand_dims(vars, -1)
        weights = np.expand_dims(np.mean(b, axis=1), -1)

    return means, vars


def main():
    s = np.array([25.31, 24.31, 24.12, 43.46, 41.48666667,
                  41.48666667, 37.54, 41.175, 44.81, 44.44571429,
                  44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
                  44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
                  44.44571429, 44.44571429, 39.71, 26.69, 34.15,
                  24.94, 24.75, 24.56, 24.38, 35.25,
                  44.62, 44.94, 44.815, 44.69, 42.31,
                  40.81, 44.38, 44.56, 44.44, 44.25,
                  43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,
                  40.75, 32.31, 36.08, 30.135, 24.19])
    k = 3
    n_iter = 100

    means, vars = em_gmm(s, k, n_iter)
    y = 0
    colors = ['green', 'red', 'blue', 'yellow']
    bins = np.linspace(np.min(s) - 2, np.max(s) + 2, 100)
    plt.figure(figsize=(10, 7))
    plt.xlabel('$x$')
    plt.ylabel('pdf')
    sns.scatterplot(s, [0.0] * len(s), color='navy', s=40, marker=2, label='Series data')
    for i, (m, v) in enumerate(zip(means, vars)):
        sns.lineplot(bins, pdf(bins, m, v), color=colors[i], label=f'Cluster {i + 1}')
    plt.legend()
    plt.plot()

    plt.show()
    pass

这是我的结果。