如何获取一维数据的簇?
How to get cluster for 1D data?
我有一个如下所示的 csv 文件
page Page_Value
page1 12
page2 1
page3 2
page4 3
page5 10
page6 11
page7 13
page8 67
page9 70
# Convert the 'page' labels to numeric ids (kept only as an identifier;
# the ids carry no magnitude information and must NOT be clustered on).
labelEncoder = LabelEncoder()
labelEncoder.fit(dataset_PV['page'])
dataset_PV['page'] = labelEncoder.transform(dataset_PV['page'])

# Find a suitable number of clusters using the elbow method.
from sklearn.cluster import KMeans
from sklearn import preprocessing

# BUG FIX: cluster on Page_Value only. The original fit KMeans on the whole
# DataFrame, so the arbitrary encoded page id entered the distance computation
# and distorted the clusters.
x = dataset_PV[['Page_Value']].values

wcss = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(15, 8))
plt.plot(range(1, 10), wcss, marker='o')
plt.title('Elbow graph')
plt.xlabel('Number of clusters')
plt.ylabel('within-cluster sums of squares (WCSS)')
plt.show()

# Fit the final model on the 1-D Page_Value feature.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(x)
dataset_PV['clusters'] = clusters
我想根据页面值创建页面组合以获得最大页面值。在这里,我使用了 K-means,我已将页面变量转换为数字。我不确定我是否应该使用 k-means 或对 Page_value 变量进行排序然后对它们进行分组(不确定代码)。
输出如下:
Cluster1 = page2,page3,page4
Cluster2 = page5,page6,page7,page12
Cluster3 = page7,page8,page9
谢谢
您不需要先对页面进行排序。您是否尝试过 Opencv K-mean?我希望它有所帮助。
[https://docs.opencv.org/master/d1/d5c/tutorial_py_kmeans_opencv.html]
import numpy as np
import cv2 as cv

# Page names and their 1-D values to be clustered.
pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
values = np.asarray([12, 1, 2, 3, 10, 11, 13, 67, 70], dtype=np.float32)
values = values.reshape(-1, 1)  # cv.kmeans expects an (N, 1) float32 array

# Stop after 10 iterations or once centers move less than epsilon = 1.0.
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 10, 1.0)
flags = cv.KMEANS_RANDOM_CENTERS
K = 3

# Run KMeans with 10 random restarts; keep the best (most compact) result.
compactness, labels, centers = cv.kmeans(values, K, None, criteria, 10, flags)
labels = labels.flatten()

# Group page names by their assigned cluster label.
res = {i: [] for i in range(K)}
for page, lab in zip(pages, labels):
    res[lab].append(page)
print(res)
另一个使用 Sklearn 的解决方案:
from sklearn.cluster import KMeans
import numpy as np

# Page names and their 1-D values to be clustered.
pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
values = np.asarray([12, 1, 2, 3, 10, 11, 13, 67, 70], dtype=np.float32).reshape(-1, 1)

K = 3
km = KMeans(n_clusters=K)
km.fit(values)
labels = km.predict(values).flatten()

# Group page names by their assigned cluster label.
res = {i: [] for i in range(K)}
for page, lab in zip(pages, labels):
    res[lab].append(page)
print(res)
您已经完成了大部分工作,但是页面名称不应包含在 KMeans 的计算中——页面编号只是任意的标识符,没有任何数值意义,也就是说不需要 LabelEncoder。
太长不看(tl;dr):
简短的答案可以参考 @Sơn Ninh 的回答。
如果你想可视化我的回答可能对你有帮助。
我给你写了一个函数(label_encoding),你可以用它来获取 id 的映射,有助于画图。
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import textwrap
from io import StringIO
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from typing import Union, NamedTuple
from collections import defaultdict
def main():
    """Cluster pages by Page_Value with KMeans, print the groups, and plot them."""
    df = import_file()
    features = df.Page_Value.values.reshape(-1, 1)

    # Flip to True to inspect the elbow graph before committing to a count.
    show_elbow = False
    if show_elbow:
        elbow_find_n(features)
    n_clusters = 3

    # Fit the model on the 1-D Page_Value feature only.
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    clusters = kmeans.fit_predict(features)

    # Replace page names with numeric ids, keeping a mapping back to the names.
    df.loc[:, ['page']], mapping_table = label_encoding(df.loc[:, ['page']])
    df = rebuild_df(df, clusters, mapping_table)  # columns: page-id, page, clusters, Page_Value
    print(df)

    grouped = defaultdict(list)
    for cur_set in set(df['clusters']):  # Output the format the question asked for.
        members = df.page[df.clusters == cur_set]
        print(f'Cluster{cur_set} = {",".join(members)}')
        grouped[cur_set].extend(members.to_list())
    print(dict(grouped))  # converted to a plain dict only for a cleaner repr

    visualizing_the_clusters(kmeans, df)
class RGBColor(NamedTuple):
    """Palette of distinct hex colors, one per cluster.

    NOTE(review): these are class-level constants, not NamedTuple fields, so
    ``RGBColor()`` builds an empty tuple that merely carries the palette.
    """

    BLACK = '#000000'
    # AZURE = '#F0FFFF'
    OLIVE = '#808000'
    PINK = '#FFC0CB'
    # WHITE omitted: invisible on the default white background.
    GOLD = '#FFD700'  # BUG FIX: leading '#' was missing ('FFD700' is not a valid color spec).
    BLUE = '#0000FF'
    GREEN = '#00FF00'
    RED = '#FF0000'
    YELLOW = '#FFFF00'
    ORANGE = '#FFA500'
    PURPLE = '#FF00FF'

    def get_tuple(self):
        """Yield the hex color values, alphabetically by attribute name.

        BUG FIX: the original yielded the attribute NAMES (e.g. 'GOLD'); it
        only worked because each name happens to be a valid CSS color name.
        Yielding the hex values removes that coincidence.
        """
        return (getattr(self, name) for name in dir(self)
                if not name.startswith('_') and name.isupper())
def label_encoding(label_col: Union[pd.DataFrame, np.ndarray], is_need_mapping_table=True) -> tuple:
    """Encode a label column as integer ids.

    USAGE:
        df.loc[:, ['col_xxx', ]], mapping_table = label_encoding(df.loc[:, ['col_xxx']])

    Returns:
        (encoded, mapping) where mapping maps id -> original label
        (empty dict when is_need_mapping_table is False).
    """
    is_frame = isinstance(label_col, pd.DataFrame)
    raw = label_col.values.ravel() if is_frame else label_col
    encoded = LabelEncoder().fit_transform(raw)

    mapping = dict()
    if is_need_mapping_table:
        originals = [row[0] for row in label_col.values] if is_frame else [e for e in label_col]
        mapping = dict(zip(encoded, originals))

    if is_frame:
        # Reshape to a single column so it can be assigned back via df.loc.
        encoded = encoded.reshape(-1, 1)
    return encoded, mapping
def import_file() -> pd.DataFrame:
    """Build the sample page/Page_Value table as if it were read from a CSV file."""
    csv_text = textwrap.dedent(  # strips the common leading whitespace
        """\
        page,Page_Value
        page1,12
        page2,1
        page3,2
        page4,3
        page5,10
        page6,11
        page7,13
        page8,67
        page9,70
        """
    )
    return pd.read_csv(StringIO(csv_text), header=0)
def elbow_find_n(x):
    """Plot WCSS (inertia) for k = 1..9 so a suitable cluster count can be read off."""
    cluster_range = range(1, 10)
    wcss = [
        KMeans(n_clusters=k, init='k-means++', random_state=0).fit(x).inertia_
        for k in cluster_range
    ]
    plt.figure(figsize=(15, 8))
    plt.plot(cluster_range, wcss, marker='o')
    plt.title('Elbow graph')
    plt.xlabel('Number of Clusters')
    plt.ylabel('within-cluster sums of squares WCSS')
    plt.show()
def rebuild_df(df, clusters, mapping_table):
    """Attach cluster labels and restore the readable page names.

    Renames the encoded 'page' column to 'page-id', recreates 'page' via
    mapping_table (id -> name), and reorders the columns.
    NOTE: mutates the passed-in frame (in-place rename and new columns).
    """
    df['clusters'] = clusters
    df.rename(columns={'page': 'page-id'}, inplace=True)
    df['page'] = df.apply(lambda row: mapping_table[row['page-id']], axis=1)
    return df.reindex(['page-id', 'page', 'clusters', 'Page_Value', ], axis=1)
def visualizing_the_clusters(kmeans: KMeans, df: pd.DataFrame):
    """Scatter-plot each cluster in its own color, plus the centroids at x = -1."""
    palette = RGBColor()
    for cur_set, color in zip(set(df.clusters), palette.get_tuple()):
        members = df[df.clusters == cur_set]
        plt.scatter(members['page-id'], members['Page_Value'],
                    s=2, c=color, label=f'Cluster{cur_set}: {kmeans.cluster_centers_[cur_set][0]}')
    n_cluster = len(kmeans.cluster_centers_)
    # Draw the centroids off to the left (x = -1) so they don't overlap real pages.
    plt.scatter(np.ones(n_cluster) * (-1), kmeans.cluster_centers_[:, 0],
                s=(50 / n_cluster), c='purple', label='Centroids')
    plt.title('Page and Page_Value')
    plt.xlabel('Page_ID')
    plt.ylabel('Page_Value')
    plt.legend(loc=(1.05, 0.5))  # legend outside the axes
    plt.tight_layout()
    plt.show()
if __name__ == '__main__':
    # Entry point: run the demo only when executed directly, not on import.
    main()
输出
page-id page clusters Page_Value
0 0 page1 0 12
1 1 page2 2 1
2 2 page3 2 2
3 3 page4 2 3
4 4 page5 0 10
5 5 page6 0 11
6 6 page7 0 13
7 7 page8 1 67
8 8 page9 1 70
Cluster0 = page1,page5,page6,page7
Cluster1 = page8,page9
Cluster2 = page2,page3,page4
{0: ['page1', 'page5', 'page6', 'page7'], 1: ['page8', 'page9'], 2: ['page2', 'page3', 'page4']}
我有一个如下所示的 csv 文件
page Page_Value
page1 12
page2 1
page3 2
page4 3
page5 10
page6 11
page7 13
page8 67
page9 70
# Convert the 'page' labels to numeric ids (kept only as an identifier;
# the ids carry no magnitude information and must NOT be clustered on).
labelEncoder = LabelEncoder()
labelEncoder.fit(dataset_PV['page'])
dataset_PV['page'] = labelEncoder.transform(dataset_PV['page'])

# Find a suitable number of clusters using the elbow method.
from sklearn.cluster import KMeans
from sklearn import preprocessing

# BUG FIX: cluster on Page_Value only. The original fit KMeans on the whole
# DataFrame, so the arbitrary encoded page id entered the distance computation
# and distorted the clusters.
x = dataset_PV[['Page_Value']].values

wcss = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(15, 8))
plt.plot(range(1, 10), wcss, marker='o')
plt.title('Elbow graph')
plt.xlabel('Number of clusters')
plt.ylabel('within-cluster sums of squares (WCSS)')
plt.show()

# Fit the final model on the 1-D Page_Value feature.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(x)
dataset_PV['clusters'] = clusters
我想根据页面值创建页面组合以获得最大页面值。在这里,我使用了 K-means,我已将页面变量转换为数字。我不确定我是否应该使用 k-means 或对 Page_value 变量进行排序然后对它们进行分组(不确定代码)。
输出如下:
Cluster1 = page2,page3,page4
Cluster2 = page5,page6,page7,page12
Cluster3 = page7,page8,page9
谢谢
您不需要先对页面进行排序。您是否尝试过 Opencv K-mean?我希望它有所帮助。 [https://docs.opencv.org/master/d1/d5c/tutorial_py_kmeans_opencv.html]
import numpy as np
import cv2 as cv

# Page names and their 1-D values to be clustered.
pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
values = np.asarray([12, 1, 2, 3, 10, 11, 13, 67, 70], dtype=np.float32)
values = values.reshape(-1, 1)  # cv.kmeans expects an (N, 1) float32 array

# Stop after 10 iterations or once centers move less than epsilon = 1.0.
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 10, 1.0)
flags = cv.KMEANS_RANDOM_CENTERS
K = 3

# Run KMeans with 10 random restarts; keep the best (most compact) result.
compactness, labels, centers = cv.kmeans(values, K, None, criteria, 10, flags)
labels = labels.flatten()

# Group page names by their assigned cluster label.
res = {i: [] for i in range(K)}
for page, lab in zip(pages, labels):
    res[lab].append(page)
print(res)
另一个使用 Sklearn 的解决方案:
from sklearn.cluster import KMeans
import numpy as np

# Page names and their 1-D values to be clustered.
pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
values = np.asarray([12, 1, 2, 3, 10, 11, 13, 67, 70], dtype=np.float32).reshape(-1, 1)

K = 3
km = KMeans(n_clusters=K)
km.fit(values)
labels = km.predict(values).flatten()

# Group page names by their assigned cluster label.
res = {i: [] for i in range(K)}
for page, lab in zip(pages, labels):
    res[lab].append(page)
print(res)
您已经完成了大部分工作,但是页面名称不应包含在 KMeans 的计算中——页面编号只是任意的标识符,没有任何数值意义,也就是说不需要 LabelEncoder。
太长不看(tl;dr):
简短的答案可以参考 @Sơn Ninh 的回答。
如果你想可视化我的回答可能对你有帮助。
我给你写了一个函数(label_encoding),你可以用它来获取 id 的映射,有助于画图。
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import textwrap
from io import StringIO
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from typing import Union, NamedTuple
from collections import defaultdict
def main():
    """Cluster pages by Page_Value with KMeans, print the groups, and plot them."""
    df = import_file()
    features = df.Page_Value.values.reshape(-1, 1)

    # Flip to True to inspect the elbow graph before committing to a count.
    show_elbow = False
    if show_elbow:
        elbow_find_n(features)
    n_clusters = 3

    # Fit the model on the 1-D Page_Value feature only.
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    clusters = kmeans.fit_predict(features)

    # Replace page names with numeric ids, keeping a mapping back to the names.
    df.loc[:, ['page']], mapping_table = label_encoding(df.loc[:, ['page']])
    df = rebuild_df(df, clusters, mapping_table)  # columns: page-id, page, clusters, Page_Value
    print(df)

    grouped = defaultdict(list)
    for cur_set in set(df['clusters']):  # Output the format the question asked for.
        members = df.page[df.clusters == cur_set]
        print(f'Cluster{cur_set} = {",".join(members)}')
        grouped[cur_set].extend(members.to_list())
    print(dict(grouped))  # converted to a plain dict only for a cleaner repr

    visualizing_the_clusters(kmeans, df)
class RGBColor(NamedTuple):
    """Palette of distinct hex colors, one per cluster.

    NOTE(review): these are class-level constants, not NamedTuple fields, so
    ``RGBColor()`` builds an empty tuple that merely carries the palette.
    """

    BLACK = '#000000'
    # AZURE = '#F0FFFF'
    OLIVE = '#808000'
    PINK = '#FFC0CB'
    # WHITE omitted: invisible on the default white background.
    GOLD = '#FFD700'  # BUG FIX: leading '#' was missing ('FFD700' is not a valid color spec).
    BLUE = '#0000FF'
    GREEN = '#00FF00'
    RED = '#FF0000'
    YELLOW = '#FFFF00'
    ORANGE = '#FFA500'
    PURPLE = '#FF00FF'

    def get_tuple(self):
        """Yield the hex color values, alphabetically by attribute name.

        BUG FIX: the original yielded the attribute NAMES (e.g. 'GOLD'); it
        only worked because each name happens to be a valid CSS color name.
        Yielding the hex values removes that coincidence.
        """
        return (getattr(self, name) for name in dir(self)
                if not name.startswith('_') and name.isupper())
def label_encoding(label_col: Union[pd.DataFrame, np.ndarray], is_need_mapping_table=True) -> tuple:
    """Encode a label column as integer ids.

    USAGE:
        df.loc[:, ['col_xxx', ]], mapping_table = label_encoding(df.loc[:, ['col_xxx']])

    Returns:
        (encoded, mapping) where mapping maps id -> original label
        (empty dict when is_need_mapping_table is False).
    """
    is_frame = isinstance(label_col, pd.DataFrame)
    raw = label_col.values.ravel() if is_frame else label_col
    encoded = LabelEncoder().fit_transform(raw)

    mapping = dict()
    if is_need_mapping_table:
        originals = [row[0] for row in label_col.values] if is_frame else [e for e in label_col]
        mapping = dict(zip(encoded, originals))

    if is_frame:
        # Reshape to a single column so it can be assigned back via df.loc.
        encoded = encoded.reshape(-1, 1)
    return encoded, mapping
def import_file() -> pd.DataFrame:
    """Build the sample page/Page_Value table as if it were read from a CSV file."""
    csv_text = textwrap.dedent(  # strips the common leading whitespace
        """\
        page,Page_Value
        page1,12
        page2,1
        page3,2
        page4,3
        page5,10
        page6,11
        page7,13
        page8,67
        page9,70
        """
    )
    return pd.read_csv(StringIO(csv_text), header=0)
def elbow_find_n(x):
    """Plot WCSS (inertia) for k = 1..9 so a suitable cluster count can be read off."""
    cluster_range = range(1, 10)
    wcss = [
        KMeans(n_clusters=k, init='k-means++', random_state=0).fit(x).inertia_
        for k in cluster_range
    ]
    plt.figure(figsize=(15, 8))
    plt.plot(cluster_range, wcss, marker='o')
    plt.title('Elbow graph')
    plt.xlabel('Number of Clusters')
    plt.ylabel('within-cluster sums of squares WCSS')
    plt.show()
def rebuild_df(df, clusters, mapping_table):
    """Attach cluster labels and restore the readable page names.

    Renames the encoded 'page' column to 'page-id', recreates 'page' via
    mapping_table (id -> name), and reorders the columns.
    NOTE: mutates the passed-in frame (in-place rename and new columns).
    """
    df['clusters'] = clusters
    df.rename(columns={'page': 'page-id'}, inplace=True)
    df['page'] = df.apply(lambda row: mapping_table[row['page-id']], axis=1)
    return df.reindex(['page-id', 'page', 'clusters', 'Page_Value', ], axis=1)
def visualizing_the_clusters(kmeans: KMeans, df: pd.DataFrame):
    """Scatter-plot each cluster in its own color, plus the centroids at x = -1."""
    palette = RGBColor()
    for cur_set, color in zip(set(df.clusters), palette.get_tuple()):
        members = df[df.clusters == cur_set]
        plt.scatter(members['page-id'], members['Page_Value'],
                    s=2, c=color, label=f'Cluster{cur_set}: {kmeans.cluster_centers_[cur_set][0]}')
    n_cluster = len(kmeans.cluster_centers_)
    # Draw the centroids off to the left (x = -1) so they don't overlap real pages.
    plt.scatter(np.ones(n_cluster) * (-1), kmeans.cluster_centers_[:, 0],
                s=(50 / n_cluster), c='purple', label='Centroids')
    plt.title('Page and Page_Value')
    plt.xlabel('Page_ID')
    plt.ylabel('Page_Value')
    plt.legend(loc=(1.05, 0.5))  # legend outside the axes
    plt.tight_layout()
    plt.show()
if __name__ == '__main__':
    # Entry point: run the demo only when executed directly, not on import.
    main()
输出
page-id page clusters Page_Value
0 0 page1 0 12
1 1 page2 2 1
2 2 page3 2 2
3 3 page4 2 3
4 4 page5 0 10
5 5 page6 0 11
6 6 page7 0 13
7 7 page8 1 67
8 8 page9 1 70
Cluster0 = page1,page5,page6,page7
Cluster1 = page8,page9
Cluster2 = page2,page3,page4
{0: ['page1', 'page5', 'page6', 'page7'], 1: ['page8', 'page9'], 2: ['page2', 'page3', 'page4']}