使用 k-means,我得到了一个错误;具有 0 个特征的数组
Using k-means, I got an error: array with 0 feature(s)
我正在尝试使用 matplotlib 和 k-means 对我的 csv 数据进行聚类。
我的 csv 数据是关于能源消耗的。
https://github.com/camenergydatalab/EnergyDataSimulationChallenge/blob/master/challenge2/data/total_watt.csv
我想将每天的值分为 3 组:低、中和高能耗。
这是我的代码。
# First attempt from the question: cluster daily energy consumption into
# three groups with KMeans and plot the result. This version fails with
# "Found array with 0 feature(s) (shape=(2, 0))" -- see the notes below.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import pandas as pd
from sklearn.cluster import KMeans
MY_FILE='total_watt.csv'
date = []
consumption = []
# Load the CSV with its first column parsed as dates and used as the index,
# then aggregate the readings into one summed value per day.
df = pd.read_csv(MY_FILE, parse_dates=[0], index_col=[0])
df = df.resample('1D', how='sum')
# NOTE: iterating over a DataFrame walks its columns, not its rows, so
# nothing is appended here and both lists stay empty.
for row in df:
    if len(row) ==2 :
        date.append(row[0])
        consumption.append(row[1])
import datetime
for x in range(len(date)):
    date[x]=datetime.datetime.strptime(date[x], '%Y-%m-%d %H:%M:%S')
# Two empty lists give an array of shape (2, 0); this is the input that
# kmeans.fit rejects below.
X = np.array([date, consumption])
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
print(centroids)
print(labels)
colors = ["b.","g.","r."]
# Plot every sample in the colour of its cluster, then mark the centroids.
for i in range(len(X)):
    print("coordinate:",X[i], "label:", labels[i])
    plt.plot(X[i][0], X[i][1], colors[labels[i]], markersize = 10)
plt.scatter(centroids[:, 0],centroids[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)
plt.show()
但是当我执行这段代码时,出现了如下错误;
(DataVizProj)Soma-Suzuki:Soma Suzuki$ python 4.clusters.py
Traceback (most recent call last):
File "4.clusters.py", line 31, in <module>
kmeans.fit(X)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/cluster/k_means_.py", line 785, in fit
X = self._check_fit_data(X)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/cluster/k_means_.py", line 755, in _check_fit_data
X = check_array(X, accept_sparse='csr', dtype=np.float64)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/utils/validation.py", line 367, in check_array
% (n_features, shape_repr, ensure_min_features))
ValueError: Found array with 0 feature(s) (shape=(2, 0)) while a minimum of 1 is required.
如何正确聚类我的 csv 数据??
编辑---------------------------------------- --------
这是我的新代码。谢谢!
# Second attempt from the question: the dates and consumption values are now
# taken directly from the DataFrame. This version fails with
# "TypeError: float() argument must be a string or a number" -- see below.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import pandas as pd
from sklearn.cluster import KMeans
MY_FILE='total_watt.csv'
date = []
consumption = []
# Load the CSV with its first column parsed as dates and used as the index,
# aggregate to one summed value per day, and drop days without data.
df = pd.read_csv(MY_FILE, parse_dates=[0], index_col=[0])
df = df.resample('1D', how='sum')
df = df.dropna()
date = df.index.tolist()
consumption = df[df.columns[0]].values
# NOTE: `date` still holds the parsed date objects from the index, which
# kmeans.fit cannot convert to float64; that conversion raises the TypeError.
X = np.array([date, consumption])
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
print(centroids)
print(labels)
colors = ["b.","g.","r."]
# Plot every sample in the colour of its cluster, then mark the centroids.
for i in range(len(X)):
    print("coordinate:",X[i], "label:", labels[i])
    plt.plot(X[i][0], X[i][1], colors[labels[i]], markersize = 10)
plt.scatter(centroids[:, 0],centroids[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)
plt.show()
和新错误...
(DataVizProj)Soma-Suzuki:Soma Suzuki$ python 4.clusters.py
Traceback (most recent call last):
File "4.clusters.py", line 26, in <module>
kmeans.fit(X)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/cluster/k_means_.py", line 785, in fit
X = self._check_fit_data(X)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/cluster/k_means_.py", line 755, in _check_fit_data
X = check_array(X, accept_sparse='csr', dtype=np.float64)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/utils/validation.py", line 344, in check_array
array = np.array(array, dtype=dtype, order=order, copy=copy)
TypeError: float() argument must be a string or a number
EDITED2----------------------------------------
谢谢建勋!!
我终于成功聚类了我的 csv 数据!!
非常感谢!!
# Working version: cluster daily energy consumption into three groups with
# KMeans, using a numeric encoding of the date as the first feature.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import pandas as pd
from sklearn.cluster import KMeans
MY_FILE='total_watt.csv'
date = []
consumption = []
# Load the CSV with its first column parsed as dates and used as the index,
# aggregate to one summed value per day, and drop days without data.
# NOTE: `how=` belongs to the pandas release used in this thread; newer
# pandas releases spell the same aggregation as df.resample('1D').sum().
df = pd.read_csv(MY_FILE, parse_dates=[0], index_col=[0])
df = df.resample('1D', how='sum')
df = df.dropna()
# Turn each date into a 'YYYY-MM-DD' string and encode the strings as
# integer labels so that they can be used as a numeric feature.
date = df.index.tolist()
date = [x.strftime('%Y-%m-%d') for x in date]
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
date_numeric = encoder.fit_transform(date)
consumption = df[df.columns[0]].values
# Transpose so that each row is one day and each column is one feature.
X = np.array([date_numeric, consumption]).T
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
print(centroids)
print(labels)
colors = ["b.","r.","g."]
# Plot every sample in the colour of its cluster, then mark the centroids.
for i in range(len(X)):
    print("coordinate:",X[i], "label:", labels[i])
    plt.plot(X[i][0], X[i][1], colors[labels[i]], markersize = 10)
plt.scatter(centroids[:, 0],centroids[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)
plt.show()
但是正如你所看到的,虽然我们设置得当,但 x 轴并没有反映时间....
第一个问题:
# Excerpt from the question, quoted to show the problem: this loop is
# meant to collect one (date, consumption) pair per row of df.
for row in df:
    if len(row) ==2 :
        date.append(row[0])
        consumption.append(row[1])
这会给你意想不到的空列表 date
和 consumption
因为 for row in df
实际上遍历的是列而不是行,这正是你看到"具有 0 个特征"这一错误消息的原因。
此外,我看到 consumption 中有两个 NaN
,所以你需要 df = df.dropna()
(或对这些缺失值进行插补),因为 sklearn
无法处理 NaN
值。
要从数据框中获取数据,您可以这样写
# The first column holds the consumption values; the index holds the dates.
consumption = df[df.columns[0]].values
date = df.index.tolist()
接下来,您已经在 pd.read_csv
中解析了日期,因此您代码中的以下部分根本不起作用。
# Excerpt from the question: re-parses each entry of `date` from a
# '%Y-%m-%d %H:%M:%S' string into a datetime object.
import datetime
for x in range(len(date)):
    date[x]=datetime.datetime.strptime(date[x], '%Y-%m-%d %H:%M:%S')
最后,仅将带有 consumption
的原始 date
送入 KMeans
不会产生太多有用的结果。您应该考虑将 date
转换为数值数据,例如,星期几的虚拟变量。
使用LabelEncoder
:
from sklearn.preprocessing import LabelEncoder
# Render every index entry as a 'YYYY-MM-DD' string.
date = [stamp.strftime('%Y-%m-%d') for stamp in df.index.tolist()]
# Encode the date strings as integer labels.
encoder = LabelEncoder()
date_numeric = encoder.fit_transform(date)
# Feed date_numeric together with consumption into KMeans.
# The transpose is required: sklearn treats each column as a feature.
X = np.transpose(np.array([date_numeric, consumption]))
对于您的绘图问题:
# Plot the clustered samples and label the x axis with the original dates.
fig, ax = plt.subplots(figsize=(10,8))
colors = ["b.","r.","g."]
# Print and draw each sample in the colour of its cluster; the encoded date
# in column 0 is mapped back to its date string for the printout.
for i in range(len(X)):
    print("coordinate:",encoder.inverse_transform(X[i,0].astype(int)), X[i,1], "label:", labels[i])
    ax.plot(X[i][0], X[i][1], colors[labels[i]], markersize = 10)
ax.scatter(centroids[:, 0],centroids[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)
# Put a tick at every fifth sample and label it with the decoded date string.
a = np.arange(0, len(X), 5)
ax.set_xticks(a)
ax.set_xticklabels(encoder.inverse_transform(a.astype(int)))
我正在尝试使用 matplotlib 和 k-means 对我的 csv 数据进行聚类。
我的 csv 数据是关于能源消耗的。 https://github.com/camenergydatalab/EnergyDataSimulationChallenge/blob/master/challenge2/data/total_watt.csv
我想将每天的值分为 3 组:低、中和高能耗。
这是我的代码。
# First attempt from the question: cluster daily energy consumption into
# three groups with KMeans and plot the result. This version fails with
# "Found array with 0 feature(s) (shape=(2, 0))" -- see the notes below.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import pandas as pd
from sklearn.cluster import KMeans
MY_FILE='total_watt.csv'
date = []
consumption = []
# Load the CSV with its first column parsed as dates and used as the index,
# then aggregate the readings into one summed value per day.
df = pd.read_csv(MY_FILE, parse_dates=[0], index_col=[0])
df = df.resample('1D', how='sum')
# NOTE: iterating over a DataFrame walks its columns, not its rows, so
# nothing is appended here and both lists stay empty.
for row in df:
    if len(row) ==2 :
        date.append(row[0])
        consumption.append(row[1])
import datetime
for x in range(len(date)):
    date[x]=datetime.datetime.strptime(date[x], '%Y-%m-%d %H:%M:%S')
# Two empty lists give an array of shape (2, 0); this is the input that
# kmeans.fit rejects below.
X = np.array([date, consumption])
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
print(centroids)
print(labels)
colors = ["b.","g.","r."]
# Plot every sample in the colour of its cluster, then mark the centroids.
for i in range(len(X)):
    print("coordinate:",X[i], "label:", labels[i])
    plt.plot(X[i][0], X[i][1], colors[labels[i]], markersize = 10)
plt.scatter(centroids[:, 0],centroids[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)
plt.show()
但是当我执行这段代码时,出现了如下错误;
(DataVizProj)Soma-Suzuki:Soma Suzuki$ python 4.clusters.py
Traceback (most recent call last):
File "4.clusters.py", line 31, in <module>
kmeans.fit(X)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/cluster/k_means_.py", line 785, in fit
X = self._check_fit_data(X)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/cluster/k_means_.py", line 755, in _check_fit_data
X = check_array(X, accept_sparse='csr', dtype=np.float64)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/utils/validation.py", line 367, in check_array
% (n_features, shape_repr, ensure_min_features))
ValueError: Found array with 0 feature(s) (shape=(2, 0)) while a minimum of 1 is required.
如何正确聚类我的 csv 数据??
编辑---------------------------------------- --------
这是我的新代码。谢谢!
# Second attempt from the question: the dates and consumption values are now
# taken directly from the DataFrame. This version fails with
# "TypeError: float() argument must be a string or a number" -- see below.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import pandas as pd
from sklearn.cluster import KMeans
MY_FILE='total_watt.csv'
date = []
consumption = []
# Load the CSV with its first column parsed as dates and used as the index,
# aggregate to one summed value per day, and drop days without data.
df = pd.read_csv(MY_FILE, parse_dates=[0], index_col=[0])
df = df.resample('1D', how='sum')
df = df.dropna()
date = df.index.tolist()
consumption = df[df.columns[0]].values
# NOTE: `date` still holds the parsed date objects from the index, which
# kmeans.fit cannot convert to float64; that conversion raises the TypeError.
X = np.array([date, consumption])
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
print(centroids)
print(labels)
colors = ["b.","g.","r."]
# Plot every sample in the colour of its cluster, then mark the centroids.
for i in range(len(X)):
    print("coordinate:",X[i], "label:", labels[i])
    plt.plot(X[i][0], X[i][1], colors[labels[i]], markersize = 10)
plt.scatter(centroids[:, 0],centroids[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)
plt.show()
和新错误...
(DataVizProj)Soma-Suzuki:Soma Suzuki$ python 4.clusters.py
Traceback (most recent call last):
File "4.clusters.py", line 26, in <module>
kmeans.fit(X)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/cluster/k_means_.py", line 785, in fit
X = self._check_fit_data(X)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/cluster/k_means_.py", line 755, in _check_fit_data
X = check_array(X, accept_sparse='csr', dtype=np.float64)
File "/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/sklearn/utils/validation.py", line 344, in check_array
array = np.array(array, dtype=dtype, order=order, copy=copy)
TypeError: float() argument must be a string or a number
EDITED2----------------------------------------
谢谢建勋!!
我终于成功聚类了我的 csv 数据!! 非常感谢!!
# Working version: cluster daily energy consumption into three groups with
# KMeans, using a numeric encoding of the date as the first feature.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import pandas as pd
from sklearn.cluster import KMeans
MY_FILE='total_watt.csv'
date = []
consumption = []
# Load the CSV with its first column parsed as dates and used as the index,
# aggregate to one summed value per day, and drop days without data.
# NOTE: `how=` belongs to the pandas release used in this thread; newer
# pandas releases spell the same aggregation as df.resample('1D').sum().
df = pd.read_csv(MY_FILE, parse_dates=[0], index_col=[0])
df = df.resample('1D', how='sum')
df = df.dropna()
# Turn each date into a 'YYYY-MM-DD' string and encode the strings as
# integer labels so that they can be used as a numeric feature.
date = df.index.tolist()
date = [x.strftime('%Y-%m-%d') for x in date]
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
date_numeric = encoder.fit_transform(date)
consumption = df[df.columns[0]].values
# Transpose so that each row is one day and each column is one feature.
X = np.array([date_numeric, consumption]).T
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
print(centroids)
print(labels)
colors = ["b.","r.","g."]
# Plot every sample in the colour of its cluster, then mark the centroids.
for i in range(len(X)):
    print("coordinate:",X[i], "label:", labels[i])
    plt.plot(X[i][0], X[i][1], colors[labels[i]], markersize = 10)
plt.scatter(centroids[:, 0],centroids[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)
plt.show()
第一个问题:
# Excerpt from the question, quoted to show the problem: this loop is
# meant to collect one (date, consumption) pair per row of df.
for row in df:
    if len(row) ==2 :
        date.append(row[0])
        consumption.append(row[1])
这会给你意想不到的空列表 date
和 consumption
因为 for row in df
实际上遍历的是列而不是行,这正是你看到"具有 0 个特征"这一错误消息的原因。
此外,我看到 consumption 中有两个 NaN
,所以你需要 df = df.dropna()
(或对这些缺失值进行插补),因为 sklearn
无法处理 NaN
值。
要从数据框中获取数据,您可以这样写
# The first column holds the consumption values; the index holds the dates.
consumption = df[df.columns[0]].values
date = df.index.tolist()
接下来,您已经在 pd.read_csv
中解析了日期,因此您代码中的以下部分根本不起作用。
# Excerpt from the question: re-parses each entry of `date` from a
# '%Y-%m-%d %H:%M:%S' string into a datetime object.
import datetime
for x in range(len(date)):
    date[x]=datetime.datetime.strptime(date[x], '%Y-%m-%d %H:%M:%S')
最后,仅将带有 consumption
的原始 date
送入 KMeans
不会产生太多有用的结果。您应该考虑将 date
转换为数值数据,例如,星期几的虚拟变量。
使用LabelEncoder
:
from sklearn.preprocessing import LabelEncoder
# Render every index entry as a 'YYYY-MM-DD' string.
date = [stamp.strftime('%Y-%m-%d') for stamp in df.index.tolist()]
# Encode the date strings as integer labels.
encoder = LabelEncoder()
date_numeric = encoder.fit_transform(date)
# Feed date_numeric together with consumption into KMeans.
# The transpose is required: sklearn treats each column as a feature.
X = np.transpose(np.array([date_numeric, consumption]))
对于您的绘图问题:
# Plot the clustered samples and label the x axis with the original dates.
fig, ax = plt.subplots(figsize=(10,8))
colors = ["b.","r.","g."]
# Print and draw each sample in the colour of its cluster; the encoded date
# in column 0 is mapped back to its date string for the printout.
for i in range(len(X)):
    print("coordinate:",encoder.inverse_transform(X[i,0].astype(int)), X[i,1], "label:", labels[i])
    ax.plot(X[i][0], X[i][1], colors[labels[i]], markersize = 10)
ax.scatter(centroids[:, 0],centroids[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)
# Put a tick at every fifth sample and label it with the decoded date string.
a = np.arange(0, len(X), 5)
ax.set_xticks(a)
ax.set_xticklabels(encoder.inverse_transform(a.astype(int)))