Jupyter Notebook: Input contains NaN, infinity or a value too large for dtype('float64')
I am trying to fit my data with my feature selection, but whenever I try I get this error:
Input contains NaN, infinity or a value too large for dtype('float64').
I'm not sure whether there is a fix, or whether I can take a different approach, but what I do know is that if I'm going to use feature selection, I have to apply it before my algorithm.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
from sklearn.model_selection import train_test_split
# In[2]:
import pandas as pd
# In[3]:
import matplotlib.pyplot as plt
# In[4]:
import numpy as np
# In[5]:
import seaborn as sns
import statsmodels.api as sm
import mglearn as mg
get_ipython().run_line_magic('matplotlib', 'inline')
# In[6]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
# In[7]:
first_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")
# In[8]:
print(first_file)
# In[9]:
second_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv")
third_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv")
fourth_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv")
fifth_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv")
sixth_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")
seventh_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv")
eighth_file = pd.read_csv(r"/Users/feliperivas/Downloads/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv")
# In[10]:
print(second_file)
# In[11]:
print(third_file)
# In[12]:
print(fourth_file)
# In[13]:
print(fifth_file)
# In[14]:
print(sixth_file)
# In[15]:
print(seventh_file)
# In[16]:
print(eighth_file)
# In[17]:
first_file = first_file.loc[:,~first_file.columns.duplicated()]
# In[18]:
print(first_file)
# In[19]:
df_list = [first_file, second_file, third_file, fourth_file, fifth_file, sixth_file, seventh_file, eighth_file]
# In[20]:
merged_data = pd.concat(df_list)
# In[21]:
print(merged_data)
# print(merged_data.shape)
# In[22]:
print(merged_data.shape)
# In[23]:
print(first_file.shape)
# In[24]:
print(second_file.shape)
# In[25]:
print(third_file.shape)
# In[26]:
print(fourth_file.shape)
# In[27]:
print(fifth_file.shape)
# In[28]:
print(sixth_file.shape)
# In[29]:
print(seventh_file.shape)
# In[30]:
print(eighth_file.shape)
# In[31]:
# 2830540 is the number we're trying to get
# In[32]:
# df = merged_data.dropna()
# In[33]:
print(merged_data.shape)
# In[34]:
merged_data.dropna(inplace=True)
# In[35]:
print(merged_data.shape)
# In[36]:
df = merged_data
# In[37]:
df.shape
# In[38]:
df.dropna(inplace=True)
# In[39]:
keys_num = len(df.keys())
# In[40]:
df.keys()
# In[41]:
df.head()
# In[42]:
df.keys()
# In[43]:
y = df[' Label']
# In[44]:
y
# In[45]:
X = df.drop(' Label',axis=1)
# In[46]:
X
# In[47]:
estimator = AdaBoostRegressor(n_estimators=100, random_state=0)
# In[48]:
selector = RFE(estimator, n_features_to_select=12, step=1)
# In[49]:
selector.fit(X, y)
# In[50]:
cid = AdaBoostRegressor(n_estimators=100, random_state=0)
# In[51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# In[52]:
cid.fit(X_train, y_train)
# In[ ]:
training_score = cid.score(X_train, y_train)
# In[ ]:
print("Training Score: {0}".format(training_score))
# In[ ]:
testing_score = cid.score(X_test, y_test)
# In[ ]:
print("Test Score: {0}".format(testing_score))
# In[ ]:
print(X_train.shape)
# In[ ]:
print(X_test.shape)
# In[ ]:
estimator = AdaBoostRegressor(n_estimators=100, random_state=0)
# In[ ]:
selector = RFE(estimator, n_features_to_select=5, step=1)
# In[ ]:
selector = selector.fit(X_train, y_train)
# In[ ]:
training_score = selector.score(X_train, y_train)
# In[ ]:
print("Training Score: {0}".format(training_score))
# In[ ]:
This is the ERROR!!!:
In[ ]: selector.fit(X, y)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input> in <module>
----> 1 selector.fit(X, y)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py in fit(self, X, y)
    182             The target values.
    183         """
--> 184         return self._fit(X, y)
    185
    186     def _fit(self, X, y, step_score=None):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py in _fit(self, X, y, step_score)
    191
    192         tags = self._get_tags()
--> 193         X, y = self._validate_data(
    194             X, y, accept_sparse="csc",
    195             ensure_min_features=2,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    431
    432             else:
--> 433                 X, y = check_X_y(X, y, **check_params)
    434             out = X, y
    435

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64
     65             # extra_args > 0

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
    812         raise ValueError("y cannot be None")
    813
--> 814     X = check_array(X, accept_sparse=accept_sparse,
    815                     accept_large_sparse=accept_large_sparse,
    816                     dtype=dtype, order=order, copy=copy,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63             return f(*args, **kwargs)
     64
     65             # extra_args > 0

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    661
    662     if force_all_finite:
--> 663         _assert_all_finite(array,
    664                            allow_nan=force_all_finite == 'allow-nan')
    665

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
    101             not allow_nan and not np.isfinite(X).all()):
    102         type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103         raise ValueError(
    104                 msg_err.format(type_err,

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
This may not be the answer you want to hear, but there is some truth to it.
When attempting almost any kind of training or data analysis, it is good practice to clean the data first. Those steps may include removing or handling NaN values, infinities, or unsuitable outliers.
There are multiple ways to do this, but in your case I suggest starting with the following (a short sketch follows this list):
- Remove rows with NaN values.
- Remove rows with infinite values.
- Shift all values so that they fit within the float64 range, or remove rows containing numbers outside the float64 range.
- Remove columns whose range of values is excessively large.
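For the first two bullets, a minimal pandas sketch could look like this (assuming df is your merged DataFrame and its feature columns are numeric):
import numpy as np
# Replace +/- infinity with NaN so a single dropna() handles both cases
df_clean = df.replace([np.inf, -np.inf], np.nan)
# Drop every row that still contains a NaN in any column
df_clean = df_clean.dropna()
# Compare shapes to see how many rows were removed
print(df.shape, '->', df_clean.shape)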
Here is a function I use regularly to inspect the data first, before cleaning.
import pandas as pd
from pandas import DataFrame

def calc_summary_for(feature_name: str, data: DataFrame) -> DataFrame:
    """
    Calculates the summary features listed in 'summary_feature_names'.

    Parameters
    ----------
    feature_name : str
        Name of the feature the summary is calculated for. Required.
    data : pandas.DataFrame object
        A DataFrame object containing a column named feature_name. Required.

    Returns : DataFrame object of the calculated summary features.
    """
    summary_feature_names = ['Feature Name', 'Row Count', 'Non-null Count', 'Null Count', 'Min', '25th', 'Mean',
                             '50th', '75th', 'Max', 'Std. Dev', 'Outlier Count Low', 'Outlier Count High']
    # Create the DataFrame to return and the list added to it via .loc below
    frame2return = pd.DataFrame(columns=summary_feature_names)
    list2add = []
    # Collect the summary statistics that have built-in functions
    list2add.append(feature_name)
    list2add.append(data.shape[0])
    list2add.append(data[feature_name].notnull().sum())
    list2add.append(data[feature_name].isnull().sum())
    list2add.append(data[feature_name].min())
    list2add.append(data[feature_name].quantile(q=0.25))
    list2add.append(data[feature_name].mean())
    list2add.append(data[feature_name].quantile(q=0.5))
    list2add.append(data[feature_name].quantile(q=0.75))
    list2add.append(data[feature_name].max())
    list2add.append(data[feature_name].std())
    # Save the quartiles for calculating the IQR
    Q1 = data[feature_name].quantile(q=0.25)
    Q3 = data[feature_name].quantile(q=0.75)
    IQR = Q3 - Q1
    # Anything beyond 1.5 * IQR from the quartiles counts as an outlier
    outerBoundLow = Q1 - (1.5 * IQR)
    outerBoundHigh = Q3 + (1.5 * IQR)
    # Count the outliers on each side of the bounds
    countLow = 0
    countHigh = 0
    countInBounds = 0
    for i in data[feature_name]:
        if i < outerBoundLow:
            countLow += 1
        elif i > outerBoundHigh:
            countHigh += 1
        else:
            countInBounds += 1
    list2add.append(countLow)
    list2add.append(countHigh)
    # Add the list as a row of the DataFrame and return it
    frame2return.loc[len(frame2return)] = list2add
    return frame2return
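Called once per column, this gives a quick per-feature health check. A hypothetical usage over every numeric column (the df name is assumed from your notebook, not part of the function) might be:
# Build one summary row per numeric feature and stack them
summaries = pd.concat(
    [calc_summary_for(col, df) for col in df.select_dtypes('number').columns],
    ignore_index=True)
print(summaries)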
I was able to fix the problem by replacing the infinite data with NaN and then dropping the NaN rows:
xy = X
xy[" Label"] = y_df
#xy = dataframe
# Replace infinite values with NaN
X.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop rows with NaN
X.dropna(inplace=True)
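One detail worth noting about this fix: dropping rows from X alone would leave y misaligned, which is why the label is attached to the frame first, so that features and target lose the same rows. A sketch that makes the round trip explicit (the *_clean names here are illustrative, not from the notebook) would be:
# Attach the label so features and target are cleaned together
xy = X.copy()
xy[" Label"] = y
# Replace +/- infinity with NaN, then drop incomplete rows
xy.replace([np.inf, -np.inf], np.nan, inplace=True)
xy.dropna(inplace=True)
# Split back into aligned features and target
y_clean = xy[" Label"]
X_clean = xy.drop(" Label", axis=1)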