自定义转换器单独工作,但在将它们组合到一个管道中时会发生故障
Custom Transformers work individually but breaks down when combining them into one Pipeline
我需要一些调试帮助。我正在使用 scikit-learn 来处理一些数据并训练 ML 模型来预测房价。我想出了 2 个自定义转换器,可以处理不需要的功能,还可以结合一些功能来创建新功能。当我调用它们时,这两个自定义转换器单独工作,但一旦我将它们组合到一个管道中以改进工作流程,我就会收到错误消息。我不确定是什么问题。
例如,这是第一个变压器:
from sklearn.base import BaseEstimator, TransformerMixin
class ColumnSelector(BaseEstimator, TransformerMixin):
def __init__(self, columns):
self.columns = columns
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.columns]
然后进行测试:
relevant_columns = ['OverallQual','GrLivArea','GarageCars','GarageArea','YearBuilt','BsmtFinSF1','FullBath',
'GarageYrBlt', 'TotalBsmtSF', '2ndFlrSF', '1stFlrSF', 'HalfBath']
cs = ColumnSelector(columns=relevant_columns)
transformed = cs.fit_transform(X_train)
transformed.head()
returns this dataframe.
同样,
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
total_bsmt_sa_ix, second_flr_ix, first_flr_ix, full_bath_ix, half_bath_ix = [
list(transformed.columns).index(col) for col in ('TotalBsmtSF', '2ndFlrSF', '1stFlrSF', 'FullBath', 'HalfBath')]
def __init__(self, add_total_sa=True, add_total_baths=True):
self.add_total_sa = add_total_sa
self.add_total_baths = add_total_baths
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
if self.add_total_sa and self.add_total_baths:
total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
return np.c_[X, total_sa, total_bath]
elif self.add_total_sa:
total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
return np.c_[X, total_sa]
elif self.add_total_baths:
total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
return np.c_[X, total_bath]
else:
pass
atr_adder = CombinedAttributesAdder()
housing_extra_attr = atr_adder.transform(transformed.values)
housing_extra_attr = pd.DataFrame(housing_extra_attr, columns=relevant_columns+['total_sa', 'total_bath'], index=transformed.index)
housing_extra_attr.head()
returns this
然而,当我像这样制作管道时:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipeline = Pipeline([
('column_selector', ColumnSelector(columns=relevant_columns)),
('attr adder', CombinedAttributesAdder()),
('scaler', StandardScaler())
])
X_train_prepd = pipeline.fit(X_train)
我收到此错误消息
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-107-ab78197544be> in <module>
8 ])
9
---> 10 X_train_prepd = pipeline.fit(X_train)
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
350 This estimator
351 """
--> 352 Xt, fit_params = self._fit(X, y, **fit_params)
353 with _print_elapsed_time('Pipeline',
354 self._log_message(len(self.steps) - 1)):
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
315 message_clsname='Pipeline',
316 message=self._log_message(step_idx),
--> 317 **fit_params_steps[name])
318 # Replace the transformer of the step with the fitted
319 # transformer. This is necessary when loading the transformer
~\Anaconda3\envs\ml_book\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return self.fit(X, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
<ipython-input-94-607115cdc09e> in transform(self, X, y)
13
14 if self.add_total_sa and self.add_total_baths:
---> 15 total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
16 total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
17 return np.c_[X, total_sa, total_bath]
~\Anaconda3\envs\ml_book\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2978 if self.columns.nlevels > 1:
2979 return self._getitem_multilevel(key)
-> 2980 indexer = self.columns.get_loc(key)
2981 if is_integer(indexer):
2982 indexer = [indexer]
~\Anaconda3\envs\ml_book\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2895 )
2896 try:
-> 2897 return self._engine.get_loc(key)
2898 except KeyError:
2899 return self._engine.get_loc(self._maybe_cast_indexer(key))
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
TypeError: '(slice(None, None, None), 8)' is an invalid key
有没有人知道可能出了什么问题?我真的很沮丧。
感谢帮助
找出问题所在。
问题出在我如何调用 CombinedAttributesAdder()
中列的索引。解决方案是将计算更改为以下内容:
if self.add_total_sa and self.add_total_baths:
total_sa = X.iloc[:, total_bsmt_sa_ix] + X.iloc[:, second_flr_ix] + X.iloc[:, first_flr_ix]
total_bath = X.iloc[:, full_bath_ix] + X.iloc[:, half_bath_ix]/2
return np.c_[X, total_sa, total_bath]
elif self.add_total_sa:
total_sa = X.iloc[:, total_bsmt_sa_ix] + X.iloc[:, second_flr_ix] + X.iloc[:, first_flr_ix]
return np.c_[X, total_sa]
elif self.add_total_baths:
total_bath = X.iloc[:, full_bath_ix] + X.iloc[:, half_bath_ix]/2
return np.c_[X, total_bath]
else:
pass
我所做的只是在每个计算中添加 X.iloc
我需要一些调试帮助。我正在使用 scikit-learn 来处理一些数据并训练 ML 模型来预测房价。我想出了 2 个自定义转换器,可以处理不需要的功能,还可以结合一些功能来创建新功能。当我调用它们时,这两个自定义转换器单独工作,但一旦我将它们组合到一个管道中以改进工作流程,我就会收到错误消息。我不确定是什么问题。 例如,这是第一个变压器:
from sklearn.base import BaseEstimator, TransformerMixin
class ColumnSelector(BaseEstimator, TransformerMixin):
def __init__(self, columns):
self.columns = columns
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.columns]
然后进行测试:
relevant_columns = ['OverallQual','GrLivArea','GarageCars','GarageArea','YearBuilt','BsmtFinSF1','FullBath',
'GarageYrBlt', 'TotalBsmtSF', '2ndFlrSF', '1stFlrSF', 'HalfBath']
cs = ColumnSelector(columns=relevant_columns)
transformed = cs.fit_transform(X_train)
transformed.head()
returns this dataframe.
同样,
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
total_bsmt_sa_ix, second_flr_ix, first_flr_ix, full_bath_ix, half_bath_ix = [
list(transformed.columns).index(col) for col in ('TotalBsmtSF', '2ndFlrSF', '1stFlrSF', 'FullBath', 'HalfBath')]
def __init__(self, add_total_sa=True, add_total_baths=True):
self.add_total_sa = add_total_sa
self.add_total_baths = add_total_baths
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
if self.add_total_sa and self.add_total_baths:
total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
return np.c_[X, total_sa, total_bath]
elif self.add_total_sa:
total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
return np.c_[X, total_sa]
elif self.add_total_baths:
total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
return np.c_[X, total_bath]
else:
pass
atr_adder = CombinedAttributesAdder()
housing_extra_attr = atr_adder.transform(transformed.values)
housing_extra_attr = pd.DataFrame(housing_extra_attr, columns=relevant_columns+['total_sa', 'total_bath'], index=transformed.index)
housing_extra_attr.head()
returns this
然而,当我像这样制作管道时:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipeline = Pipeline([
('column_selector', ColumnSelector(columns=relevant_columns)),
('attr adder', CombinedAttributesAdder()),
('scaler', StandardScaler())
])
X_train_prepd = pipeline.fit(X_train)
我收到此错误消息
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-107-ab78197544be> in <module>
8 ])
9
---> 10 X_train_prepd = pipeline.fit(X_train)
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
350 This estimator
351 """
--> 352 Xt, fit_params = self._fit(X, y, **fit_params)
353 with _print_elapsed_time('Pipeline',
354 self._log_message(len(self.steps) - 1)):
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
315 message_clsname='Pipeline',
316 message=self._log_message(step_idx),
--> 317 **fit_params_steps[name])
318 # Replace the transformer of the step with the fitted
319 # transformer. This is necessary when loading the transformer
~\Anaconda3\envs\ml_book\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return self.fit(X, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
<ipython-input-94-607115cdc09e> in transform(self, X, y)
13
14 if self.add_total_sa and self.add_total_baths:
---> 15 total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
16 total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
17 return np.c_[X, total_sa, total_bath]
~\Anaconda3\envs\ml_book\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2978 if self.columns.nlevels > 1:
2979 return self._getitem_multilevel(key)
-> 2980 indexer = self.columns.get_loc(key)
2981 if is_integer(indexer):
2982 indexer = [indexer]
~\Anaconda3\envs\ml_book\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2895 )
2896 try:
-> 2897 return self._engine.get_loc(key)
2898 except KeyError:
2899 return self._engine.get_loc(self._maybe_cast_indexer(key))
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
TypeError: '(slice(None, None, None), 8)' is an invalid key
有没有人知道可能出了什么问题?我真的很沮丧。 感谢帮助
找出问题所在。
问题出在我如何调用 CombinedAttributesAdder()
中列的索引。解决方案是将计算更改为以下内容:
if self.add_total_sa and self.add_total_baths:
total_sa = X.iloc[:, total_bsmt_sa_ix] + X.iloc[:, second_flr_ix] + X.iloc[:, first_flr_ix]
total_bath = X.iloc[:, full_bath_ix] + X.iloc[:, half_bath_ix]/2
return np.c_[X, total_sa, total_bath]
elif self.add_total_sa:
total_sa = X.iloc[:, total_bsmt_sa_ix] + X.iloc[:, second_flr_ix] + X.iloc[:, first_flr_ix]
return np.c_[X, total_sa]
elif self.add_total_baths:
total_bath = X.iloc[:, full_bath_ix] + X.iloc[:, half_bath_ix]/2
return np.c_[X, total_bath]
else:
pass
我所做的只是在每个计算中添加 X.iloc