如何创建可在 Pipeline 中使用的自定义 Python 类,用于删除高度相关的特征?
How to create a custom Python class to be used in Pipeline for dropping highly correlated features?
我正在使用 Sklearn 的 Pipeline + GridSearchCV 对深度神经网络(回归)进行数据预处理和超参数调优。
对于预处理,我需要一个自定义类来删除数据集中高度相关的列。这是我的代码(有待改进):
class MyDecorrelator():
    """Drop columns whose absolute correlation with an earlier column exceeds ``threshold``.

    This is the version from the question, kept as posted.

    NOTE(review): the class inherits from no scikit-learn base class, so it
    has no ``get_params``/``set_params``; the post reports exactly that
    ``set_params`` error when it is used inside a grid search.
    """

    def __init__(self, threshold):
        # Correlation cut-off; compared against absolute coefficients in transform().
        self.threshold = threshold

    def fit(self, X, y=None):
        # Nothing is learned here; the columns to drop are recomputed from
        # whatever data is handed to transform().
        return self

    def transform(self, X, y = None):
        """Find the highly correlated columns of ``X`` and drop them.

        NOTE(review): ``X.corr()`` is a pandas DataFrame method, so ``X`` has
        to be a DataFrame; the post reports an AttributeError when a
        ``numpy.ndarray`` arrives instead.
        """
        correlated_features = set() # Set of all the names of correlated columns
        corr_matrix = X.corr()
        # Walk the strict lower triangle so every column pair is checked once;
        # of each correlated pair, the later column (index i) is marked.
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > self.threshold: # we are interested in absolute coeff value
                    colname = corr_matrix.columns[i] # getting the name of column
                    correlated_features.add(colname)
        # NOTE(review): with inplace=True, DataFrame.drop modifies X and
        # returns None, so this method returns None rather than the data.
        return X.drop(labels=correlated_features, axis=1, inplace=True)
def create_model(
    input_shape = 150,
    optimizer='adam',
    learn_rate=0.01,
    activation='relu',
    init='uniform',
    hidden_layers = 1,
    dropout = 0.5,
    hidden_size=64,
):
    """Build and compile a Keras ``Sequential`` network with one linear output.

    Layer stack, in order:
      1. ``Dense(input_shape)`` with ``activation`` and initializer ``init``
         (``input_shape`` is passed as the first positional argument of
         ``Dense``),
      2. ``hidden_layers`` repetitions of ``Dense(hidden_size)`` followed by
         ``Dropout(dropout)``,
      3. ``Dense(1, activation='linear')``.

    The model is compiled with mean-absolute-error loss and ``optimizer``.

    NOTE(review): ``learn_rate`` is accepted but never read in this body, so
    changing it has no effect on the returned model.

    Returns:
        The compiled model.
    """
    model = Sequential()

    # Collect the layers first, then attach them in order.
    stack = [Dense(input_shape, activation=activation, kernel_initializer=init)]
    for _ in range(hidden_layers):
        stack.append(Dense(hidden_size, activation=activation))
        stack.append(Dropout(dropout))
    stack.append(Dense(1, activation='linear'))

    for layer in stack:
        model.add(layer)

    model.compile(loss='mean_absolute_error', optimizer=optimizer)
    return model
# Pipeline: min-max scaling -> decorrelation -> k-best selection -> Keras regressor.
estimator = Pipeline(
    [
        ('scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('decorrelation', MyDecorrelator(0.9)),
        ('feature_selector', SelectKBest()),
        ('kr', KerasRegressor(build_fn=create_model)),
    ],
    verbose=True,
)

# One grid per feature count: the same value is used for SelectKBest's ``k``
# and for ``create_model``'s ``input_shape`` so the two always move together.
param_grid = [
    {
        'feature_selector__score_func': [mutual_info_regression],
        'feature_selector__k': [n_features],
        'kr__input_shape': [n_features],
        'kr__optimizer': ['RMSprop', 'Adam'],
        'kr__epochs': [100, 300],
        # 'kr__init': ['uniform', 'zeros', 'normal'],  (left disabled, as in the post)
        'kr__batch_size': [32, 128],
        'kr__learn_rate': [0.01, 0.1],
        'kr__activation': ['relu', 'sigmoid'],
        'kr__dropout': [0.9, 0.1],
        'kr__hidden_layers': [2, 3],
        'kr__hidden_size': [64, 128],
    }
    for n_features in (50, 100)
]

# Successive-halving grid search over the pipeline with 5-fold CV.
grid = HalvingGridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=KFold(n_splits=5),
    n_jobs=-1,
    verbose=10,
)
但是,当我尝试运行 grid.fit(X, Y) 时,出现以下错误:
'MyDecorrelator' object has no attribute 'set_params'
此外,如果我把第一行改为 class MyDecorrelator(BaseEstimator):,则会出现:
AttributeError: 'numpy.ndarray' object has no attribute 'corr'
如何解决?
更新:
我已经按照 Comsavvy 的解决方案做了修改,但结果收到警告:UserWarning: One or more of the test scores are non-finite: [nan nan nan ...]。这是怎么回事?不做去相关处理时一切正常。
我建议你把 MyDecorrelator 类的 transform() 方法改成下面的代码,看看结果:
def transform(self, X, y = None):
    """Return ``X`` without its highly correlated columns.

    ``X`` is wrapped in a ``pd.DataFrame`` when it is not one already. A
    column is dropped when its absolute correlation with any column that
    comes before it is greater than ``self.threshold``. The set of columns
    to drop is recomputed from ``X`` on every call.
    """
    frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
    corr = frame.corr()
    names = corr.columns

    # Strict lower triangle: each pair is visited once and the later column
    # of a correlated pair is the one that gets dropped.
    to_drop = {
        names[row]
        for row in range(len(names))
        for col in range(row)
        if abs(corr.iloc[row, col]) > self.threshold
    }
    return frame.drop(labels=to_drop, axis=1)
这里只是先检查 X 是否为 DataFrame;如果不是,就把它转换为 DataFrame。
下面这样写,代码可以正确运行:要删除的列只在 fit() 中根据训练数据计算一次,transform() 对之后的任何输入都删除同样的列。
class MyDecorrelator(BaseEstimator, TransformerMixin):
    """Transformer that drops columns highly correlated with an earlier column.

    The columns to drop are determined once in ``fit`` and the same columns
    are removed by every later ``transform`` call, so training and validation
    data always end up with the same set of features.

    Parameters:
        threshold: a column is dropped when the absolute value of its
            correlation with any preceding column is greater than this.

    Attributes set by ``fit``:
        correlated_features: set of column labels that ``transform`` removes.
    """

    def __init__(self, threshold):
        # Only the constructor parameter is stored here. The original also
        # set ``correlated_columns = None``, a name that neither fit() nor
        # transform() ever used; the fitted state lives in
        # ``correlated_features`` and is created by fit().
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record which columns of ``X`` are too correlated with an earlier one.

        ``X`` is wrapped in a ``pd.DataFrame``, so arrays are accepted as well
        as DataFrames. ``y`` is ignored.

        Returns:
            ``self``.
        """
        abs_corr = pd.DataFrame(X).corr().abs()
        columns = abs_corr.columns

        correlated_features = set()
        for i in range(1, len(columns)):
            # Only look at columns to the left of i (strict lower triangle):
            # of each correlated pair the later column is the one dropped.
            # NaN coefficients compare False and therefore never trigger a drop.
            if (abs_corr.iloc[i, :i] > self.threshold).any():
                correlated_features.add(columns[i])

        self.correlated_features = correlated_features
        return self

    def transform(self, X, y=None, **kwargs):
        """Return ``X`` as a DataFrame without the columns recorded by ``fit``.

        Raises:
            AttributeError: if called before ``fit``.
        """
        return pd.DataFrame(X).drop(labels=list(self.correlated_features), axis=1)
我正在使用 Sklearn 的 Pipeline + GridSearchCV 对深度神经网络(回归)进行数据预处理和超参数调优。
对于预处理,我需要一个自定义类来删除数据集中高度相关的列。这是我的代码(有待改进):
class MyDecorrelator():
    """Drop columns whose absolute correlation with an earlier column exceeds ``threshold``.

    This is the version from the question, kept as posted.

    NOTE(review): the class inherits from no scikit-learn base class, so it
    has no ``get_params``/``set_params``; the post reports exactly that
    ``set_params`` error when it is used inside a grid search.
    """

    def __init__(self, threshold):
        # Correlation cut-off; compared against absolute coefficients in transform().
        self.threshold = threshold

    def fit(self, X, y=None):
        # Nothing is learned here; the columns to drop are recomputed from
        # whatever data is handed to transform().
        return self

    def transform(self, X, y = None):
        """Find the highly correlated columns of ``X`` and drop them.

        NOTE(review): ``X.corr()`` is a pandas DataFrame method, so ``X`` has
        to be a DataFrame; the post reports an AttributeError when a
        ``numpy.ndarray`` arrives instead.
        """
        correlated_features = set() # Set of all the names of correlated columns
        corr_matrix = X.corr()
        # Walk the strict lower triangle so every column pair is checked once;
        # of each correlated pair, the later column (index i) is marked.
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > self.threshold: # we are interested in absolute coeff value
                    colname = corr_matrix.columns[i] # getting the name of column
                    correlated_features.add(colname)
        # NOTE(review): with inplace=True, DataFrame.drop modifies X and
        # returns None, so this method returns None rather than the data.
        return X.drop(labels=correlated_features, axis=1, inplace=True)
def create_model(
    input_shape = 150,
    optimizer='adam',
    learn_rate=0.01,
    activation='relu',
    init='uniform',
    hidden_layers = 1,
    dropout = 0.5,
    hidden_size=64,
):
    """Build and compile a Keras ``Sequential`` network with one linear output.

    Layer stack, in order:
      1. ``Dense(input_shape)`` with ``activation`` and initializer ``init``
         (``input_shape`` is passed as the first positional argument of
         ``Dense``),
      2. ``hidden_layers`` repetitions of ``Dense(hidden_size)`` followed by
         ``Dropout(dropout)``,
      3. ``Dense(1, activation='linear')``.

    The model is compiled with mean-absolute-error loss and ``optimizer``.

    NOTE(review): ``learn_rate`` is accepted but never read in this body, so
    changing it has no effect on the returned model.

    Returns:
        The compiled model.
    """
    model = Sequential()

    # Collect the layers first, then attach them in order.
    stack = [Dense(input_shape, activation=activation, kernel_initializer=init)]
    for _ in range(hidden_layers):
        stack.append(Dense(hidden_size, activation=activation))
        stack.append(Dropout(dropout))
    stack.append(Dense(1, activation='linear'))

    for layer in stack:
        model.add(layer)

    model.compile(loss='mean_absolute_error', optimizer=optimizer)
    return model
# Pipeline: min-max scaling -> decorrelation -> k-best selection -> Keras regressor.
estimator = Pipeline(
    [
        ('scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('decorrelation', MyDecorrelator(0.9)),
        ('feature_selector', SelectKBest()),
        ('kr', KerasRegressor(build_fn=create_model)),
    ],
    verbose=True,
)

# One grid per feature count: the same value is used for SelectKBest's ``k``
# and for ``create_model``'s ``input_shape`` so the two always move together.
param_grid = [
    {
        'feature_selector__score_func': [mutual_info_regression],
        'feature_selector__k': [n_features],
        'kr__input_shape': [n_features],
        'kr__optimizer': ['RMSprop', 'Adam'],
        'kr__epochs': [100, 300],
        # 'kr__init': ['uniform', 'zeros', 'normal'],  (left disabled, as in the post)
        'kr__batch_size': [32, 128],
        'kr__learn_rate': [0.01, 0.1],
        'kr__activation': ['relu', 'sigmoid'],
        'kr__dropout': [0.9, 0.1],
        'kr__hidden_layers': [2, 3],
        'kr__hidden_size': [64, 128],
    }
    for n_features in (50, 100)
]

# Successive-halving grid search over the pipeline with 5-fold CV.
grid = HalvingGridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=KFold(n_splits=5),
    n_jobs=-1,
    verbose=10,
)
但是,当我尝试运行 grid.fit(X, Y) 时,出现以下错误:
'MyDecorrelator' object has no attribute 'set_params'
此外,如果我把第一行改为 class MyDecorrelator(BaseEstimator):,则会出现:
AttributeError: 'numpy.ndarray' object has no attribute 'corr'
如何解决?
更新:
我已经按照 Comsavvy 的解决方案做了修改,但结果收到警告:UserWarning: One or more of the test scores are non-finite: [nan nan nan ...]。这是怎么回事?不做去相关处理时一切正常。
我建议你把 MyDecorrelator 类的 transform() 方法改成下面的代码,看看结果:
def transform(self, X, y = None):
    """Return ``X`` without its highly correlated columns.

    ``X`` is wrapped in a ``pd.DataFrame`` when it is not one already. A
    column is dropped when its absolute correlation with any column that
    comes before it is greater than ``self.threshold``. The set of columns
    to drop is recomputed from ``X`` on every call.
    """
    frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
    corr = frame.corr()
    names = corr.columns

    # Strict lower triangle: each pair is visited once and the later column
    # of a correlated pair is the one that gets dropped.
    to_drop = {
        names[row]
        for row in range(len(names))
        for col in range(row)
        if abs(corr.iloc[row, col]) > self.threshold
    }
    return frame.drop(labels=to_drop, axis=1)
这里只是先检查 X 是否为 DataFrame;如果不是,就把它转换为 DataFrame。
下面这样写,代码可以正确运行:要删除的列只在 fit() 中根据训练数据计算一次,transform() 对之后的任何输入都删除同样的列。
class MyDecorrelator(BaseEstimator, TransformerMixin):
    """Transformer that drops columns highly correlated with an earlier column.

    The columns to drop are determined once in ``fit`` and the same columns
    are removed by every later ``transform`` call, so training and validation
    data always end up with the same set of features.

    Parameters:
        threshold: a column is dropped when the absolute value of its
            correlation with any preceding column is greater than this.

    Attributes set by ``fit``:
        correlated_features: set of column labels that ``transform`` removes.
    """

    def __init__(self, threshold):
        # Only the constructor parameter is stored here. The original also
        # set ``correlated_columns = None``, a name that neither fit() nor
        # transform() ever used; the fitted state lives in
        # ``correlated_features`` and is created by fit().
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record which columns of ``X`` are too correlated with an earlier one.

        ``X`` is wrapped in a ``pd.DataFrame``, so arrays are accepted as well
        as DataFrames. ``y`` is ignored.

        Returns:
            ``self``.
        """
        abs_corr = pd.DataFrame(X).corr().abs()
        columns = abs_corr.columns

        correlated_features = set()
        for i in range(1, len(columns)):
            # Only look at columns to the left of i (strict lower triangle):
            # of each correlated pair the later column is the one dropped.
            # NaN coefficients compare False and therefore never trigger a drop.
            if (abs_corr.iloc[i, :i] > self.threshold).any():
                correlated_features.add(columns[i])

        self.correlated_features = correlated_features
        return self

    def transform(self, X, y=None, **kwargs):
        """Return ``X`` as a DataFrame without the columns recorded by ``fit``.

        Raises:
            AttributeError: if called before ``fit``.
        """
        return pd.DataFrame(X).drop(labels=list(self.correlated_features), axis=1)