在管道中仅对特定列使用 sklearn `KBinsDiscretizer`,并返回数据框
Use sklearn `KBinsDiscretizer` within a pipeline on specific columns and return a data frame
我需要将 KBinsDiscretizer 作为 sklearn.pipeline 中的一个步骤,仅应用于特定列,并将结果以 pandas 数据框的形式返回,如下所示:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
class PandasColumnTransformer(ColumnTransformer):
    """ColumnTransformer whose ``transform``/``fit_transform`` return a DataFrame.

    The result keeps the row index of the input frame.  Columns are labelled
    with ``get_feature_names_out()`` so that every label follows its data:
    ColumnTransformer emits columns in transformer order, with any
    ``remainder='passthrough'`` columns last, which generally differs from the
    column order of the input frame.  Labels carry the ``<transformer>__``
    prefix unless ``verbose_feature_names_out=False`` is passed to the
    constructor.  Requires a scikit-learn release that provides
    ``get_feature_names_out`` on ColumnTransformer and on every transformer
    used inside it.
    """

    def _as_frame(self, values, X: pd.DataFrame) -> pd.DataFrame:
        """Wrap the stacked output in a DataFrame aligned with the rows of ``X``."""
        return pd.DataFrame(
            values,
            columns=self.get_feature_names_out(),
            index=X.index,
        )

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the fitted transformers to ``X`` and return a DataFrame."""
        return self._as_frame(super().transform(X), X)

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on ``X`` and return the transformed data as a DataFrame."""
        # Forward ``y`` so that supervised transformers inside still receive it.
        return self._as_frame(super().fit_transform(X, y), X)
class PandasKBinsDiscretizer(KBinsDiscretizer):
    """KBinsDiscretizer with ordinal encoding that returns a DataFrame.

    ``transform`` expects a pandas DataFrame and returns a DataFrame with the
    same column labels and the same row index as its input.
    """

    def __init__(self, n_bins):
        # Ordinal encoding yields one output column per input column, which is
        # what allows the input column labels to be reused in ``transform``.
        super().__init__(n_bins, encode='ordinal')

    def transform(self, X):
        """Discretize ``X`` and return the bin indices as a DataFrame."""
        # Kept as an instance attribute because the original code exposed it.
        self.col_names = list(X.columns.values)
        binned = super().transform(X)
        # Reuse the input index so rows stay aligned with the source frame
        # instead of being renumbered from zero.
        return pd.DataFrame(binned, columns=self.col_names, index=X.index)
# Reproduction from the question: bin one column of a two-column frame
# inside a Pipeline and expect a DataFrame back.
# NOTE: the column selector below is a scalar string rather than a list; the
# error message quoted after this snippet points at exactly that.
binner_on_numeric = PandasColumnTransformer(transformers=[
("binner", PandasKBinsDiscretizer(2), 'numeric_col_to_change')])
pp = Pipeline([('binner_just_numeric', binner_on_numeric)])
# Two numeric columns; only 'numeric_col_to_change' is meant to be binned.
d = {'numeric_col_not_to_change': [1, 2, 1, 2, 1, 2],
'numeric_col_to_change': [1, 2, 3, 4, 5, 6]}
df = pd.DataFrame(data=d)
res = pp.fit_transform(df)
assert isinstance(res, pd.DataFrame)
我收到以下错误:
ValueError: 1D data passed to a transformer that expects 2D data. Try to specify the column selection as a list of one item instead of a scalar.
如能提供任何帮助,将不胜感激!
出现此错误是因为您在 ColumnTransformer 中用标量(单个字符串)选择了列,转换器因此收到的是一维数据。
将列选择写成只含一个元素的列表 ['numeric_col_to_change'],即可把输入变为二维数组。
您还可以使用 remainder 参数指定如何处理 ColumnTransformer 未处理的列。remainder='passthrough' 会将它们原样返回,而不是丢弃。
这应该有效:
# Select the column with a one-item list so the transformer receives 2-D data,
# and pass the untouched columns through instead of dropping them.
binner_on_numeric = PandasColumnTransformer(
    transformers=[
        ("binner", PandasKBinsDiscretizer(2), ['numeric_col_to_change']),
    ],
    remainder='passthrough',
)
# Rebuild the pipeline: the earlier `pp` still holds the old transformer
# object, so rebinding `binner_on_numeric` alone would not change what runs.
pp = Pipeline([('binner_just_numeric', binner_on_numeric)])
res = pp.fit_transform(df)
将返回以下数据框:
numeric_col_not_to_change numeric_col_to_change
0 0.0 1.0
1 0.0 2.0
2 0.0 1.0
3 1.0 2.0
4 1.0 1.0
5 1.0 2.0
约沙法特·安托万
我正试图解决同样的问题,并看到了你的问题和答案。我想知道下面这段代码(不使用自定义类)是否也能解决你的问题 =)
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
# Toy data: four samples with four features each.
X = np.array(
    [
        [-2, 1, -4, -1],
        [-1, 2, -3, -0.5],
        [0, 3, -2, 0.5],
        [1, 4, -1, 2],
    ]
)
df_X = pd.DataFrame(X, columns=['f0', 'f1', 'f2', 'f3'])

# Only these columns are discretized; the others are passed through.
num_cols = ['f0', 'f1']
binner_on_numeric2 = ColumnTransformer(
    transformers=[
        (
            "binner",
            KBinsDiscretizer(n_bins=3, strategy='uniform', encode='ordinal'),
            num_cols,
        ),
    ],
    remainder='passthrough',
)
pp = Pipeline([('binner_just_numeric2', binner_on_numeric2)])
res = pp.fit_transform(df_X)
# Check left disabled, as in the original post:
#assert isinstance(res, pd.DataFrame)
res
非常感谢。
亲切的问候 =),
大卫
我需要将 KBinsDiscretizer 作为 sklearn.pipeline 中的一个步骤,仅应用于特定列,并将结果以 pandas 数据框的形式返回,如下所示:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
class PandasColumnTransformer(ColumnTransformer):
    """ColumnTransformer whose ``transform``/``fit_transform`` return a DataFrame.

    The result keeps the row index of the input frame.  Columns are labelled
    with ``get_feature_names_out()`` so that every label follows its data:
    ColumnTransformer emits columns in transformer order, with any
    ``remainder='passthrough'`` columns last, which generally differs from the
    column order of the input frame.  Labels carry the ``<transformer>__``
    prefix unless ``verbose_feature_names_out=False`` is passed to the
    constructor.  Requires a scikit-learn release that provides
    ``get_feature_names_out`` on ColumnTransformer and on every transformer
    used inside it.
    """

    def _as_frame(self, values, X: pd.DataFrame) -> pd.DataFrame:
        """Wrap the stacked output in a DataFrame aligned with the rows of ``X``."""
        return pd.DataFrame(
            values,
            columns=self.get_feature_names_out(),
            index=X.index,
        )

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the fitted transformers to ``X`` and return a DataFrame."""
        return self._as_frame(super().transform(X), X)

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on ``X`` and return the transformed data as a DataFrame."""
        # Forward ``y`` so that supervised transformers inside still receive it.
        return self._as_frame(super().fit_transform(X, y), X)
class PandasKBinsDiscretizer(KBinsDiscretizer):
    """KBinsDiscretizer with ordinal encoding that returns a DataFrame.

    ``transform`` expects a pandas DataFrame and returns a DataFrame with the
    same column labels and the same row index as its input.
    """

    def __init__(self, n_bins):
        # Ordinal encoding yields one output column per input column, which is
        # what allows the input column labels to be reused in ``transform``.
        super().__init__(n_bins, encode='ordinal')

    def transform(self, X):
        """Discretize ``X`` and return the bin indices as a DataFrame."""
        # Kept as an instance attribute because the original code exposed it.
        self.col_names = list(X.columns.values)
        binned = super().transform(X)
        # Reuse the input index so rows stay aligned with the source frame
        # instead of being renumbered from zero.
        return pd.DataFrame(binned, columns=self.col_names, index=X.index)
# Reproduction from the question: bin one column of a two-column frame
# inside a Pipeline and expect a DataFrame back.
# NOTE: the column selector below is a scalar string rather than a list; the
# error message quoted after this snippet points at exactly that.
binner_on_numeric = PandasColumnTransformer(transformers=[
("binner", PandasKBinsDiscretizer(2), 'numeric_col_to_change')])
pp = Pipeline([('binner_just_numeric', binner_on_numeric)])
# Two numeric columns; only 'numeric_col_to_change' is meant to be binned.
d = {'numeric_col_not_to_change': [1, 2, 1, 2, 1, 2],
'numeric_col_to_change': [1, 2, 3, 4, 5, 6]}
df = pd.DataFrame(data=d)
res = pp.fit_transform(df)
assert isinstance(res, pd.DataFrame)
我收到以下错误:
ValueError: 1D data passed to a transformer that expects 2D data. Try to specify the column selection as a list of one item instead of a scalar.
如能提供任何帮助,将不胜感激!
出现此错误是因为您在 ColumnTransformer 中用标量(单个字符串)选择了列,转换器因此收到的是一维数据。
将列选择写成只含一个元素的列表 ['numeric_col_to_change'],即可把输入变为二维数组。
您还可以使用 remainder 参数指定如何处理 ColumnTransformer 未处理的列。remainder='passthrough' 会将它们原样返回,而不是丢弃。
这应该有效:
# Select the column with a one-item list so the transformer receives 2-D data,
# and pass the untouched columns through instead of dropping them.
binner_on_numeric = PandasColumnTransformer(
    transformers=[
        ("binner", PandasKBinsDiscretizer(2), ['numeric_col_to_change']),
    ],
    remainder='passthrough',
)
# Rebuild the pipeline: the earlier `pp` still holds the old transformer
# object, so rebinding `binner_on_numeric` alone would not change what runs.
pp = Pipeline([('binner_just_numeric', binner_on_numeric)])
res = pp.fit_transform(df)
将返回以下数据框:
numeric_col_not_to_change numeric_col_to_change
0 0.0 1.0
1 0.0 2.0
2 0.0 1.0
3 1.0 2.0
4 1.0 1.0
5 1.0 2.0
约沙法特·安托万
我正试图解决同样的问题,并看到了你的问题和答案。我想知道下面这段代码(不使用自定义类)是否也能解决你的问题 =)
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
# Toy data: four samples with four features each.
X = np.array(
    [
        [-2, 1, -4, -1],
        [-1, 2, -3, -0.5],
        [0, 3, -2, 0.5],
        [1, 4, -1, 2],
    ]
)
df_X = pd.DataFrame(X, columns=['f0', 'f1', 'f2', 'f3'])

# Only these columns are discretized; the others are passed through.
num_cols = ['f0', 'f1']
binner_on_numeric2 = ColumnTransformer(
    transformers=[
        (
            "binner",
            KBinsDiscretizer(n_bins=3, strategy='uniform', encode='ordinal'),
            num_cols,
        ),
    ],
    remainder='passthrough',
)
pp = Pipeline([('binner_just_numeric2', binner_on_numeric2)])
res = pp.fit_transform(df_X)
# Check left disabled, as in the original post:
#assert isinstance(res, pd.DataFrame)
res
非常感谢。
亲切的问候 =),
大卫