如何使用 scikit-learn 创建自定义 ColumnTransformer?
How to create a custom ColumnTransformer using scikit-learn?
假设我有以下数据集:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
# Sample frame for the question: five hourly timestamp strings and five
# random integer ages drawn by np.random.randint(12, 80, 5). No seed is set
# in the lines shown here, so the ages may differ between runs.
dt = pd.DataFrame(dict(
    time=[
        "1/4/2021 0:00",
        "1/4/2021 1:00",
        "1/4/2021 2:00",
        "1/4/2021 3:00",
        "1/4/2021 4:00",
    ],
    age=np.random.randint(12, 80, 5),
))
我需要创建一个 自定义 ColumnTransformer
使用 scikit-learn
将日期和时间特征转换为数值特征。
这是我的自定义 ColumnTransformer
:
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping "<date> <time>" strings to integers.

    Each string is split on a single space; the first piece is encoded by
    ``date_to_num``, the second by ``time_to_num``, and the two are summed.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Encode every element of ``X`` and stack the results as one column.

        ``X`` is iterated element by element, so each item must be a
        "<date> <time>" string.
        """
        encoded = [self.date_and_time_to_num(stamp) for stamp in X]
        return np.c_[encoded]

    def date_and_time_to_num(self, date_and_time):
        """Return ``date_to_num(date) + time_to_num(time)`` for one string."""
        pieces = date_and_time.split(" ")
        return self.date_to_num(pieces[0]) + self.time_to_num(pieces[1])

    def date_to_num(self, date):
        """Pack the three '/'-separated integers of ``date`` into one integer.

        The result is 10000 * first + 100 * second + third, so "1/4/2021"
        becomes 12421. NOTE: this is a positional code, not a count of
        seconds, and the field order is taken as written in the string.
        """
        first, second, third = (int(field) for field in date.split('/'))
        return first * 10000 + second * 100 + third

    def time_to_num(self, time_str):
        """Convert an "H:MM" string to seconds since midnight."""
        hours, minutes = (int(field) for field in time_str.split(':'))
        return (hours * 60 + minutes) * 60
然后我可以使用以下两个函数转换我的特征:
def process_data(x):
    """Fit a fresh column transformer on ``x`` and return ``x`` transformed.

    The transformer comes from ``get_column_transformer()``; it is fitted
    and then applied to the very same data.
    """
    transformer = get_column_transformer()
    transformer.fit(X=x)
    transformed = transformer.transform(x)
    return transformed
def get_column_transformer():
    """Assemble the ColumnTransformer used by ``process_data``.

    NOTE(review): the second element of each tuple is the list of *values*
    from the module-level ``dt`` frame, not a column name or index. This is
    the call that produces the ValueError discussed right after this code.
    """
    age_step = (MinMaxScaler(), dt["age"].values.tolist())
    time_step = (DateTimeTransformer(), dt["time"].values.tolist())
    return make_column_transformer(age_step, time_step)
最后我调用 process_data
来应用更改:
print(process_data(dt))
但是,我遇到了以下错误:
raise ValueError(
ValueError: all features must be in [0, 1] or [-2, 0]
错误是由于make_column_transformer
将列名或列索引作为输入,而不是数据。在您的情况下,正确的语法是
# Select columns by name: the list ['age'] hands MinMaxScaler a 2-D
# selection, while the bare string 'time' hands DateTimeTransformer a 1-D one.
make_column_transformer(
(MinMaxScaler(), ['age']),
(DateTimeTransformer(), 'time')
)
或者,等价地,
# Same selection by position: 'time' is column 0 and 'age' is column 1 of
# the frame; [1] is a 2-D selection, the bare 0 a 1-D one.
make_column_transformer(
(MinMaxScaler(), [1]),
(DateTimeTransformer(), 0)
)
对于 MinMaxScaler
你应该使用 ['age']
或 [1]
因为 MinMaxScaler
需要一个二维数组作为输入(例如 pd.DataFrame
),而对于 DateTimeTransformer
,您可以使用 'time'
或 0
,因为 DateTimeTransformer
需要一维数组作为输入(例如 pd.Series
)。这一点在官方文档(documentation)中有说明。
列名示例:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
np.random.seed(0)
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping "<date> <time>" strings to integers.

    Each string is split on a single space; the first piece is encoded by
    ``date_to_num``, the second by ``time_to_num``, and the two are summed.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Encode every element of ``X`` and stack the results as one column.

        ``X`` is iterated element by element, so each item must be a
        "<date> <time>" string (here: the 1-D selection of the time column).
        """
        encoded = [self.date_and_time_to_num(stamp) for stamp in X]
        return np.c_[encoded]

    def date_and_time_to_num(self, date_and_time):
        """Return ``date_to_num(date) + time_to_num(time)`` for one string."""
        pieces = date_and_time.split(' ')
        return self.date_to_num(pieces[0]) + self.time_to_num(pieces[1])

    def date_to_num(self, date):
        """Pack the three '/'-separated integers of ``date`` into one integer.

        The result is 10000 * first + 100 * second + third, so "1/4/2021"
        becomes 12421. NOTE: this is a positional code, not a count of
        seconds, and the field order is taken as written in the string.
        """
        first, second, third = (int(field) for field in date.split('/'))
        return first * 10000 + second * 100 + third

    def time_to_num(self, time_str):
        """Convert an "H:MM" string to seconds since midnight."""
        hours, minutes = (int(field) for field in time_str.split(':'))
        return (hours * 60 + minutes) * 60
def process_data(x):
    """Fit a fresh column transformer on ``x`` and return ``x`` transformed.

    The transformer comes from ``get_column_transformer()``; it is fitted
    and then applied to the very same data.
    """
    transformer = get_column_transformer()
    transformer.fit(X=x)
    transformed = transformer.transform(x)
    return transformed
def get_column_transformer():
    """Build the ColumnTransformer, selecting columns by name.

    ``['age']`` (a list) gives MinMaxScaler a 2-D selection; ``'time'``
    (a bare string) gives DateTimeTransformer a 1-D one.
    """
    age_step = (MinMaxScaler(), ['age'])
    time_step = (DateTimeTransformer(), 'time')
    return make_column_transformer(age_step, time_step)
# Demo input: five hourly timestamp strings plus five integer ages from
# np.random.randint(12, 80, 5).
df = pd.DataFrame(dict(
    time=[
        '1/4/2021 0:00',
        '1/4/2021 1:00',
        '1/4/2021 2:00',
        '1/4/2021 3:00',
        '1/4/2021 4:00',
    ],
    age=np.random.randint(12, 80, 5),
))
process_data(df)
# Output (column 0: min-max scaled age, column 1: encoded time):
# array([[0.00000000e+00, 1.24210000e+04],
# [1.30434783e-01, 1.60210000e+04],
# [8.69565217e-01, 1.96210000e+04],
# [1.00000000e+00, 2.32210000e+04],
# [1.00000000e+00, 2.68210000e+04]])
列索引示例:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
np.random.seed(0)
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping "<date> <time>" strings to integers.

    Each string is split on a single space; the first piece is encoded by
    ``date_to_num``, the second by ``time_to_num``, and the two are summed.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Encode every element of ``X`` and stack the results as one column.

        ``X`` is iterated element by element, so each item must be a
        "<date> <time>" string (here: the 1-D selection of column 0).
        """
        encoded = [self.date_and_time_to_num(stamp) for stamp in X]
        return np.c_[encoded]

    def date_and_time_to_num(self, date_and_time):
        """Return ``date_to_num(date) + time_to_num(time)`` for one string."""
        pieces = date_and_time.split(' ')
        return self.date_to_num(pieces[0]) + self.time_to_num(pieces[1])

    def date_to_num(self, date):
        """Pack the three '/'-separated integers of ``date`` into one integer.

        The result is 10000 * first + 100 * second + third, so "1/4/2021"
        becomes 12421. NOTE: this is a positional code, not a count of
        seconds, and the field order is taken as written in the string.
        """
        first, second, third = (int(field) for field in date.split('/'))
        return first * 10000 + second * 100 + third

    def time_to_num(self, time_str):
        """Convert an "H:MM" string to seconds since midnight."""
        hours, minutes = (int(field) for field in time_str.split(':'))
        return (hours * 60 + minutes) * 60
def process_data(x):
    """Fit a fresh column transformer on ``x`` and return ``x`` transformed.

    The transformer comes from ``get_column_transformer()``; it is fitted
    and then applied to the very same data.
    """
    transformer = get_column_transformer()
    transformer.fit(X=x)
    transformed = transformer.transform(x)
    return transformed
def get_column_transformer():
    """Build the ColumnTransformer, selecting columns by position.

    ``[1]`` (a list) gives MinMaxScaler a 2-D selection of the second
    column; the bare ``0`` gives DateTimeTransformer a 1-D selection of the
    first column.
    """
    age_step = (MinMaxScaler(), [1])
    time_step = (DateTimeTransformer(), 0)
    return make_column_transformer(age_step, time_step)
# Demo input: five hourly timestamp strings (column 0) plus five integer
# ages from np.random.randint(12, 80, 5) (column 1).
df = pd.DataFrame(dict(
    time=[
        '1/4/2021 0:00',
        '1/4/2021 1:00',
        '1/4/2021 2:00',
        '1/4/2021 3:00',
        '1/4/2021 4:00',
    ],
    age=np.random.randint(12, 80, 5),
))
process_data(df)
# Output (column 0: min-max scaled age, column 1: encoded time):
# array([[0.00000000e+00, 1.24210000e+04],
# [1.30434783e-01, 1.60210000e+04],
# [8.69565217e-01, 1.96210000e+04],
# [1.00000000e+00, 2.32210000e+04],
# [1.00000000e+00, 2.68210000e+04]])
假设我有以下数据集:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
# Sample frame for the question: five hourly timestamp strings and five
# random integer ages drawn by np.random.randint(12, 80, 5). No seed is set
# in the lines shown here, so the ages may differ between runs.
dt = pd.DataFrame(dict(
    time=[
        "1/4/2021 0:00",
        "1/4/2021 1:00",
        "1/4/2021 2:00",
        "1/4/2021 3:00",
        "1/4/2021 4:00",
    ],
    age=np.random.randint(12, 80, 5),
))
我需要创建一个 自定义 ColumnTransformer
使用 scikit-learn
将日期和时间特征转换为数值特征。
这是我的自定义 ColumnTransformer
:
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping "<date> <time>" strings to integers.

    Each string is split on a single space; the first piece is encoded by
    ``date_to_num``, the second by ``time_to_num``, and the two are summed.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Encode every element of ``X`` and stack the results as one column.

        ``X`` is iterated element by element, so each item must be a
        "<date> <time>" string.
        """
        encoded = [self.date_and_time_to_num(stamp) for stamp in X]
        return np.c_[encoded]

    def date_and_time_to_num(self, date_and_time):
        """Return ``date_to_num(date) + time_to_num(time)`` for one string."""
        pieces = date_and_time.split(" ")
        return self.date_to_num(pieces[0]) + self.time_to_num(pieces[1])

    def date_to_num(self, date):
        """Pack the three '/'-separated integers of ``date`` into one integer.

        The result is 10000 * first + 100 * second + third, so "1/4/2021"
        becomes 12421. NOTE: this is a positional code, not a count of
        seconds, and the field order is taken as written in the string.
        """
        first, second, third = (int(field) for field in date.split('/'))
        return first * 10000 + second * 100 + third

    def time_to_num(self, time_str):
        """Convert an "H:MM" string to seconds since midnight."""
        hours, minutes = (int(field) for field in time_str.split(':'))
        return (hours * 60 + minutes) * 60
然后我可以使用以下两个函数转换我的特征:
def process_data(x):
    """Fit a fresh column transformer on ``x`` and return ``x`` transformed.

    The transformer comes from ``get_column_transformer()``; it is fitted
    and then applied to the very same data.
    """
    transformer = get_column_transformer()
    transformer.fit(X=x)
    transformed = transformer.transform(x)
    return transformed
def get_column_transformer():
    """Assemble the ColumnTransformer used by ``process_data``.

    NOTE(review): the second element of each tuple is the list of *values*
    from the module-level ``dt`` frame, not a column name or index. This is
    the call that produces the ValueError discussed right after this code.
    """
    age_step = (MinMaxScaler(), dt["age"].values.tolist())
    time_step = (DateTimeTransformer(), dt["time"].values.tolist())
    return make_column_transformer(age_step, time_step)
最后我调用 process_data
来应用更改:
print(process_data(dt))
但是,我遇到了以下错误:
raise ValueError(
ValueError: all features must be in [0, 1] or [-2, 0]
错误是由于make_column_transformer
将列名或列索引作为输入,而不是数据。在您的情况下,正确的语法是
# Select columns by name: the list ['age'] hands MinMaxScaler a 2-D
# selection, while the bare string 'time' hands DateTimeTransformer a 1-D one.
make_column_transformer(
(MinMaxScaler(), ['age']),
(DateTimeTransformer(), 'time')
)
或者,等价地,
# Same selection by position: 'time' is column 0 and 'age' is column 1 of
# the frame; [1] is a 2-D selection, the bare 0 a 1-D one.
make_column_transformer(
(MinMaxScaler(), [1]),
(DateTimeTransformer(), 0)
)
对于 MinMaxScaler
你应该使用 ['age']
或 [1]
因为 MinMaxScaler
需要一个二维数组作为输入(例如 pd.DataFrame
),而对于 DateTimeTransformer
,您可以使用 'time'
或 0
,因为 DateTimeTransformer
需要一维数组作为输入(例如 pd.Series
)。这一点在官方文档(documentation)中有说明。
列名示例:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
np.random.seed(0)
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping "<date> <time>" strings to integers.

    Each string is split on a single space; the first piece is encoded by
    ``date_to_num``, the second by ``time_to_num``, and the two are summed.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Encode every element of ``X`` and stack the results as one column.

        ``X`` is iterated element by element, so each item must be a
        "<date> <time>" string (here: the 1-D selection of the time column).
        """
        encoded = [self.date_and_time_to_num(stamp) for stamp in X]
        return np.c_[encoded]

    def date_and_time_to_num(self, date_and_time):
        """Return ``date_to_num(date) + time_to_num(time)`` for one string."""
        pieces = date_and_time.split(' ')
        return self.date_to_num(pieces[0]) + self.time_to_num(pieces[1])

    def date_to_num(self, date):
        """Pack the three '/'-separated integers of ``date`` into one integer.

        The result is 10000 * first + 100 * second + third, so "1/4/2021"
        becomes 12421. NOTE: this is a positional code, not a count of
        seconds, and the field order is taken as written in the string.
        """
        first, second, third = (int(field) for field in date.split('/'))
        return first * 10000 + second * 100 + third

    def time_to_num(self, time_str):
        """Convert an "H:MM" string to seconds since midnight."""
        hours, minutes = (int(field) for field in time_str.split(':'))
        return (hours * 60 + minutes) * 60
def process_data(x):
    """Fit a fresh column transformer on ``x`` and return ``x`` transformed.

    The transformer comes from ``get_column_transformer()``; it is fitted
    and then applied to the very same data.
    """
    transformer = get_column_transformer()
    transformer.fit(X=x)
    transformed = transformer.transform(x)
    return transformed
def get_column_transformer():
    """Build the ColumnTransformer, selecting columns by name.

    ``['age']`` (a list) gives MinMaxScaler a 2-D selection; ``'time'``
    (a bare string) gives DateTimeTransformer a 1-D one.
    """
    age_step = (MinMaxScaler(), ['age'])
    time_step = (DateTimeTransformer(), 'time')
    return make_column_transformer(age_step, time_step)
# Demo input: five hourly timestamp strings plus five integer ages from
# np.random.randint(12, 80, 5).
df = pd.DataFrame(dict(
    time=[
        '1/4/2021 0:00',
        '1/4/2021 1:00',
        '1/4/2021 2:00',
        '1/4/2021 3:00',
        '1/4/2021 4:00',
    ],
    age=np.random.randint(12, 80, 5),
))
process_data(df)
# Output (column 0: min-max scaled age, column 1: encoded time):
# array([[0.00000000e+00, 1.24210000e+04],
# [1.30434783e-01, 1.60210000e+04],
# [8.69565217e-01, 1.96210000e+04],
# [1.00000000e+00, 2.32210000e+04],
# [1.00000000e+00, 2.68210000e+04]])
列索引示例:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
np.random.seed(0)
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping "<date> <time>" strings to integers.

    Each string is split on a single space; the first piece is encoded by
    ``date_to_num``, the second by ``time_to_num``, and the two are summed.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Encode every element of ``X`` and stack the results as one column.

        ``X`` is iterated element by element, so each item must be a
        "<date> <time>" string (here: the 1-D selection of column 0).
        """
        encoded = [self.date_and_time_to_num(stamp) for stamp in X]
        return np.c_[encoded]

    def date_and_time_to_num(self, date_and_time):
        """Return ``date_to_num(date) + time_to_num(time)`` for one string."""
        pieces = date_and_time.split(' ')
        return self.date_to_num(pieces[0]) + self.time_to_num(pieces[1])

    def date_to_num(self, date):
        """Pack the three '/'-separated integers of ``date`` into one integer.

        The result is 10000 * first + 100 * second + third, so "1/4/2021"
        becomes 12421. NOTE: this is a positional code, not a count of
        seconds, and the field order is taken as written in the string.
        """
        first, second, third = (int(field) for field in date.split('/'))
        return first * 10000 + second * 100 + third

    def time_to_num(self, time_str):
        """Convert an "H:MM" string to seconds since midnight."""
        hours, minutes = (int(field) for field in time_str.split(':'))
        return (hours * 60 + minutes) * 60
def process_data(x):
    """Fit a fresh column transformer on ``x`` and return ``x`` transformed.

    The transformer comes from ``get_column_transformer()``; it is fitted
    and then applied to the very same data.
    """
    transformer = get_column_transformer()
    transformer.fit(X=x)
    transformed = transformer.transform(x)
    return transformed
def get_column_transformer():
    """Build the ColumnTransformer, selecting columns by position.

    ``[1]`` (a list) gives MinMaxScaler a 2-D selection of the second
    column; the bare ``0`` gives DateTimeTransformer a 1-D selection of the
    first column.
    """
    age_step = (MinMaxScaler(), [1])
    time_step = (DateTimeTransformer(), 0)
    return make_column_transformer(age_step, time_step)
# Demo input: five hourly timestamp strings (column 0) plus five integer
# ages from np.random.randint(12, 80, 5) (column 1).
df = pd.DataFrame(dict(
    time=[
        '1/4/2021 0:00',
        '1/4/2021 1:00',
        '1/4/2021 2:00',
        '1/4/2021 3:00',
        '1/4/2021 4:00',
    ],
    age=np.random.randint(12, 80, 5),
))
process_data(df)
# Output (column 0: min-max scaled age, column 1: encoded time):
# array([[0.00000000e+00, 1.24210000e+04],
# [1.30434783e-01, 1.60210000e+04],
# [8.69565217e-01, 1.96210000e+04],
# [1.00000000e+00, 2.32210000e+04],
# [1.00000000e+00, 2.68210000e+04]])