如何在 python 中使用 pipeline 和 ColumnTransformer?
How to use pipeline and ColumnTransformer in python?
我正在尝试 运行 python sklearn 中 this 数据集的线性回归。我想估算 NaN 值,在三个特定的列中,我想用 NaN 替换零,这样我也可以估算这些值。注意:还有其他带零的列不需要估算。
关于推算我想参考ColumnTransformer and then use pipeline to run the linear regression. I used this篇文章。但是,我收到一个错误:ValueError: Specifying the columns using strings is only supported for pandas DataFrames
有人可以验证我的过程,如果我做错了什么让我知道吗?
df[['percentage_expenditure', 'income_composition_of_resources', 'schooling']] = df[['percentage_expenditure', 'income_composition_of_resources', 'schooling']].replace(0, nan)
# Split dataframe into input and output variables.
x = df.iloc[:,df.columns != 'life_expectancy']
y = df.iloc[:, 3] # life_expectancy is the output column (variable)
# Split into train and test datasets.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=40)
# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Build transformers OnHotEncoder and SimpleImputer for categorical and numeric columns respectively.
categorical_features = ['country', 'year', 'status']
numerical_features = ['adult_mortality', 'alcohol', 'percentage_expenditure',
'hepatitis_b', 'bmi', 'polio',
'total_expenditure', 'diphtheria', 'gdp',
'population', 'thinness_1-19_years',
'income_composition_of_resources',
'income_composition_of_resources', 'schooling']
transformers = [
('onehot', OneHotEncoder(), categorical_features),
('impute', SimpleImputer(strategy='median'), numerical_features)
]
column_transformer = ColumnTransformer(transformers, remainder='passthrough')
X_train_transformed = column_transformer.fit_transform(X_train)
column_transformer.named_transformers_['onehot'].get_feature_names()
# X_train_transformed
lr = LinearRegression()
pipe = Pipeline([
('preprocessing', column_transformer),
('lr', lr)
])
# edit this part to X_train instead of X_train_transformed
pipe.fit(X_train, y_train)
# Still have an error. See below
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-56-f4e9a8bf0d7a> in <module>
72 ('lr', lr)
73 ])
---> 74 pipe.fit(X_train, y_train)
75
76
/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
333 if self._final_estimator != 'passthrough':
334 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 335 self._final_estimator.fit(Xt, y, **fit_params_last_step)
336
337 return self
/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_base.py in fit(self, X, y, sample_weight)
503
504 n_jobs_ = self.n_jobs
--> 505 X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
506 y_numeric=True, multi_output=True)
507
/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
430 y = check_array(y, **check_y_params)
431 else:
--> 432 X, y = check_X_y(X, y, **check_params)
433 out = X, y
434
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
793 raise ValueError("y cannot be None")
794
--> 795 X = check_array(X, accept_sparse=accept_sparse,
796 accept_large_sparse=accept_large_sparse,
797 dtype=dtype, order=order, copy=copy,
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
573 if sp.issparse(array):
574 _ensure_no_complex_data(array)
--> 575 array = _ensure_sparse_format(array, accept_sparse=accept_sparse,
576 dtype=dtype, copy=copy,
577 force_all_finite=force_all_finite,
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse)
382 % spmatrix.format, stacklevel=2)
383 else:
--> 384 _assert_all_finite(spmatrix.data,
385 allow_nan=force_all_finite == 'allow-nan')
386
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
94 not allow_nan and not np.isfinite(X).all()):
95 type_err = 'infinity' if allow_nan else 'NaN, infinity'
---> 96 raise ValueError(
97 msg_err.format
98 (type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
您遇到了缺失值的问题。我检查了您的数据集:您的目标变量 ("life_expectancy") 中有 10 个缺失值。您需要删除这些行:df = df.dropna(subset=["Life expectancy"])
。此外,您在未估算的列中缺少值(例如“瘦 5-9 岁”)。通过这些修改,它在我的电脑上运行良好。
我正在尝试 运行 python sklearn 中 this 数据集的线性回归。我想估算 NaN 值,在三个特定的列中,我想用 NaN 替换零,这样我也可以估算这些值。注意:还有其他带零的列不需要估算。
关于推算我想参考ColumnTransformer and then use pipeline to run the linear regression. I used this篇文章。但是,我收到一个错误:ValueError: Specifying the columns using strings is only supported for pandas DataFrames
有人可以验证我的过程,如果我做错了什么让我知道吗?
df[['percentage_expenditure', 'income_composition_of_resources', 'schooling']] = df[['percentage_expenditure', 'income_composition_of_resources', 'schooling']].replace(0, nan)
# Split dataframe into input and output variables.
x = df.iloc[:,df.columns != 'life_expectancy']
y = df.iloc[:, 3] # life_expectancy is the output column (variable)
# Split into train and test datasets.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=40)
# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Build transformers OnHotEncoder and SimpleImputer for categorical and numeric columns respectively.
categorical_features = ['country', 'year', 'status']
numerical_features = ['adult_mortality', 'alcohol', 'percentage_expenditure',
'hepatitis_b', 'bmi', 'polio',
'total_expenditure', 'diphtheria', 'gdp',
'population', 'thinness_1-19_years',
'income_composition_of_resources',
'income_composition_of_resources', 'schooling']
transformers = [
('onehot', OneHotEncoder(), categorical_features),
('impute', SimpleImputer(strategy='median'), numerical_features)
]
column_transformer = ColumnTransformer(transformers, remainder='passthrough')
X_train_transformed = column_transformer.fit_transform(X_train)
column_transformer.named_transformers_['onehot'].get_feature_names()
# X_train_transformed
lr = LinearRegression()
pipe = Pipeline([
('preprocessing', column_transformer),
('lr', lr)
])
# edit this part to X_train instead of X_train_transformed
pipe.fit(X_train, y_train)
# Still have an error. See below
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-56-f4e9a8bf0d7a> in <module>
72 ('lr', lr)
73 ])
---> 74 pipe.fit(X_train, y_train)
75
76
/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
333 if self._final_estimator != 'passthrough':
334 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 335 self._final_estimator.fit(Xt, y, **fit_params_last_step)
336
337 return self
/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_base.py in fit(self, X, y, sample_weight)
503
504 n_jobs_ = self.n_jobs
--> 505 X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
506 y_numeric=True, multi_output=True)
507
/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
430 y = check_array(y, **check_y_params)
431 else:
--> 432 X, y = check_X_y(X, y, **check_params)
433 out = X, y
434
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
793 raise ValueError("y cannot be None")
794
--> 795 X = check_array(X, accept_sparse=accept_sparse,
796 accept_large_sparse=accept_large_sparse,
797 dtype=dtype, order=order, copy=copy,
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
573 if sp.issparse(array):
574 _ensure_no_complex_data(array)
--> 575 array = _ensure_sparse_format(array, accept_sparse=accept_sparse,
576 dtype=dtype, copy=copy,
577 force_all_finite=force_all_finite,
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse)
382 % spmatrix.format, stacklevel=2)
383 else:
--> 384 _assert_all_finite(spmatrix.data,
385 allow_nan=force_all_finite == 'allow-nan')
386
/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
94 not allow_nan and not np.isfinite(X).all()):
95 type_err = 'infinity' if allow_nan else 'NaN, infinity'
---> 96 raise ValueError(
97 msg_err.format
98 (type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
您遇到了缺失值的问题。我检查了您的数据集:您的目标变量 ("life_expectancy") 中有 10 个缺失值。您需要删除这些行:df = df.dropna(subset=["Life expectancy"])
。此外,您在未估算的列中缺少值(例如“瘦 5-9 岁”)。通过这些修改,它在我的电脑上运行良好。