OneHotEncoder ValueError: Input contains NaN
OneHotEncoder ValueError: Input contains NaN
我已经下载了这份数据,以下是我的代码:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import plotly.figure_factory as ff
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
# First attempt from the question: load the data, split it, and try to impute,
# encode and scale inside a single ColumnTransformer.
# NOTE: this module-level seed is not used below; train_test_split is given
# random_state=42 explicitly.
random_state = 27912
df_train = pd.read_csv("...")
df_test = pd.read_csv("...")
# Hold out 20% of the rows. The target ("Survived") and the Ticket, Cabin, Name
# and PassengerId columns are removed from the feature frame.
X_train, X_test, y_train, y_test = train_test_split(df_train.drop(["Survived", "Ticket", "Cabin", "Name", "PassengerId"],
axis = 1),
df_train["Survived"], test_size=0.2,
random_state=42)
# Column groups, by the kind of preprocessing intended for each.
numeric_col_names = ["Age", "SibSp", "Parch", "Fare"]
ordinal_col_names = ["Pclass"]
one_hot_col_names = ["Embarked", "Sex"]
# NOTE(review): a ColumnTransformer fits every (transformer, columns) pair on
# the columns of the ORIGINAL input and concatenates the outputs side by side;
# the output of one entry is not fed into the next. The imputers listed first
# therefore do not clean the data seen by OrdinalEncoder / OneHotEncoder /
# StandardScaler, which is consistent with the reported
# "ValueError: Input contains NaN" from the one-hot step.
# NOTE(review): StandardScaler is also handed the raw "Embarked" and "Sex"
# columns here — presumably string-valued, which it cannot scale; confirm.
ct = make_column_transformer(
(SimpleImputer(strategy="median"), numeric_col_names),
(SimpleImputer(strategy="most_frequent"), ordinal_col_names + one_hot_col_names),
(OrdinalEncoder(), ordinal_col_names),
(OneHotEncoder(), one_hot_col_names),
(StandardScaler(), ordinal_col_names + one_hot_col_names + numeric_col_names))
# Wrapping the transformer in a single-step Pipeline does not change the
# behaviour described above.
preprocessing_pipeline = Pipeline([("transformers", ct)])
preprocessing_pipeline.fit_transform(X_train)
我正在尝试用 make_column_transformer 来完成预处理步骤,但是 OneHotEncoder 这一步报错:ValueError: Input contains NaN。我真的不明白为什么会这样,因为我在前面已经对这些缺失值做了填补(impute)。有人知道为什么会发生这种情况吗?
像下面这样分成两个阶段来尝试,也没有解决问题:
# Second attempt from the question: run a first transformer (ct_first), then
# encode and scale in a second ColumnTransformer chained after it.
# NOTE(review): ct_first is not defined anywhere in the visible snippet —
# presumably a ColumnTransformer holding only the two SimpleImputer entries;
# confirm against the original post.
preprocessing_pipeline = Pipeline([("transformers", ct_first)])
# NOTE(review): by default a ColumnTransformer returns a NumPy array (or sparse
# matrix), so the output of the first stage no longer carries column names and
# selecting columns by name in this second stage is expected to fail — verify
# with the scikit-learn version in use. In addition, StandardScaler here still
# receives the un-encoded "Embarked"/"Sex" columns, not the OneHotEncoder
# output, because entries of a ColumnTransformer are not chained.
ct_second = make_column_transformer((OneHotEncoder(), one_hot_col_names),(StandardScaler(), ordinal_col_names + one_hot_col_names + numeric_col_names))
pipeline = Pipeline([("transformer1", preprocessing_pipeline), ("transformer2", ct_second)])
pipeline.fit_transform(X_train)
我想知道为什么会出现这个错误,以及为什么上面的第一次和第二次尝试都不正确。
谢谢
ColumnTransformer 中的各个转换器是分别作用于原始输入列的,前一个转换器的输出不会传给后一个,因此先写的 SimpleImputer 并不会影响后面的编码器。您需要为每种列类型分别创建一个管道(Pipeline),以确保各个步骤按顺序依次应用(即在编码和缩放之前先填补缺失值)。另请参阅 scikit-learn 文档中的相关示例。
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
# Build one preprocessing Pipeline per column type and combine them with a
# ColumnTransformer, so that for every column group the steps run in order:
# impute -> (encode) -> scale.

# Load the data (from https://www.kaggle.com/c/titanic/data)
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Extract the features; the test file is not asked to drop 'Survived'
X_train = df_train.drop(labels=['Survived', 'Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)
X_test = df_test.drop(labels=['Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)

# Map the feature names to the corresponding
# types (numerical, ordinal or categorical)
numeric_col_names = ['Age', 'SibSp', 'Parch', 'Fare']
ordinal_col_names = ['Pclass']
one_hot_col_names = ['Embarked', 'Sex']

# Define the numerical features pipeline: median imputation, then scaling
numeric_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Define the ordinal features pipeline: mode imputation, integer encoding,
# then scaling
ordinal_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
    ('scaler', StandardScaler()),
])

# Define the categorical features pipeline: mode imputation, one-hot
# encoding, then scaling.
# NOTE: the `sparse` keyword of OneHotEncoder was renamed to `sparse_output`
# in scikit-learn 1.2 and the old name was removed in 1.4. Dense output is
# requested because StandardScaler (with its default centering) cannot be
# applied to a sparse matrix. On scikit-learn < 1.2 use `sparse=False`.
one_hot_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False)),
    ('scaler', StandardScaler()),
])

# Create the overall preprocessing pipeline; each sub-pipeline only sees
# its own column group
preprocessing_pipeline = make_column_transformer(
    (numeric_col_transformer, numeric_col_names),
    (ordinal_col_transformer, ordinal_col_names),
    (one_hot_col_transformer, one_hot_col_names),
)

# Fit the pipeline to the training data only, so that imputation values,
# category lists and scaling statistics come from the training set
preprocessing_pipeline.fit(X_train)

# Apply the fitted pipeline to the training and test data
X_train_ = preprocessing_pipeline.transform(X_train)
X_test_ = preprocessing_pipeline.transform(X_test)
我已经下载了这份数据,以下是我的代码:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import plotly.figure_factory as ff
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
# First attempt from the question: load the data, split it, and try to impute,
# encode and scale inside a single ColumnTransformer.
# NOTE: this module-level seed is not used below; train_test_split is given
# random_state=42 explicitly.
random_state = 27912
df_train = pd.read_csv("...")
df_test = pd.read_csv("...")
# Hold out 20% of the rows. The target ("Survived") and the Ticket, Cabin, Name
# and PassengerId columns are removed from the feature frame.
X_train, X_test, y_train, y_test = train_test_split(df_train.drop(["Survived", "Ticket", "Cabin", "Name", "PassengerId"],
axis = 1),
df_train["Survived"], test_size=0.2,
random_state=42)
# Column groups, by the kind of preprocessing intended for each.
numeric_col_names = ["Age", "SibSp", "Parch", "Fare"]
ordinal_col_names = ["Pclass"]
one_hot_col_names = ["Embarked", "Sex"]
# NOTE(review): a ColumnTransformer fits every (transformer, columns) pair on
# the columns of the ORIGINAL input and concatenates the outputs side by side;
# the output of one entry is not fed into the next. The imputers listed first
# therefore do not clean the data seen by OrdinalEncoder / OneHotEncoder /
# StandardScaler, which is consistent with the reported
# "ValueError: Input contains NaN" from the one-hot step.
# NOTE(review): StandardScaler is also handed the raw "Embarked" and "Sex"
# columns here — presumably string-valued, which it cannot scale; confirm.
ct = make_column_transformer(
(SimpleImputer(strategy="median"), numeric_col_names),
(SimpleImputer(strategy="most_frequent"), ordinal_col_names + one_hot_col_names),
(OrdinalEncoder(), ordinal_col_names),
(OneHotEncoder(), one_hot_col_names),
(StandardScaler(), ordinal_col_names + one_hot_col_names + numeric_col_names))
# Wrapping the transformer in a single-step Pipeline does not change the
# behaviour described above.
preprocessing_pipeline = Pipeline([("transformers", ct)])
preprocessing_pipeline.fit_transform(X_train)
我正在尝试用 make_column_transformer 来完成预处理步骤,但是 OneHotEncoder 这一步报错:ValueError: Input contains NaN。我真的不明白为什么会这样,因为我在前面已经对这些缺失值做了填补(impute)。有人知道为什么会发生这种情况吗?
像下面这样分成两个阶段来尝试,也没有解决问题:
# Second attempt from the question: run a first transformer (ct_first), then
# encode and scale in a second ColumnTransformer chained after it.
# NOTE(review): ct_first is not defined anywhere in the visible snippet —
# presumably a ColumnTransformer holding only the two SimpleImputer entries;
# confirm against the original post.
preprocessing_pipeline = Pipeline([("transformers", ct_first)])
# NOTE(review): by default a ColumnTransformer returns a NumPy array (or sparse
# matrix), so the output of the first stage no longer carries column names and
# selecting columns by name in this second stage is expected to fail — verify
# with the scikit-learn version in use. In addition, StandardScaler here still
# receives the un-encoded "Embarked"/"Sex" columns, not the OneHotEncoder
# output, because entries of a ColumnTransformer are not chained.
ct_second = make_column_transformer((OneHotEncoder(), one_hot_col_names),(StandardScaler(), ordinal_col_names + one_hot_col_names + numeric_col_names))
pipeline = Pipeline([("transformer1", preprocessing_pipeline), ("transformer2", ct_second)])
pipeline.fit_transform(X_train)
我想知道为什么会出现这个错误,以及为什么上面的第一次和第二次尝试都不正确。谢谢
ColumnTransformer 中的各个转换器是分别作用于原始输入列的,前一个转换器的输出不会传给后一个,因此先写的 SimpleImputer 并不会影响后面的编码器。您需要为每种列类型分别创建一个管道(Pipeline),以确保各个步骤按顺序依次应用(即在编码和缩放之前先填补缺失值)。另请参阅 scikit-learn 文档中的相关示例。
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
# Build one preprocessing Pipeline per column type and combine them with a
# ColumnTransformer, so that for every column group the steps run in order:
# impute -> (encode) -> scale.

# Load the data (from https://www.kaggle.com/c/titanic/data)
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Extract the features; the test file is not asked to drop 'Survived'
X_train = df_train.drop(labels=['Survived', 'Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)
X_test = df_test.drop(labels=['Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)

# Map the feature names to the corresponding
# types (numerical, ordinal or categorical)
numeric_col_names = ['Age', 'SibSp', 'Parch', 'Fare']
ordinal_col_names = ['Pclass']
one_hot_col_names = ['Embarked', 'Sex']

# Define the numerical features pipeline: median imputation, then scaling
numeric_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Define the ordinal features pipeline: mode imputation, integer encoding,
# then scaling
ordinal_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
    ('scaler', StandardScaler()),
])

# Define the categorical features pipeline: mode imputation, one-hot
# encoding, then scaling.
# NOTE: the `sparse` keyword of OneHotEncoder was renamed to `sparse_output`
# in scikit-learn 1.2 and the old name was removed in 1.4. Dense output is
# requested because StandardScaler (with its default centering) cannot be
# applied to a sparse matrix. On scikit-learn < 1.2 use `sparse=False`.
one_hot_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False)),
    ('scaler', StandardScaler()),
])

# Create the overall preprocessing pipeline; each sub-pipeline only sees
# its own column group
preprocessing_pipeline = make_column_transformer(
    (numeric_col_transformer, numeric_col_names),
    (ordinal_col_transformer, ordinal_col_names),
    (one_hot_col_transformer, one_hot_col_names),
)

# Fit the pipeline to the training data only, so that imputation values,
# category lists and scaling statistics come from the training set
preprocessing_pipeline.fit(X_train)

# Apply the fitted pipeline to the training and test data
X_train_ = preprocessing_pipeline.transform(X_train)
X_test_ = preprocessing_pipeline.transform(X_test)