为什么我所有的回归器的准确性都比我所有的分类器低得多?
Why do all my Regressors show much lower Accuracy than all my Classifiers?
我正在测试下面的一些示例代码。所有分类结果都非常合理(80% 或更多)。所有的回归结果都很糟糕,而且非常不正常(大约 20%)。为什么会这样?我一定是做错了什么,但我看不出这里有什么问题。
import pandas as pd
import numpy as np
# Read the dataset.
# A raw string is required here: in an ordinary string literal the "\t" of
# "\train.csv" is a TAB escape (and "\m" is an invalid escape sequence), so
# the Windows path would be silently corrupted.
df = pd.read_csv(r"C:\my_path\train.csv")
# Fill missing values.
# Results are assigned back instead of relying on inplace=True: the original
# bare `df.fillna(0)` discarded its return value (so nothing was filled), and
# in-place calls on a column selection may operate on a temporary copy.
df['Gender'] = df['Gender'].fillna('Male')
df = df.fillna(0)
# Encode the target as integers: 'Y' -> 1, 'N' -> 0.
df['Loan_Status'] = df['Loan_Status'].replace({'Y': 1, 'N': 0}).astype(int)
# Split the dataset into train and test (30% held out, fixed seed).
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.3, random_state=0)
# Features are every column except the target and the Loan_ID column.
x_train = train.drop(['Loan_Status', 'Loan_ID'], axis=1)
y_train = train['Loan_Status']
x_test = test.drop(['Loan_Status', 'Loan_ID'], axis=1)
y_test = test['Loan_Status']
# Create dummy (one-hot) columns.
# The test frame is re-indexed to the training columns so both matrices have
# the same features in the same order even when a category value occurs in
# only one of the two splits.
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test).reindex(columns=x_train.columns, fill_value=0)
# Fit several classifier/regressor pairs on the same features and the same
# 0/1 Loan_Status target, and evaluate each on the held-out split.
# NOTE: for scikit-learn estimators, score() is not the same metric in both
# families: classifiers report mean accuracy, regressors report R^2
# (coefficient of determination). The two numbers are therefore not
# directly comparable.
# Bagging Classifier
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
model.fit(x_train, y_train)
# Classifier score: mean accuracy on the test split.
model.score(x_test,y_test)
# Bagging Regressor
from sklearn.ensemble import BaggingRegressor
model = BaggingRegressor(tree.DecisionTreeRegressor(random_state=1))
model.fit(x_train, y_train)
# Regressor score: R^2 on the test split, not an accuracy.
model.score(x_test,y_test)
# AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(random_state=1)
model.fit(x_train, y_train)
# Classifier score: mean accuracy.
model.score(x_test,y_test)
# AdaBoostRegressor
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
model.fit(x_train, y_train)
# Regressor score: R^2.
model.score(x_test,y_test)
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
model= GradientBoostingClassifier(learning_rate=0.01,random_state=1)
model.fit(x_train, y_train)
# Classifier score: mean accuracy.
model.score(x_test,y_test)
# GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
model= GradientBoostingRegressor()
model.fit(x_train, y_train)
# Regressor score: R^2.
model.score(x_test,y_test)
# XGBClassifier
# NOTE(review): xgboost's scikit-learn wrappers presumably follow the same
# score() convention (accuracy for classifiers, R^2 for regressors) - confirm
# against the xgboost documentation.
import xgboost as xgb
model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)
model.fit(x_train, y_train)
model.score(x_test,y_test)
# XGBRegressor
import xgboost as xgb
model=xgb.XGBRegressor()
model.fit(x_train, y_train)
model.score(x_test,y_test)
示例数据来自下面的链接。
https://www.kaggle.com/wendykan/lending-club-loan-data
最后,这是我所看到的一小部分样本。
# Bagging Regressor
# NOTE: despite the variable name `accuracy`, a scikit-learn regressor's
# score() returns R^2 (coefficient of determination), so the value printed
# below is R^2 * 100, not a percentage of correct predictions.
from sklearn.ensemble import BaggingRegressor
regressor = BaggingRegressor()
regressor.fit(x_train,y_train)
accuracy = regressor.score(x_test,y_test)
print(accuracy*100,'%')
# result:
13.022388059701505 %
# Linear Regression
# NOTE: as with any scikit-learn regressor, score() returns R^2 here, so the
# printed number is R^2 * 100 rather than a classification accuracy.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train,y_train)
accuracy = regressor.score(x_test,y_test)
print(accuracy*100,'%')
# result:
29.836209522493196 %
回归和分类是两种不同的任务。从您的代码来看,您是在用与分类器相同的数据(目标为 0/1 类别标签)来拟合回归器。回归器试图找到一个函数,根据输入尽可能准确地预测一个数值输出,因此目标值应该是来自连续空间的数字,而不是类别。此外,两类模型的 score 含义也不同:分类器的 score 返回准确率,而回归器的 score 返回决定系数 R²,所以这两组数字不能直接比较。例如,更适合回归的任务是根据借款人的借款金额来预测其收入。
请参阅这篇 Medium 文章,以了解有关回归和分类之间差异的更多信息。
我正在测试下面的一些示例代码。所有分类结果都非常合理(80% 或更多)。所有的回归结果都很糟糕,而且非常不正常(大约 20%)。为什么会这样?我一定是做错了什么,但我看不出这里有什么问题。
import pandas as pd
import numpy as np
# Read the dataset.
# A raw string is required here: in an ordinary string literal the "\t" of
# "\train.csv" is a TAB escape (and "\m" is an invalid escape sequence), so
# the Windows path would be silently corrupted.
df = pd.read_csv(r"C:\my_path\train.csv")
# Fill missing values.
# Results are assigned back instead of relying on inplace=True: the original
# bare `df.fillna(0)` discarded its return value (so nothing was filled), and
# in-place calls on a column selection may operate on a temporary copy.
df['Gender'] = df['Gender'].fillna('Male')
df = df.fillna(0)
# Encode the target as integers: 'Y' -> 1, 'N' -> 0.
df['Loan_Status'] = df['Loan_Status'].replace({'Y': 1, 'N': 0}).astype(int)
# Split the dataset into train and test (30% held out, fixed seed).
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.3, random_state=0)
# Features are every column except the target and the Loan_ID column.
x_train = train.drop(['Loan_Status', 'Loan_ID'], axis=1)
y_train = train['Loan_Status']
x_test = test.drop(['Loan_Status', 'Loan_ID'], axis=1)
y_test = test['Loan_Status']
# Create dummy (one-hot) columns.
# The test frame is re-indexed to the training columns so both matrices have
# the same features in the same order even when a category value occurs in
# only one of the two splits.
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test).reindex(columns=x_train.columns, fill_value=0)
# Fit several classifier/regressor pairs on the same features and the same
# 0/1 Loan_Status target, and evaluate each on the held-out split.
# NOTE: for scikit-learn estimators, score() is not the same metric in both
# families: classifiers report mean accuracy, regressors report R^2
# (coefficient of determination). The two numbers are therefore not
# directly comparable.
# Bagging Classifier
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
model.fit(x_train, y_train)
# Classifier score: mean accuracy on the test split.
model.score(x_test,y_test)
# Bagging Regressor
from sklearn.ensemble import BaggingRegressor
model = BaggingRegressor(tree.DecisionTreeRegressor(random_state=1))
model.fit(x_train, y_train)
# Regressor score: R^2 on the test split, not an accuracy.
model.score(x_test,y_test)
# AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(random_state=1)
model.fit(x_train, y_train)
# Classifier score: mean accuracy.
model.score(x_test,y_test)
# AdaBoostRegressor
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
model.fit(x_train, y_train)
# Regressor score: R^2.
model.score(x_test,y_test)
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
model= GradientBoostingClassifier(learning_rate=0.01,random_state=1)
model.fit(x_train, y_train)
# Classifier score: mean accuracy.
model.score(x_test,y_test)
# GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
model= GradientBoostingRegressor()
model.fit(x_train, y_train)
# Regressor score: R^2.
model.score(x_test,y_test)
# XGBClassifier
# NOTE(review): xgboost's scikit-learn wrappers presumably follow the same
# score() convention (accuracy for classifiers, R^2 for regressors) - confirm
# against the xgboost documentation.
import xgboost as xgb
model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)
model.fit(x_train, y_train)
model.score(x_test,y_test)
# XGBRegressor
import xgboost as xgb
model=xgb.XGBRegressor()
model.fit(x_train, y_train)
model.score(x_test,y_test)
示例数据来自下面的链接。
https://www.kaggle.com/wendykan/lending-club-loan-data
最后,这是我所看到的一小部分样本。
# Bagging Regressor
# NOTE: despite the variable name `accuracy`, a scikit-learn regressor's
# score() returns R^2 (coefficient of determination), so the value printed
# below is R^2 * 100, not a percentage of correct predictions.
from sklearn.ensemble import BaggingRegressor
regressor = BaggingRegressor()
regressor.fit(x_train,y_train)
accuracy = regressor.score(x_test,y_test)
print(accuracy*100,'%')
# result:
13.022388059701505 %
# Linear Regression
# NOTE: as with any scikit-learn regressor, score() returns R^2 here, so the
# printed number is R^2 * 100 rather than a classification accuracy.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train,y_train)
accuracy = regressor.score(x_test,y_test)
print(accuracy*100,'%')
# result:
29.836209522493196 %
回归和分类是两种不同的任务。从您的代码来看,您是在用与分类器相同的数据(目标为 0/1 类别标签)来拟合回归器。回归器试图找到一个函数,根据输入尽可能准确地预测一个数值输出,因此目标值应该是来自连续空间的数字,而不是类别。此外,两类模型的 score 含义也不同:分类器的 score 返回准确率,而回归器的 score 返回决定系数 R²,所以这两组数字不能直接比较。例如,更适合回归的任务是根据借款人的借款金额来预测其收入。
请参阅这篇 Medium 文章,以了解有关回归和分类之间差异的更多信息。