How do I change - using for loops to call multiple functions - into - using a pipeline to call a class?
So the basic requirement is that I get a dictionary of models and a dictionary of their hyperparameters from the user, and give back a report. Currently the target is binary classification, but this can be extended later.
This is what I am currently doing:
import numpy as np
import pandas as pd
# import pandas_profiling as pp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, make_scorer
from sklearn import datasets
# import joblib
import warnings
warnings.filterwarnings('ignore')
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
target = df['target']
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='target', axis=1), target, test_size=0.4, random_state=13, stratify=target)
def build_model(model_name, model_class, params=None):
"""
return model instance
"""
if 'Ridge' in model_name:
model = model_class(penalty='l2')
    elif 'Lasso' in model_name:
        model = model_class(penalty='l1', solver='liblinear')  # the default solver (lbfgs) does not support l1
elif 'Ensemble' in model_name:
model = model_class(estimators=[('rf', RandomForestClassifier()), ('gbm', GradientBoostingClassifier())], voting='hard')
else:
model = model_class()
if params is not None:
print('Custom Model Parameters provided. Implementing Randomized Search for {} model'.format(model_name))
rscv = RandomizedSearchCV(estimator=model, param_distributions=params[model_name],
random_state=22, n_iter=10, cv=5, verbose=1, n_jobs=-1,
scoring=make_scorer(f1_score), error_score=0.0)
return rscv
print('No model parameters provided. Using sklearn default values for {} model'.format(model_name))
return model
def fit_model(model_name, model_instance, xTrain, yTrain):
"""
fit model
"""
if model_name == 'SVM':
scaler = StandardScaler()
model = model_instance.fit(scaler.fit_transform(xTrain), yTrain)
else:
model = model_instance.fit(xTrain, yTrain)
return model
def predict_vals(model_name, fitted_model, xTest):
    """
    predict and return vals
    """
    if model_name == 'SVM':
        # NOTE: this fits a fresh scaler on the test set; ideally the scaler
        # fitted on the training data should be reused (e.g. via a Pipeline)
        scaler = StandardScaler()
        y_prediction = fitted_model.predict(scaler.fit_transform(xTest))
    else:
        y_prediction = fitted_model.predict(xTest)
    return y_prediction
def get_metrics(yTest, y_prediction):
"""
get metrics after getting prediction
"""
return [recall_score(yTest, y_prediction),
precision_score(yTest, y_prediction),
f1_score(yTest, y_prediction),
roc_auc_score(yTest, y_prediction)]
def model_report(list_of_metrics):
"""
add metrics to df, return df
"""
df = pd.DataFrame(list_of_metrics, columns=['Model', 'Recall', 'Precision', 'f1', 'roc_auc'])
df = df.round(3)
return df
models = {
'Logistic Regression Ridge': LogisticRegression,
'Logistic Regression Lasso': LogisticRegression,
'Random Forest': RandomForestClassifier,
'SVM': SVC,
'GBM': GradientBoostingClassifier,
'EnsembleRFGBM': VotingClassifier
}
model_parameters = {
'SVM': {
'C': np.random.uniform(50, 1, [25]),#[1, 10, 100, 1000],
'class_weight': ['balanced'],
'gamma': [0.0001, 0.001],
'kernel': ['linear']
},
'Random Forest': {
'n_estimators': [5, 10, 50, 100, 200],
'max_depth': [3, 5, 10, 20, 40],
'criterion': ['gini', 'entropy'],
'bootstrap': [True, False],
'min_samples_leaf': [np.random.randint(1,10)]
},
'Logistic Regression Ridge': {
'C': np.random.rand(25),
'class_weight': ['balanced']
},
'Logistic Regression Lasso': {
'C': np.random.rand(25),
'class_weight': ['balanced']
},
'GBM': {
'n_estimators': [10, 50, 100, 200, 500],
'max_depth': [3, 5, 10, None],
'min_samples_leaf': [np.random.randint(1,10)]
},
'EnsembleRFGBM': {
'rf__n_estimators': [5, 10, 50, 100, 200],
'rf__max_depth': [3, 5, 10, 20, 40],
'rf__min_samples_leaf': [np.random.randint(1,10)],
'gbm__n_estimators': [10, 50, 100, 200, 500],
'gbm__max_depth': [3, 5, 10, None],
'gbm__min_samples_leaf': [np.random.randint(1,10)]
}
}
Without parameters I get the following report.
# without parameters
lst = []
for model_name, model_class in models.items():
model_instance = build_model(model_name, model_class)
fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(model_name, fitted_model, X_test)
metrics = get_metrics(y_test, y_predicted)
lst.append([model_name] + metrics)
model_report(lst)
Taking the parameters as input:
# with parameters
lst = []
for model_name, model_class in models.items():
model_instance = build_model(model_name, model_class, model_parameters)
fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(model_name, fitted_model, X_test)
metrics = get_metrics(y_test, y_predicted)
lst.append([model_name] + metrics)
model_report(lst)
Now the task given to me is as follows:
- Take a dictionary of models and their parameters from the user. If no parameters are provided, use the models' default values.
- Give the report as output (as shown in the figures)
I was told that I should change the functions into classes, and avoid for loops as much as possible.
My challenges:
- How do I change all the functions into a class and methods? Basically what my senior wants is
report.getReport # gives the dataFrame of the report
But the above seems doable to me with a function like the one below (I don't understand why/how a class would be beneficial; for comparison, a class-shaped sketch is included after this snippet):
customReport(whatever inputs I'd like to give) # gives df of report
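(A minimal class-shaped sketch of the same idea, reusing the question's own helpers (build_model, fit_model, predict_vals, get_metrics, model_report); the CustomReport name is hypothetical:)
class CustomReport:
    def __init__(self, models, params=None):
        self.models = models  # {name: model class}
        self.params = params  # optional hyperparameter grids

    def get_report(self, X_train, X_test, y_train, y_test):
        rows = []
        for name, cls in self.models.items():  # the same loop, now encapsulated
            fitted = fit_model(name, build_model(name, cls, self.params), X_train, y_train)
            rows.append([name] + get_metrics(y_test, predict_vals(name, fitted, X_test)))
        return model_report(rows)

# usage: CustomReport(models).get_report(X_train, X_test, y_train, y_test)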
- How do I avoid for loops while taking the user's inputs for the various models? I think I can use an sklearn Pipeline, since my understanding is that a pipeline is a series of steps, so I could take the parameters and models from the user and execute them as a series of steps. That avoids the for loops.
Something like this (see the note after this snippet):
customPipeline = Pipeline([('rf', RandomForestClassifier(with relevant params from params dict)),
                           ('svc', SVC(with relevant params from params dict))])
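(A caveat worth noting: in scikit-learn, every intermediate Pipeline step must be a transformer, i.e. implement fit/transform; only the final step may be a plain estimator, so chaining two classifiers as above will not run. A minimal sketch of a valid pipeline, ending in a single model:)
from sklearn.pipeline import Pipeline

customPipeline = Pipeline([
    ('scaler', StandardScaler()),  # transformer step(s) first
    ('clf', SVC())                 # exactly one estimator, as the final step
])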
A similar solution I found is here, but I want to avoid for loops like that.
Another related solution here uses a class that can switch between different models. But here I would require the user to be able to choose whether he wants to do GridSearch/RandomizedSearch/CV/None. My thinking is that I use this class, then inherit it into another class where the user can give an input to choose GridSearch/RandomizedSearch/CV/None, etc. I'm not sure whether I'm thinking in the right direction; a rough skeleton of what I mean follows.
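(A hedged skeleton of that direction, with hypothetical names, just to make it concrete:)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

class SearchableReport:
    """Hypothetical skeleton: the user picks the search strategy."""
    def __init__(self, search='grid'):
        self.search = search  # 'grid', 'random', or None

    def build_search(self, estimator, params, **kwargs):
        if self.search == 'grid':
            return GridSearchCV(estimator, params, **kwargs)
        if self.search == 'random':
            return RandomizedSearchCV(estimator, params, **kwargs)
        return estimator  # None: no search, fit the estimator as-is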
Note: A complete working solution is desirable (I would love it) but not mandatory. It's fine if your answer has a skeleton that gives me a direction on how to proceed. I can explore and learn from it.
You could consider using map(); details here: https://www.geeksforgeeks.org/python-map-function/
Some programmers have the habit of avoiding raw loops - "A raw loop is any loop inside a function where the function serves a purpose larger than the algorithm implemented by the loop". More details here: https://sean-parent.stlab.cc/presentations/2013-09-11-cpp-seasoning/cpp-seasoning.pdf
I think that is why you were asked to remove the for loops; a map()-based sketch follows.
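(A minimal sketch of the map() idea, reusing the helper functions from the question above; the evaluate() wrapper is hypothetical:)
def evaluate(item, params=None):
    name, cls = item
    fitted = fit_model(name, build_model(name, cls, params), X_train, y_train)
    return [name] + get_metrics(y_test, predict_vals(name, fitted, X_test))

model_report(list(map(evaluate, models.items())))  # no explicit for loop at the call site
# with parameters: list(map(functools.partial(evaluate, params=model_parameters), models.items()))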
I have implemented a working solution. I should have worded my question better. I had initially misunderstood how GridSearchCV or RandomizedSearchCV work internally: cv_results_ gives all the results of the grid; I had thought that only the best estimator was available to us.
Using this, for each type of model I take the best rank_test_score (rank 1 is the best candidate in cv_results_) and get the parameters that make up that model. In this example that is 4 models. Now I run each of those models, i.e. the best combination of parameters for each model, against my test data, and predict the required scores. I think this solution can be extended to RandomizedSearchCV and many more options; a sketch of that swap follows.
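(A hedged sketch of that extension, assuming a scikit-learn version where RandomizedSearchCV's param_distributions accepts a list of dicts the same way GridSearchCV's param_grid does; pipeline and parameters are the objects defined in the code further below:)
from sklearn.model_selection import RandomizedSearchCV

random_grid = RandomizedSearchCV(pipeline, parameters, n_iter=10,
                                 scoring='f1', n_jobs=-1, random_state=22)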
Note: this is just a simple solution. A lot of modifications would still be needed, such as the need to scale the data for specific models, etc. This solution is only a starting point that can be modified according to the user's needs; one possible way to handle the scaling caveat is sketched below.
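(A hedged sketch of the scaling extension, assuming a scikit-learn version where a param grid may replace a pipeline step with 'passthrough'; it reuses the ClfSwitcher defined below, and scaled_pipeline/scaled_parameters are hypothetical names:)
from sklearn.preprocessing import StandardScaler

scaled_pipeline = Pipeline([('scaler', StandardScaler()), ('cst', ClfSwitcher())])
scaled_parameters = [
    {'scaler': [StandardScaler()], 'cst__model': [SVC()]},                  # scale for the SVM
    {'scaler': ['passthrough'], 'cst__model': [RandomForestClassifier()]},  # trees do not need scaling
]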
Thanks to this answer for the ClfSwitcher() class.
Below is the implementation of the class (suggestions for improvement are welcome):
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
import warnings
warnings.filterwarnings('ignore')
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
target = df['target']
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='target', axis=1), target, test_size=0.4, random_state=13, stratify=target)
class ClfSwitcher(BaseEstimator):
def __init__(self, model=RandomForestClassifier()):
"""
A Custom BaseEstimator that can switch between classifiers.
        :param model: sklearn object - the classifier
"""
self.model = model
def fit(self, X, y=None, **kwargs):
self.model.fit(X, y)
return self
def predict(self, X, y=None):
return self.model.predict(X)
def predict_proba(self, X):
return self.model.predict_proba(X)
def score(self, X, y):
        return self.model.score(X, y)  # was self.estimator, which is not an attribute of this class
class report(ClfSwitcher):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.grid = None
self.full_report = None
self.concise_report = None
self.scoring_metrics = {
'precision': precision_score,
'recall': recall_score,
'f1': f1_score,
'roc_auc': roc_auc_score
}
    def griddy(self, pipeLine, parameters, scoring='accuracy', **kwargs):
        # honour the scoring passed by the caller (it was previously swallowed by **kwargs and ignored)
        self.grid = GridSearchCV(pipeLine, parameters, scoring=scoring, n_jobs=-1, **kwargs)
def fit_grid(self, X_train, y_train=None, **kwargs):
self.grid.fit(X_train, y_train)
def make_grid_report(self):
self.full_report = pd.DataFrame(self.grid.cv_results_)
@staticmethod
def get_names(col):
return col.__class__.__name__
@staticmethod
    def calc_score(col, metric):
        # NOTE: relies on the module-level X_train/y_train/X_test/y_test globals
        return round(metric(y_test, col.fit(X_train, y_train).predict(X_test)), 4)
def make_concise_report(self):
self.concise_report = pd.DataFrame(self.grid.cv_results_)
self.concise_report['model_names'] = self.concise_report['param_cst__model'].apply(self.get_names)
        self.concise_report = self.concise_report.sort_values(['model_names', 'rank_test_score'], ascending=[True, True]) \
            .groupby(['model_names']).head(1)[['param_cst__model', 'model_names']] \
            .reset_index(drop=True)  # rank 1 is the best candidate in cv_results_, so sort ranks ascending
for metric_name, metric_func in self.scoring_metrics.items():
self.concise_report[metric_name] = self.concise_report['param_cst__model'].apply(self.calc_score, metric=metric_func)
self.concise_report = self.concise_report[['model_names', 'precision', 'recall', 'f1', 'roc_auc', 'param_cst__model']]
pipeline = Pipeline([
('cst', ClfSwitcher()),
])
parameters = [
{
'cst__model': [RandomForestClassifier()],
'cst__model__n_estimators': [10, 20],
'cst__model__max_depth': [5, 10],
'cst__model__criterion': ['gini', 'entropy']
},
{
'cst__model': [SVC()],
'cst__model__C': [10, 20],
'cst__model__kernel': ['linear'],
'cst__model__gamma': [0.0001, 0.001]
},
{
        'cst__model': [LogisticRegression(solver='liblinear')],  # liblinear supports both l1 and l2 penalties
'cst__model__C': [13, 17],
'cst__model__penalty': ['l1', 'l2']
},
{
'cst__model': [GradientBoostingClassifier()],
'cst__model__n_estimators': [10, 50],
'cst__model__max_depth': [3, 5],
'cst__model__min_samples_leaf': [1, 2]
}
]
my_report = report()
my_report.griddy(pipeline, parameters, scoring='f1')
my_report.fit_grid(X_train, y_train)
my_report.make_concise_report()
my_report.concise_report
This outputs the report as required.