如何在 sklearn 中并行化多个模型构建过程
How to parallelize multiple model-building procedures in sklearn
有没有办法在 scikit-learn 中并行化多个模型构建过程?我知道可以在 `GridSearchCV` 和 `cross_validate` 中使用 `n_jobs` 参数,在单个模型构建过程内部实现一定程度的并行化。但是,我是在一个 for 循环中用不同的输入参数依次运行多个模型构建过程,并把结果保存到列表中。举个例子,假设我有 15 个空闲 CPU,并且在 `cross_validate` 中设置了 `n_jobs=5`。如果我没理解错,这意味着一个模型构建过程会使用 5 个 CPU。那么有没有办法让 for 循环中接下来的 2 个模型构建过程也同时启动,从而用满全部 15 个 CPU?下面是一个简化的示例:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
# Load the breast cancer data set as a feature matrix X and target vector y.
X, y = load_breast_cancer(return_X_y=True)

# Penalty strategies to compare. This is a toy example: 'l2' is used three
# times, but imagine these were three different strategies.
penalty_types = ['l2', 'l2', 'l2']

# Collects one cross_validate result per penalty strategy.
nested_cv_scores_list = []

for penalty_type in penalty_types:
    # Create a random number generator (re-seeded identically each iteration).
    rng = np.random.RandomState(42)

    # z-standardize features
    scaler = StandardScaler()

    # Logistic regression classifier using the current penalty strategy.
    lr = LogisticRegression(random_state=rng, penalty=penalty_type)

    # Parameter grid to optimize over (optimize C); the 'lr__' prefix
    # addresses the pipeline step named 'lr'.
    lr_c = np.linspace(start=1, stop=16, num=11, endpoint=True)
    p_grid = {'lr__C': lr_c}

    # Pipeline: scaling followed by the classifier.
    lr_pipe = Pipeline([
        ('scaler', scaler),
        ('lr', lr),
    ])

    # Cross validation strategy (used for both the inner and outer loop).
    cv = KFold(shuffle=True, random_state=rng)

    # Inner cross validation: grid search over C.
    grid = GridSearchCV(lr_pipe, param_grid=p_grid, cv=cv)

    # Outer cross validation, parallelized over 5 jobs.
    nested_cv_scores = cross_validate(grid, X, y, cv=cv, n_jobs=5)

    # Keep the result for this penalty strategy.
    nested_cv_scores_list.append(nested_cv_scores)
有没有办法并行化这个 for 循环?
joblib.parallel 正是为这类任务而生的!只需把循环体放进一个函数,然后用 `Parallel` 和 `delayed` 来调用它:
from joblib.parallel import Parallel, delayed
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
# Fetch the breast cancer data as (features, targets).
X, y = load_breast_cancer(return_X_y=True)

# Toy list of penalty strategies: three identical 'l2' entries stand in for
# what would normally be three different strategies.
penalty_types = ['l2'] * 3

# Output list for the results of the different penalty strategies.
nested_cv_scores_list = list()

# The seeded generator lives outside the per-job function so that not all
# results are the same.
rng = np.random.RandomState(seed=42)
def run_as_job(penalty_type, X, y):
    """Run one nested cross-validation for a single penalty strategy.

    This is the body of the sequential for loop wrapped in a function so
    that it can be dispatched with ``joblib.Parallel`` / ``delayed``.

    Args:
        penalty_type: Value passed to ``LogisticRegression(penalty=...)``.
        X: Feature matrix forwarded to ``cross_validate``.
        y: Target vector forwarded to ``cross_validate``.

    Returns:
        The result of ``cross_validate`` for this penalty strategy.

    Note:
        Uses the module-level ``rng`` as random state. With a process-based
        backend each worker operates on its own copy of that generator, so
        its state is not shared between jobs.
    """
    # z-standardize features
    scaler = StandardScaler()

    # Logistic regression classifier using the requested penalty strategy.
    lr = LogisticRegression(random_state=rng, penalty=penalty_type)

    # Parameter grid to optimize over (optimize C); the 'lr__' prefix
    # addresses the pipeline step named 'lr'.
    lr_c = np.linspace(start=1, stop=16, num=11, endpoint=True)
    p_grid = {'lr__C': lr_c}

    # Pipeline: scaling followed by the classifier.
    lr_pipe = Pipeline([
        ('scaler', scaler),
        ('lr', lr),
    ])

    # Cross validation strategy (used for both the inner and outer loop).
    cv = KFold(shuffle=True, random_state=rng)

    # Inner cross validation: grid search over C.
    grid = GridSearchCV(lr_pipe, param_grid=p_grid, cv=cv)

    # Outer cross validation. n_jobs mirrors the sequential loop version;
    # tune it together with the n_jobs of the outer Parallel call.
    res = cross_validate(grid, X, y, cv=cv, n_jobs=5)
    return res
if __name__ == '__main__':
    # run_as_job is declared as (penalty_type, X, y), so the data has to be
    # forwarded to every delayed call. Parallel(n_jobs=2) runs up to two of
    # these jobs at the same time and returns the results as a list.
    results = Parallel(n_jobs=2)(
        delayed(run_as_job)(penalty_type, X, y)
        for penalty_type in penalty_types
    )
有关更多使用选项,请查看 joblib: Embarrassingly parallel for loops
有没有办法在 scikit-learn 中并行化多个模型构建过程?我知道可以在 `GridSearchCV` 和 `cross_validate` 中使用 `n_jobs` 参数,在单个模型构建过程内部实现一定程度的并行化。但是,我是在一个 for 循环中用不同的输入参数依次运行多个模型构建过程,并把结果保存到列表中。举个例子,假设我有 15 个空闲 CPU,并且在 `cross_validate` 中设置了 `n_jobs=5`。如果我没理解错,这意味着一个模型构建过程会使用 5 个 CPU。那么有没有办法让 for 循环中接下来的 2 个模型构建过程也同时启动,从而用满全部 15 个 CPU?下面是一个简化的示例:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
# Load the breast cancer data set as a feature matrix X and target vector y.
X, y = load_breast_cancer(return_X_y=True)

# Penalty strategies to compare. This is a toy example: 'l2' is used three
# times, but imagine these were three different strategies.
penalty_types = ['l2', 'l2', 'l2']

# Collects one cross_validate result per penalty strategy.
nested_cv_scores_list = []

for penalty_type in penalty_types:
    # Create a random number generator (re-seeded identically each iteration).
    rng = np.random.RandomState(42)

    # z-standardize features
    scaler = StandardScaler()

    # Logistic regression classifier using the current penalty strategy.
    lr = LogisticRegression(random_state=rng, penalty=penalty_type)

    # Parameter grid to optimize over (optimize C); the 'lr__' prefix
    # addresses the pipeline step named 'lr'.
    lr_c = np.linspace(start=1, stop=16, num=11, endpoint=True)
    p_grid = {'lr__C': lr_c}

    # Pipeline: scaling followed by the classifier.
    lr_pipe = Pipeline([
        ('scaler', scaler),
        ('lr', lr),
    ])

    # Cross validation strategy (used for both the inner and outer loop).
    cv = KFold(shuffle=True, random_state=rng)

    # Inner cross validation: grid search over C.
    grid = GridSearchCV(lr_pipe, param_grid=p_grid, cv=cv)

    # Outer cross validation, parallelized over 5 jobs.
    nested_cv_scores = cross_validate(grid, X, y, cv=cv, n_jobs=5)

    # Keep the result for this penalty strategy.
    nested_cv_scores_list.append(nested_cv_scores)
有没有办法并行化这个 for 循环?
joblib.parallel 正是为这类任务而生的!只需把循环体放进一个函数,然后用 `Parallel` 和 `delayed` 来调用它:
from joblib.parallel import Parallel, delayed
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
# Fetch the breast cancer data as (features, targets).
X, y = load_breast_cancer(return_X_y=True)

# Toy list of penalty strategies: three identical 'l2' entries stand in for
# what would normally be three different strategies.
penalty_types = ['l2'] * 3

# Output list for the results of the different penalty strategies.
nested_cv_scores_list = list()

# The seeded generator lives outside the per-job function so that not all
# results are the same.
rng = np.random.RandomState(seed=42)
def run_as_job(penalty_type, X, y):
    """Run one nested cross-validation for a single penalty strategy.

    This is the body of the sequential for loop wrapped in a function so
    that it can be dispatched with ``joblib.Parallel`` / ``delayed``.

    Args:
        penalty_type: Value passed to ``LogisticRegression(penalty=...)``.
        X: Feature matrix forwarded to ``cross_validate``.
        y: Target vector forwarded to ``cross_validate``.

    Returns:
        The result of ``cross_validate`` for this penalty strategy.

    Note:
        Uses the module-level ``rng`` as random state. With a process-based
        backend each worker operates on its own copy of that generator, so
        its state is not shared between jobs.
    """
    # z-standardize features
    scaler = StandardScaler()

    # Logistic regression classifier using the requested penalty strategy.
    lr = LogisticRegression(random_state=rng, penalty=penalty_type)

    # Parameter grid to optimize over (optimize C); the 'lr__' prefix
    # addresses the pipeline step named 'lr'.
    lr_c = np.linspace(start=1, stop=16, num=11, endpoint=True)
    p_grid = {'lr__C': lr_c}

    # Pipeline: scaling followed by the classifier.
    lr_pipe = Pipeline([
        ('scaler', scaler),
        ('lr', lr),
    ])

    # Cross validation strategy (used for both the inner and outer loop).
    cv = KFold(shuffle=True, random_state=rng)

    # Inner cross validation: grid search over C.
    grid = GridSearchCV(lr_pipe, param_grid=p_grid, cv=cv)

    # Outer cross validation. n_jobs mirrors the sequential loop version;
    # tune it together with the n_jobs of the outer Parallel call.
    res = cross_validate(grid, X, y, cv=cv, n_jobs=5)
    return res
if __name__ == '__main__':
    # run_as_job is declared as (penalty_type, X, y), so the data has to be
    # forwarded to every delayed call. Parallel(n_jobs=2) runs up to two of
    # these jobs at the same time and returns the results as a list.
    results = Parallel(n_jobs=2)(
        delayed(run_as_job)(penalty_type, X, y)
        for penalty_type in penalty_types
    )
有关更多使用选项,请查看 joblib: Embarrassingly parallel for loops