如何使用 foreach 函数进行并行编码?
How can I use foreach function for parallel coding?
根据 Jared Lander 的教科书 "R for Everyone"(第 19 章),我一直在尝试使用 Elastic Net。
教科书使用下面的"foreach"代码,使用并行编码找到最优参数值。
但是,即使我编写并运行了与教科书完全相同的代码,生成的对象 "acsDouble" 也不是一个包含 11 个 cv.glmnet 对象的列表,而是一个空列表。
在运行之前已经检查了教材代码,也清理了环境,但问题并没有解决。
这似乎是什么问题?
# Read the ACS New York extract from the book's website; character
# columns are kept as plain strings rather than factors.
acs <- read.table(
    file = "http://jaredlander.com/data/acs_ny.csv",
    header = TRUE,
    sep = ",",
    stringsAsFactors = FALSE
)
require(useful)
# Binary response for the logistic regression: TRUE when the family
# income is at least 150000.
acs$Income <- acs[["FamilyIncome"]] >= 150000
# One model formula shared by the predictor matrix and the response.
# The trailing "- 1" drops the intercept because glmnet adds its own.
acsFormula <- Income ~ NumBedrooms + NumChildren + NumPeople +
    NumRooms + NumUnits + NumVehicles + NumWorkers +
    OwnRent + YearBuilt + ElectricBill + FoodStamp +
    HeatingFuel + Insurance + Language - 1
# Predictor matrix built from the right-hand side of the formula.
acsX <- build.x(acsFormula, data = acs, contrasts = FALSE)
# Response vector built from the left-hand side of the formula.
acsY <- build.y(acsFormula, data = acs)
require(glmnet)
require(parallel)
require(doParallel)
# Fix the RNG state so the fold assignment below is reproducible.
set.seed(2834673)
# Assign every row of acsX to one of five folds: recycle the labels 1..5
# up to the number of rows, then shuffle them. Passing this vector as
# foldid keeps each observation in the same fold on every run.
theFolds <- sample(rep_len(1:5, length.out = nrow(acsX)))
# Candidate alpha values: 0.5, 0.55, ..., 1.
alphas <- seq(0.5, 1, by = 0.05)
# Re-seed for repeatability of the random results that follow.
set.seed(5127151)
# start a cluster with two workers
cl <- makeCluster(2)
# register the workers as the backend used by %dopar%
registerDoParallel(cl)
# keep track of timing
before <- Sys.time()
# Fit one cross-validated glmnet per alpha value, in parallel.
#   .errorhandling = "remove"  drop the result of any task that errors
#   .inorder = FALSE           results need not come back in input order
#   .multicombine = TRUE       the combine step may take many results at once
#   .export                    objects copied to each worker
#   .packages                  packages loaded on each worker
# NOTE: family must be spelled "binomial". A misspelled value (for example
# "binamial") makes every cv.glmnet call fail on the workers; because of
# .errorhandling = "remove" those failures are discarded silently and
# acsDouble ends up as an empty list. Adding .verbose = TRUE to foreach()
# helps surface worker errors while debugging.
acsDouble <- foreach(i = seq_along(alphas), .errorhandling = "remove",
                     .inorder = FALSE, .multicombine = TRUE,
                     .export = c("acsX", "acsY", "alphas", "theFolds"),
                     .packages = "glmnet") %dopar%
    {
        print(alphas[i])
        cv.glmnet(x = acsX, y = acsY, family = "binomial", nfolds = 5,
                  foldid = theFolds, alpha = alphas[i])
    }
# stop timing
after <- Sys.time()
# make sure to stop the cluster when done
stopCluster(cl)
# time difference
# this will depend on speed, memory & number of cores of the machine
after - before
您的 cv.glmnet 调用中有一个拼写错误:应该是 family="binomial",而不是 "binamial"。由于 .errorhandling = "remove" 会静默丢弃出错的任务,每次调用都失败后就得到了一个空列表。
在 foreach 调用中加入 .verbose=TRUE,可以显示集群节点是否返回了错误。
根据 Jared Lander 的教科书 "R for Everyone"(第 19 章),我一直在尝试使用 Elastic Net。 教科书使用下面的"foreach"代码,使用并行编码找到最优参数值。 但是,即使我编写并运行了与教科书完全相同的代码,生成的对象 "acsDouble" 也不是一个包含 11 个 cv.glmnet 对象的列表,而是一个空列表。
在运行之前已经检查了教材代码,也清理了环境,但问题并没有解决。 这似乎是什么问题?
# Read the ACS New York extract from the book's website; character
# columns are kept as plain strings rather than factors.
acs <- read.table(
    file = "http://jaredlander.com/data/acs_ny.csv",
    header = TRUE,
    sep = ",",
    stringsAsFactors = FALSE
)
require(useful)
# Binary response for the logistic regression: TRUE when the family
# income is at least 150000.
acs$Income <- acs[["FamilyIncome"]] >= 150000
# One model formula shared by the predictor matrix and the response.
# The trailing "- 1" drops the intercept because glmnet adds its own.
acsFormula <- Income ~ NumBedrooms + NumChildren + NumPeople +
    NumRooms + NumUnits + NumVehicles + NumWorkers +
    OwnRent + YearBuilt + ElectricBill + FoodStamp +
    HeatingFuel + Insurance + Language - 1
# Predictor matrix built from the right-hand side of the formula.
acsX <- build.x(acsFormula, data = acs, contrasts = FALSE)
# Response vector built from the left-hand side of the formula.
acsY <- build.y(acsFormula, data = acs)
require(glmnet)
require(parallel)
require(doParallel)
# Fix the RNG state so the fold assignment below is reproducible.
set.seed(2834673)
# Assign every row of acsX to one of five folds: recycle the labels 1..5
# up to the number of rows, then shuffle them. Passing this vector as
# foldid keeps each observation in the same fold on every run.
theFolds <- sample(rep_len(1:5, length.out = nrow(acsX)))
# Candidate alpha values: 0.5, 0.55, ..., 1.
alphas <- seq(0.5, 1, by = 0.05)
# Re-seed for repeatability of the random results that follow.
set.seed(5127151)
# start a cluster with two workers
cl <- makeCluster(2)
# register the workers as the backend used by %dopar%
registerDoParallel(cl)
# keep track of timing
before <- Sys.time()
# Fit one cross-validated glmnet per alpha value, in parallel.
#   .errorhandling = "remove"  drop the result of any task that errors
#   .inorder = FALSE           results need not come back in input order
#   .multicombine = TRUE       the combine step may take many results at once
#   .export                    objects copied to each worker
#   .packages                  packages loaded on each worker
# NOTE: family must be spelled "binomial". A misspelled value (for example
# "binamial") makes every cv.glmnet call fail on the workers; because of
# .errorhandling = "remove" those failures are discarded silently and
# acsDouble ends up as an empty list. Adding .verbose = TRUE to foreach()
# helps surface worker errors while debugging.
acsDouble <- foreach(i = seq_along(alphas), .errorhandling = "remove",
                     .inorder = FALSE, .multicombine = TRUE,
                     .export = c("acsX", "acsY", "alphas", "theFolds"),
                     .packages = "glmnet") %dopar%
    {
        print(alphas[i])
        cv.glmnet(x = acsX, y = acsY, family = "binomial", nfolds = 5,
                  foldid = theFolds, alpha = alphas[i])
    }
# stop timing
after <- Sys.time()
# make sure to stop the cluster when done
stopCluster(cl)
# time difference
# this will depend on speed, memory & number of cores of the machine
after - before
您的 cv.glmnet 调用中有一个拼写错误:应该是 family="binomial",而不是 "binamial"。由于 .errorhandling = "remove" 会静默丢弃出错的任务,每次调用都失败后就得到了一个空列表。
在 foreach 调用中加入 .verbose=TRUE,可以显示集群节点是否返回了错误。