在 R 中使用 glm 和 cv.glmnet 预测新数据(包括交互和分类变量)
Predicting new data using glm and cv.glmnet in R (including interactions and categorical variables )
我想对包含交互作用和分类变量的回归公式建模。我有兴趣使用 glm 和 glmnet::cv.glmnet。我对适合模型的函数没意见,但不太确定我是否使用训练有素的模型来正确预测样本数据。这是一个例子。
Formula <- "Sepal.Length ~ Sepal.Width + Petal.Length + as.factor(Species):Petal.Width + Sepal.Width:Petal.Length + as.factor(Species) + bs(Petal.Width, df = 2, degree = 2)"
data("iris")
Inx <- sample( 1: nrow(iris), nrow(iris), replace = F)
iris$Species <- as.factor(iris$Species)
train_data <- iris[Inx[1:100], ]
test_data <- iris[Inx[101:nrow(iris) ],]
#---- glm -----------------
ModelMatrix <- predict(caret::dummyVars(Formula, train_data, fullRank = T, sep = ""), train_data)
glmfit <- glm(formula = as.formula(Formula) , data = train_data)
prd_glm <- predict(glmfit, newx = ModelMatrix, type = "response")
#------- glm cross validation --------------
cvglm <- glmnet::cv.glmnet(x = ModelMatrix,
y = train_data$Sepal.Length,
nfolds = 4, keep = TRUE, alpha = 1, parallel = F, type.measure = 'mse')
ModelMatrix_test <- predict(caret::dummyVars(Formula, test_data, fullRank = T, sep = ""), test_data)
prd_cvglm <- predict(cvglm, newx = ModelMatrix_test, s = "lambda.1se", type ='response')
您要么使用模型矩阵,要么使用公式,但不能同时使用两者,因为一旦您提供了公式,任何 glm 都会在内部生成模型矩阵。而且你只考虑一次。所以在你的情况下,假设直接拟合模型矩阵:
library(splines)
library(caret)
library(glmnet)
data(iris)
Inx <- sample(nrow(iris),100)
iris$Species <- factor(iris$Species)
train_data <- iris[Inx, ]
test_data <- iris[-Inx,]
Formula <- "Sepal.Length ~ Sepal.Width + Petal.Length + Species:Petal.Width + Sepal.Width:Petal.Length + Species + bs(Petal.Width, df = 2, degree = 2)"
glmfit <- glm(as.formula(Formula),data=train_data)
你可以看到这与用公式拟合是一样的:
ModelMatrix <- predict(caret::dummyVars(Formula, train_data, fullRank = T, sep = ""), train_data)
y = train_data[,"Sepal.Length"]
fit_dummy = glm(y ~ ModelMatrix)
table(fitted(glmfit) == fitted(fit_dummy))
TRUE
100
我们根据测试数据预测:
prd_glm <- predict(glmfit, newdata = test_data, type = "response")
然后对于 glmnet:
cvglm <- cv.glmnet(x = ModelMatrix,y = train_data$Sepal.Length,nfolds = 4,
keep = TRUE, alpha = 1, parallel = F, type.measure = 'mse')
ModelMatrix_test <- predict(caret::dummyVars(Formula, test_data, fullRank = T, sep = ""), test_data)
prd_cvglm <- predict(cvglm, newx = ModelMatrix_test, s = "lambda.1se", type ='response')
您可以看到它们有何不同:
plot(prd_glm,prd_cvglm)
我想对包含交互作用和分类变量的回归公式建模。我有兴趣使用 glm 和 glmnet::cv.glmnet。我对适合模型的函数没意见,但不太确定我是否使用训练有素的模型来正确预测样本数据。这是一个例子。
Formula <- "Sepal.Length ~ Sepal.Width + Petal.Length + as.factor(Species):Petal.Width + Sepal.Width:Petal.Length + as.factor(Species) + bs(Petal.Width, df = 2, degree = 2)"
data("iris")
Inx <- sample( 1: nrow(iris), nrow(iris), replace = F)
iris$Species <- as.factor(iris$Species)
train_data <- iris[Inx[1:100], ]
test_data <- iris[Inx[101:nrow(iris) ],]
#---- glm -----------------
ModelMatrix <- predict(caret::dummyVars(Formula, train_data, fullRank = T, sep = ""), train_data)
glmfit <- glm(formula = as.formula(Formula) , data = train_data)
prd_glm <- predict(glmfit, newx = ModelMatrix, type = "response")
#------- glm cross validation --------------
cvglm <- glmnet::cv.glmnet(x = ModelMatrix,
y = train_data$Sepal.Length,
nfolds = 4, keep = TRUE, alpha = 1, parallel = F, type.measure = 'mse')
ModelMatrix_test <- predict(caret::dummyVars(Formula, test_data, fullRank = T, sep = ""), test_data)
prd_cvglm <- predict(cvglm, newx = ModelMatrix_test, s = "lambda.1se", type ='response')
您要么使用模型矩阵,要么使用公式,但不能同时使用两者,因为一旦您提供了公式,任何 glm 都会在内部生成模型矩阵。而且你只考虑一次。所以在你的情况下,假设直接拟合模型矩阵:
library(splines)
library(caret)
library(glmnet)
data(iris)
Inx <- sample(nrow(iris),100)
iris$Species <- factor(iris$Species)
train_data <- iris[Inx, ]
test_data <- iris[-Inx,]
Formula <- "Sepal.Length ~ Sepal.Width + Petal.Length + Species:Petal.Width + Sepal.Width:Petal.Length + Species + bs(Petal.Width, df = 2, degree = 2)"
glmfit <- glm(as.formula(Formula),data=train_data)
你可以看到这与用公式拟合是一样的:
ModelMatrix <- predict(caret::dummyVars(Formula, train_data, fullRank = T, sep = ""), train_data)
y = train_data[,"Sepal.Length"]
fit_dummy = glm(y ~ ModelMatrix)
table(fitted(glmfit) == fitted(fit_dummy))
TRUE
100
我们根据测试数据预测:
prd_glm <- predict(glmfit, newdata = test_data, type = "response")
然后对于 glmnet:
cvglm <- cv.glmnet(x = ModelMatrix,y = train_data$Sepal.Length,nfolds = 4,
keep = TRUE, alpha = 1, parallel = F, type.measure = 'mse')
ModelMatrix_test <- predict(caret::dummyVars(Formula, test_data, fullRank = T, sep = ""), test_data)
prd_cvglm <- predict(cvglm, newx = ModelMatrix_test, s = "lambda.1se", type ='response')
您可以看到它们有何不同:
plot(prd_glm,prd_cvglm)