使用 biglm 对因子变量进行预测
Predict with biglm factors
为什么使用 biglm 进行预测与使用 lm 进行预测不同?
这是我生成数据的代码:(诚然草率)
require(data.table)
require(biganalytics)
#create a data set with factor variables and one continuous numerical variable
# 702 labels: the 26 single letters followed by every two-letter combination
# in the order AA, AB, ..., AZ, BA, ..., ZZ.
LETTERS702 <- c(LETTERS, paste0(rep(LETTERS, each = 26L), LETTERS))
# Turn a numeric vector into a factor: values below 0.25, 0.5 and 0.75 are
# relabelled 100, 200 and 300 respectively; values from 0.75 upward keep their
# own value, so each of them ends up as a separate factor level.
bin_cont <- function(x) {
  binned <- x
  binned[x < 0.75] <- 300
  binned[x < 0.5] <- 200
  binned[x < 0.25] <- 100
  as.factor(binned)
}

# Training table: 100 rows, fac1 cycles through the first ten labels.
data <- data.table(
  ded = runif(100),
  cont = runif(100),
  fac1 = as.factor(LETTERS702[1:10])
)
data[, cont := bin_cont(cont)]

# Prediction table: 50 rows, fac1 cycles through labels 7 to 12, so only part
# of its levels also occur in the training table.
# NOTE(review): 50 is not a multiple of the 6 labels, so the recycling is
# uneven -- check how the installed data.table version handles that.
newdata <- data.table(
  ded = runif(50),
  cont = runif(50),
  fac1 = as.factor(LETTERS702[7:12])
)
newdata[, cont := bin_cont(cont)]

# Extend each table's factor levels with the levels of the other table.
# Existing levels stay in place and unseen ones are appended, so both tables
# end up with the same level set.
# NOTE(review): the resulting level order appears to differ between the two
# tables (own levels first) -- confirm whether that matters downstream.
levels(newdata$fac1) <- union(levels(newdata$fac1), levels(data$fac1))
levels(newdata$cont) <- union(levels(newdata$cont), levels(data$cont))
levels(data$fac1) <- union(levels(data$fac1), levels(newdata$fac1))
levels(data$cont) <- union(levels(data$cont), levels(newdata$cont))
现在,当我运行 lm 回归并尝试在新数据表(newdata)中生成预测值时,它按预期工作:
# Fit ordinary least squares on the training table and show the fit.
reg <- lm(ded ~ fac1 + cont, data = data)
summary(reg)
# Score only the rows whose fac1 and cont values both occur in the training
# table; rows outside the mask are not assigned.
known_rows <- newdata$fac1 %in% data$fac1 & newdata$cont %in% data$cont
newdata[known_rows, ded_hat := predict(reg, newdata[known_rows, ])]
head(newdata)
ded cont fac1 ded_hat
1: 0.1820960 200 G 0.4636306
2: 0.2561465 300 H 0.5412485
3: 0.8711127 300 I 0.5351451
4: 0.9839877 200 J 0.5737994
5: 0.8802973 200 K NA
6: 0.1385486 200 L NA
但是当我改用 biglm 时,所有新的预测值都是 NA:
# Same model formula, fitted with biglm instead of lm.
reg <- biglm(ded ~ fac1 + cont, data = data)
summary(reg)
# Score only the rows whose fac1 and cont values both occur in the training
# table; rows outside the mask are not assigned.
known_rows <- newdata$fac1 %in% data$fac1 & newdata$cont %in% data$cont
newdata[known_rows, ded_hat := predict(reg, newdata[known_rows, ])]
head(newdata)
ded cont fac1 ded_hat
1: 0.1820960 200 G NA
2: 0.2561465 300 H NA
3: 0.8711127 300 I NA
4: 0.9839877 200 J NA
5: 0.8802973 200 K NA
6: 0.1385486 200 L NA
这是怎么回事?我需要做哪些修改,才能让 predict 在 biglm 模型上正常工作?
看起来 biglm 不会自动删除未使用的因子水平,也不会智能地针对不同的因子水平集合进行调整。
如果你这样做
# Fit the same formula with lm and with biglm so the coefficients can be
# compared side by side.
reg1 <- lm(ded ~ fac1 + cont, data = data)
reg2 <- biglm(ded ~ fac1 + cont, data = data)
并比较系数,您会看到非 NA 的值是一致的,但 biglm 的结果中多出了 14 个 NA 值……它们很可能在预测计算中传播开来。
# Number of coefficients reported by each fit.
length(coef(reg1))
## [1] 36
length(coef(reg2))
## [1] 50
# Compare the values only: names are stripped, the NA entries of the biglm fit
# are dropped, and c() discards the attributes that na.omit() attaches.
lm_coefs <- unname(coef(reg1))
biglm_coefs <- unname(c(na.omit(coef(reg2))))
all.equal(lm_coefs, biglm_coefs) ## TRUE
如果你使用droplevels
# Refit with biglm after removing factor levels that have no rows in `data`.
reg3 <- biglm(ded ~ fac1 + cont, data = droplevels(data))
那么系数是一致的,但预测会以另一种方式失败(Error in x %*% coef(object) : non-conformable arguments),
大概是因为新数据中的因子水平集合不同——你必须更仔细地设置因子水平,使两者保持一致……
这是我最终使用的解决方法:
- 确保新数据具有旧数据的所有级别
- 使用 dummies 包手动为每个因子水平展开虚拟变量
- 运行回归并做出预测
require(data.table)
require(dummies)
# Build a training table and a prediction table, each with one continuous
# response (ded) and two factor predictors (fac1, cont).
LETTERS702 <- c(LETTERS, paste0(rep(LETTERS, each = 26L), LETTERS))

# Turn a numeric vector into a factor: values below 0.25, 0.5 and 0.75 are
# relabelled 100, 200 and 300 respectively; values from 0.75 upward keep their
# own value, so each of them ends up as a separate factor level.
bin_cont <- function(x) {
  binned <- x
  binned[x < 0.75] <- 300
  binned[x < 0.5] <- 200
  binned[x < 0.25] <- 100
  as.factor(binned)
}

data <- data.table(
  ded = runif(100),
  cont = runif(100),
  fac1 = as.factor(LETTERS702[1:10])
)
data[, cont := bin_cont(cont)]

newdata <- data.table(
  ded = runif(50),
  cont = runif(50),
  fac1 = as.factor(LETTERS702[7:12])
)
newdata[, cont := bin_cont(cont)]

# Give the prediction table every level that occurs in the training table.
levels(newdata$fac1) <- union(levels(newdata$fac1), levels(data$fac1))
levels(newdata$cont) <- union(levels(newdata$cont), levels(data$cont))

# Expand both factors into one indicator column per level. For newdata,
# drop = FALSE is meant to keep the columns of levels that have no rows, so
# that every indicator column used by the model is present.
data <- as.data.table(dummy.data.frame(data, names = c("fac1", "cont")))
newdata <- as.data.table(
  dummy.data.frame(newdata, names = c("fac1", "cont"), drop = FALSE)
)

# Column names of each table, and the names they have in common.
datanames <- names(data)
newdatanames <- names(newdata)
varnames <- newdatanames[newdatanames %in% datanames]

# Regress ded on every other column of the training table.
reg <- lm(ded ~ ., data = data)

# Score the whole prediction table.
# NOTE(review): rows whose fac1/cont level never occurs in `data` have all of
# the model's indicator columns equal to 0, so they presumably still receive a
# numeric prediction rather than NA -- confirm that this is intended.
newdata[, ded_hat := predict(reg, newdata)]
为什么使用 biglm 进行预测与使用 lm 进行预测不同?
这是我生成数据的代码:(诚然草率)
require(data.table)
require(biganalytics)
#create a data set with factor variables and one continuous numerical variable
# 702 labels: the 26 single letters followed by every two-letter combination
# in the order AA, AB, ..., AZ, BA, ..., ZZ.
LETTERS702 <- c(LETTERS, paste0(rep(LETTERS, each = 26L), LETTERS))
# Turn a numeric vector into a factor: values below 0.25, 0.5 and 0.75 are
# relabelled 100, 200 and 300 respectively; values from 0.75 upward keep their
# own value, so each of them ends up as a separate factor level.
bin_cont <- function(x) {
  binned <- x
  binned[x < 0.75] <- 300
  binned[x < 0.5] <- 200
  binned[x < 0.25] <- 100
  as.factor(binned)
}

# Training table: 100 rows, fac1 cycles through the first ten labels.
data <- data.table(
  ded = runif(100),
  cont = runif(100),
  fac1 = as.factor(LETTERS702[1:10])
)
data[, cont := bin_cont(cont)]

# Prediction table: 50 rows, fac1 cycles through labels 7 to 12, so only part
# of its levels also occur in the training table.
# NOTE(review): 50 is not a multiple of the 6 labels, so the recycling is
# uneven -- check how the installed data.table version handles that.
newdata <- data.table(
  ded = runif(50),
  cont = runif(50),
  fac1 = as.factor(LETTERS702[7:12])
)
newdata[, cont := bin_cont(cont)]

# Extend each table's factor levels with the levels of the other table.
# Existing levels stay in place and unseen ones are appended, so both tables
# end up with the same level set.
# NOTE(review): the resulting level order appears to differ between the two
# tables (own levels first) -- confirm whether that matters downstream.
levels(newdata$fac1) <- union(levels(newdata$fac1), levels(data$fac1))
levels(newdata$cont) <- union(levels(newdata$cont), levels(data$cont))
levels(data$fac1) <- union(levels(data$fac1), levels(newdata$fac1))
levels(data$cont) <- union(levels(data$cont), levels(newdata$cont))
现在,当我运行 lm 回归并尝试在新数据表(newdata)中生成预测值时,它按预期工作:
# Fit ordinary least squares on the training table and show the fit.
reg <- lm(ded ~ fac1 + cont, data = data)
summary(reg)
# Score only the rows whose fac1 and cont values both occur in the training
# table; rows outside the mask are not assigned.
known_rows <- newdata$fac1 %in% data$fac1 & newdata$cont %in% data$cont
newdata[known_rows, ded_hat := predict(reg, newdata[known_rows, ])]
head(newdata)
ded cont fac1 ded_hat
1: 0.1820960 200 G 0.4636306
2: 0.2561465 300 H 0.5412485
3: 0.8711127 300 I 0.5351451
4: 0.9839877 200 J 0.5737994
5: 0.8802973 200 K NA
6: 0.1385486 200 L NA
但是当我改用 biglm 时,所有新的预测值都是 NA:
# Same model formula, fitted with biglm instead of lm.
reg <- biglm(ded ~ fac1 + cont, data = data)
summary(reg)
# Score only the rows whose fac1 and cont values both occur in the training
# table; rows outside the mask are not assigned.
known_rows <- newdata$fac1 %in% data$fac1 & newdata$cont %in% data$cont
newdata[known_rows, ded_hat := predict(reg, newdata[known_rows, ])]
head(newdata)
ded cont fac1 ded_hat
1: 0.1820960 200 G NA
2: 0.2561465 300 H NA
3: 0.8711127 300 I NA
4: 0.9839877 200 J NA
5: 0.8802973 200 K NA
6: 0.1385486 200 L NA
这是怎么回事?我需要做哪些修改,才能让 predict 在 biglm 模型上正常工作?
看起来 biglm 不会自动删除未使用的因子水平,也不会智能地针对不同的因子水平集合进行调整。
如果你这样做
# Fit the same formula with lm and with biglm so the coefficients can be
# compared side by side.
reg1 <- lm(ded ~ fac1 + cont, data = data)
reg2 <- biglm(ded ~ fac1 + cont, data = data)
并比较系数,您会看到非 NA 的值是一致的,但 biglm 的结果中多出了 14 个 NA 值……它们很可能在预测计算中传播开来。
# Number of coefficients reported by each fit.
length(coef(reg1))
## [1] 36
length(coef(reg2))
## [1] 50
# Compare the values only: names are stripped, the NA entries of the biglm fit
# are dropped, and c() discards the attributes that na.omit() attaches.
lm_coefs <- unname(coef(reg1))
biglm_coefs <- unname(c(na.omit(coef(reg2))))
all.equal(lm_coefs, biglm_coefs) ## TRUE
如果你使用droplevels
# Refit with biglm after removing factor levels that have no rows in `data`.
reg3 <- biglm(ded ~ fac1 + cont, data = droplevels(data))
那么系数是一致的,但预测会以另一种方式失败(Error in x %*% coef(object) : non-conformable arguments),
大概是因为新数据中的因子水平集合不同——你必须更仔细地设置因子水平,使两者保持一致……
这是我最终使用的解决方法:
- 确保新数据具有旧数据的所有级别
- 使用 dummies 包手动为每个因子水平展开虚拟变量
- 运行回归并做出预测
# Workaround script: expand the factor predictors into indicator columns by
# hand, fit a linear model on the training table and score the new table.
# library() is used so that a missing package stops the script immediately.
library(data.table)
library(dummies)

# Create a data set with factor variables and one continuous numerical variable.
LETTERS702 <- c(LETTERS, paste0(rep(LETTERS, each = 26L), LETTERS))

# Turn a numeric vector into a factor: values below 0.25, 0.5 and 0.75 are
# relabelled 100, 200 and 300 respectively; values from 0.75 upward keep their
# own value, so each of them ends up as a separate factor level.
bin_cont <- function(x) {
  binned <- x
  binned[x < 0.75] <- 300
  binned[x < 0.5] <- 200
  binned[x < 0.25] <- 100
  as.factor(binned)
}

data <- data.table(
  ded = runif(100),
  cont = runif(100),
  fac1 = as.factor(LETTERS702[1:10])
)
data[, cont := bin_cont(cont)]

newdata <- data.table(
  ded = runif(50),
  cont = runif(50),
  fac1 = as.factor(LETTERS702[7:12])
)
newdata[, cont := bin_cont(cont)]

# Add the levels from the old data to the new data.
levels(newdata$fac1) <- union(levels(newdata$fac1), levels(data$fac1))
levels(newdata$cont) <- union(levels(newdata$cont), levels(data$cont))

# Generate one indicator column per level of fac1 and cont. For newdata,
# drop = FALSE is meant to keep the columns of levels that have no rows, so
# that every indicator column used by the model is present.
data <- as.data.table(dummy.data.frame(data, names = c("fac1", "cont")))
newdata <- as.data.table(
  dummy.data.frame(newdata, names = c("fac1", "cont"), drop = FALSE)
)

# Column names of each table, and the names present in both data sets.
datanames <- names(data)
newdatanames <- names(newdata)
varnames <- newdatanames[newdatanames %in% datanames]

# Regression: ded on every other column of the training table.
reg <- lm(ded ~ ., data = data)

# Prediction for the whole new table.
# NOTE(review): rows whose fac1/cont level never occurs in `data` have all of
# the model's indicator columns equal to 0, so they presumably still receive a
# numeric prediction rather than NA -- confirm that this is intended.
newdata[, ded_hat := predict(reg, newdata)]