从 R 中的 map_*() 构建线性回归模型
Building linear regression model from map_*() in R
我正在使用 attrition
数据为每个部门的 Age 和 MonthlyIncome 之间的关系建立线性模型在减员数据中。此外,我想使用 map 函数通过从 lm_summary
中提取实值 r.squared
元素来添加列 r_squared
,从而量化拟合优度。
我的代码:
attrition %>%
group_by(Department) %>%
mutate(lm_summary = list(summary(lm(MonthlyIncome ~ Age)))) %>%
mutate(r_squared = map_dfc(lm_summary, "r.squared"))
执行时出现以下错误:
essentially perfect fit: summary may be unreliableessentially perfect fit: summary may be unreliableessentially perfect fit: summary may be unreliableessentially perfect fit: summary may be unreliableNew names:
* NA -> ...1
New names:
* NA -> ...1
New names:
* NA -> ...1
这将进入无限循环。不确定我是否错误地使用了这些功能,或者我应该互换它们。
> dput(head(attrition))
structure(list(Age = c(41L, 49L, 37L, 33L, 27L, 32L), Attrition = structure(c(2L,
1L, 2L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
BusinessTravel = structure(c(3L, 2L, 3L, 2L, 3L, 2L), .Label = c("Non-Travel",
"Travel_Frequently", "Travel_Rarely"), class = "factor"),
DailyRate = c(1102L, 279L, 1373L, 1392L, 591L, 1005L), Department = structure(c(3L,
2L, 2L, 2L, 2L, 2L), .Label = c("Human_Resources", "Research_Development",
"Sales"), class = "factor"), DistanceFromHome = c(1L, 8L,
2L, 3L, 2L, 2L), Education = structure(c(2L, 1L, 2L, 4L,
1L, 2L), .Label = c("Below_College", "College", "Bachelor",
"Master", "Doctor"), class = c("ordered", "factor")), EducationField = structure(c(2L,
2L, 5L, 2L, 4L, 2L), .Label = c("Human_Resources", "Life_Sciences",
"Marketing", "Medical", "Other", "Technical_Degree"), class = "factor"),
EnvironmentSatisfaction = structure(c(2L, 3L, 4L, 4L, 1L,
4L), .Label = c("Low", "Medium", "High", "Very_High"), class = c("ordered",
"factor")), Gender = structure(c(1L, 2L, 2L, 1L, 2L, 2L), .Label = c("Female",
"Male"), class = "factor"), HourlyRate = c(94L, 61L, 92L,
56L, 40L, 79L), JobInvolvement = structure(c(3L, 2L, 2L,
3L, 3L, 3L), .Label = c("Low", "Medium", "High", "Very_High"
), class = c("ordered", "factor")), JobLevel = c(2L, 2L,
1L, 1L, 1L, 1L), JobRole = structure(c(8L, 7L, 3L, 7L, 3L,
3L), .Label = c("Healthcare_Representative", "Human_Resources",
"Laboratory_Technician", "Manager", "Manufacturing_Director",
"Research_Director", "Research_Scientist", "Sales_Executive",
"Sales_Representative"), class = "factor"), JobSatisfaction = structure(c(4L,
2L, 3L, 3L, 2L, 4L), .Label = c("Low", "Medium", "High",
"Very_High"), class = c("ordered", "factor")), MaritalStatus = structure(c(3L,
2L, 3L, 2L, 2L, 3L), .Label = c("Divorced", "Married", "Single"
), class = "factor"), MonthlyIncome = c(5993L, 5130L, 2090L,
2909L, 3468L, 3068L), MonthlyRate = c(19479L, 24907L, 2396L,
23159L, 16632L, 11864L), NumCompaniesWorked = c(8L, 1L, 6L,
1L, 9L, 0L), OverTime = structure(c(2L, 1L, 2L, 2L, 1L, 1L
), .Label = c("No", "Yes"), class = "factor"), PercentSalaryHike = c(11L,
23L, 15L, 11L, 12L, 13L), PerformanceRating = structure(c(3L,
4L, 3L, 3L, 3L, 3L), .Label = c("Low", "Good", "Excellent",
"Outstanding"), class = c("ordered", "factor")), RelationshipSatisfaction = structure(c(1L,
4L, 2L, 3L, 4L, 3L), .Label = c("Low", "Medium", "High",
"Very_High"), class = c("ordered", "factor")), StockOptionLevel = c(0L,
1L, 0L, 0L, 1L, 0L), TotalWorkingYears = c(8L, 10L, 7L, 8L,
6L, 8L), TrainingTimesLastYear = c(0L, 3L, 3L, 3L, 3L, 2L
), WorkLifeBalance = structure(c(1L, 3L, 3L, 3L, 3L, 2L), .Label = c("Bad",
"Good", "Better", "Best"), class = c("ordered", "factor")),
YearsAtCompany = c(6L, 10L, 0L, 8L, 2L, 7L), YearsInCurrentRole = c(4L,
7L, 0L, 7L, 2L, 7L), YearsSinceLastPromotion = c(0L, 1L,
0L, 3L, 2L, 3L), YearsWithCurrManager = c(5L, 7L, 0L, 0L,
2L, 6L)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
如果您只对 R2
感兴趣,我认为您不需要 map
功能:您可以按部门分组,然后直接提取 R2
:
attrition %>%
group_by(Department) %>%
mutate(r_squared = summary(lm(MonthlyIncome ~ Age))[['r.squared']])
如果您坚持使用map
函数,您必须确保您确实提供了一个函数:
attrition %>%
group_by(Department) %>%
mutate(lm_summary = list(summary(lm(MonthlyIncome ~ Age)))) %>%
mutate(r_squared = purrr::map_dbl(lm_summary, function(x) x[["r.squared"]]))
您可以使用 pluck
从每个 lm
模型中提取 "r.squared"
。
library(dplyr)
library(purrr)
attrition %>%
group_by(Department) %>%
summarise(lm_summary = list(summary(lm(MonthlyIncome ~ Age))),
r_squared = map_dbl(lm_summary, pluck, "r.squared"))
# Department lm_summary r_squared
# <fct> <list> <dbl>
#1 Research_Development <smmry.lm> 0.389
#2 Sales <smmry.lm> 0
如果要保留数据中的所有行,可以使用 mutate
而不是 summarise
。
使用base R
do.call(rbind, lapply(split(attrition, attrition$Department, drop = TRUE),
function(x) {
smry <- summary(lm(MonthlyIncome ~ Age, data = x))
r_squared <- smry$r.squared
data.frame(lm_summary = I(list(smry)), r_squared)
}))
-输出
# lm_summary r_squared
#Research_Development lm(formu.... 0.3890383
#Sales lm(formu.... 0.0000000
此外,我们可以使用 dplyr
来完成此操作。优点是我们仍然可以将模型摘要作为 list
与 r.squared
一起获得,而无需使用另一个包
library(dplyr)
attrition %>%
nest_by(Department) %>%
mutate(lm_summary = list(summary(lm(MonthlyIncome ~ Age, data = data))),
r_squared = lm_summary$r.squared) %>%
ungroup
# A tibble: 2 x 4
# Department data lm_summary r_squared
# <fct> <list<tibble>> <list> <dbl>
#1 Research_Development [5 × 30] <smmry.lm> 0.389
#2 Sales [1 × 30] <smmry.lm> 0
您也可以使用以下解决方案。尽管某些方面与已经 post 编辑的解决方案非常相似,但我认为 post 它会很好。
library(dplyr)
library(purrr)
library(tidyr)
library(broom)
attrition %>%
group_nest(Department) %>%
mutate(model = map(data, ~ lm(MonthlyIncome ~ Age, data = .)),
summary = map(model, ~ glance(.x))) %>%
unnest(summary) %>%
select(Department, r.squared)
# A tibble: 2 x 2
Department r.squared
<fct> <dbl>
1 Research_Development 0.389
2 Sales 0
我正在使用 attrition
数据为每个部门的 Age 和 MonthlyIncome 之间的关系建立线性模型在减员数据中。此外,我想使用 map 函数通过从 lm_summary
中提取实值 r.squared
元素来添加列 r_squared
,从而量化拟合优度。
我的代码:
attrition %>%
group_by(Department) %>%
mutate(lm_summary = list(summary(lm(MonthlyIncome ~ Age)))) %>%
mutate(r_squared = map_dfc(lm_summary, "r.squared"))
执行时出现以下错误:
essentially perfect fit: summary may be unreliableessentially perfect fit: summary may be unreliableessentially perfect fit: summary may be unreliableessentially perfect fit: summary may be unreliableNew names:
* NA -> ...1
New names:
* NA -> ...1
New names:
* NA -> ...1
这将进入无限循环。不确定我是否错误地使用了这些功能,或者我应该互换它们。
> dput(head(attrition))
structure(list(Age = c(41L, 49L, 37L, 33L, 27L, 32L), Attrition = structure(c(2L,
1L, 2L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
BusinessTravel = structure(c(3L, 2L, 3L, 2L, 3L, 2L), .Label = c("Non-Travel",
"Travel_Frequently", "Travel_Rarely"), class = "factor"),
DailyRate = c(1102L, 279L, 1373L, 1392L, 591L, 1005L), Department = structure(c(3L,
2L, 2L, 2L, 2L, 2L), .Label = c("Human_Resources", "Research_Development",
"Sales"), class = "factor"), DistanceFromHome = c(1L, 8L,
2L, 3L, 2L, 2L), Education = structure(c(2L, 1L, 2L, 4L,
1L, 2L), .Label = c("Below_College", "College", "Bachelor",
"Master", "Doctor"), class = c("ordered", "factor")), EducationField = structure(c(2L,
2L, 5L, 2L, 4L, 2L), .Label = c("Human_Resources", "Life_Sciences",
"Marketing", "Medical", "Other", "Technical_Degree"), class = "factor"),
EnvironmentSatisfaction = structure(c(2L, 3L, 4L, 4L, 1L,
4L), .Label = c("Low", "Medium", "High", "Very_High"), class = c("ordered",
"factor")), Gender = structure(c(1L, 2L, 2L, 1L, 2L, 2L), .Label = c("Female",
"Male"), class = "factor"), HourlyRate = c(94L, 61L, 92L,
56L, 40L, 79L), JobInvolvement = structure(c(3L, 2L, 2L,
3L, 3L, 3L), .Label = c("Low", "Medium", "High", "Very_High"
), class = c("ordered", "factor")), JobLevel = c(2L, 2L,
1L, 1L, 1L, 1L), JobRole = structure(c(8L, 7L, 3L, 7L, 3L,
3L), .Label = c("Healthcare_Representative", "Human_Resources",
"Laboratory_Technician", "Manager", "Manufacturing_Director",
"Research_Director", "Research_Scientist", "Sales_Executive",
"Sales_Representative"), class = "factor"), JobSatisfaction = structure(c(4L,
2L, 3L, 3L, 2L, 4L), .Label = c("Low", "Medium", "High",
"Very_High"), class = c("ordered", "factor")), MaritalStatus = structure(c(3L,
2L, 3L, 2L, 2L, 3L), .Label = c("Divorced", "Married", "Single"
), class = "factor"), MonthlyIncome = c(5993L, 5130L, 2090L,
2909L, 3468L, 3068L), MonthlyRate = c(19479L, 24907L, 2396L,
23159L, 16632L, 11864L), NumCompaniesWorked = c(8L, 1L, 6L,
1L, 9L, 0L), OverTime = structure(c(2L, 1L, 2L, 2L, 1L, 1L
), .Label = c("No", "Yes"), class = "factor"), PercentSalaryHike = c(11L,
23L, 15L, 11L, 12L, 13L), PerformanceRating = structure(c(3L,
4L, 3L, 3L, 3L, 3L), .Label = c("Low", "Good", "Excellent",
"Outstanding"), class = c("ordered", "factor")), RelationshipSatisfaction = structure(c(1L,
4L, 2L, 3L, 4L, 3L), .Label = c("Low", "Medium", "High",
"Very_High"), class = c("ordered", "factor")), StockOptionLevel = c(0L,
1L, 0L, 0L, 1L, 0L), TotalWorkingYears = c(8L, 10L, 7L, 8L,
6L, 8L), TrainingTimesLastYear = c(0L, 3L, 3L, 3L, 3L, 2L
), WorkLifeBalance = structure(c(1L, 3L, 3L, 3L, 3L, 2L), .Label = c("Bad",
"Good", "Better", "Best"), class = c("ordered", "factor")),
YearsAtCompany = c(6L, 10L, 0L, 8L, 2L, 7L), YearsInCurrentRole = c(4L,
7L, 0L, 7L, 2L, 7L), YearsSinceLastPromotion = c(0L, 1L,
0L, 3L, 2L, 3L), YearsWithCurrManager = c(5L, 7L, 0L, 0L,
2L, 6L)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
如果您只对 R2
感兴趣,我认为您不需要 map
功能:您可以按部门分组,然后直接提取 R2
:
attrition %>%
group_by(Department) %>%
mutate(r_squared = summary(lm(MonthlyIncome ~ Age))[['r.squared']])
如果您坚持使用map
函数,您必须确保您确实提供了一个函数:
attrition %>%
group_by(Department) %>%
mutate(lm_summary = list(summary(lm(MonthlyIncome ~ Age)))) %>%
mutate(r_squared = purrr::map_dbl(lm_summary, function(x) x[["r.squared"]]))
您可以使用 pluck
从每个 lm
模型中提取 "r.squared"
。
library(dplyr)
library(purrr)
attrition %>%
group_by(Department) %>%
summarise(lm_summary = list(summary(lm(MonthlyIncome ~ Age))),
r_squared = map_dbl(lm_summary, pluck, "r.squared"))
# Department lm_summary r_squared
# <fct> <list> <dbl>
#1 Research_Development <smmry.lm> 0.389
#2 Sales <smmry.lm> 0
如果要保留数据中的所有行,可以使用 mutate
而不是 summarise
。
使用base R
do.call(rbind, lapply(split(attrition, attrition$Department, drop = TRUE),
function(x) {
smry <- summary(lm(MonthlyIncome ~ Age, data = x))
r_squared <- smry$r.squared
data.frame(lm_summary = I(list(smry)), r_squared)
}))
-输出
# lm_summary r_squared
#Research_Development lm(formu.... 0.3890383
#Sales lm(formu.... 0.0000000
此外,我们可以使用 dplyr
来完成此操作。优点是我们仍然可以将模型摘要作为 list
与 r.squared
一起获得,而无需使用另一个包
library(dplyr)
attrition %>%
nest_by(Department) %>%
mutate(lm_summary = list(summary(lm(MonthlyIncome ~ Age, data = data))),
r_squared = lm_summary$r.squared) %>%
ungroup
# A tibble: 2 x 4
# Department data lm_summary r_squared
# <fct> <list<tibble>> <list> <dbl>
#1 Research_Development [5 × 30] <smmry.lm> 0.389
#2 Sales [1 × 30] <smmry.lm> 0
您也可以使用以下解决方案。尽管某些方面与已经 post 编辑的解决方案非常相似,但我认为 post 它会很好。
library(dplyr)
library(purrr)
library(tidyr)
library(broom)
attrition %>%
group_nest(Department) %>%
mutate(model = map(data, ~ lm(MonthlyIncome ~ Age, data = .)),
summary = map(model, ~ glance(.x))) %>%
unnest(summary) %>%
select(Department, r.squared)
# A tibble: 2 x 2
Department r.squared
<fct> <dbl>
1 Research_Development 0.389
2 Sales 0