从 R 中的 map_*() 构建线性回归模型

Building linear regression model from map_*() in R

我正在使用 attrition 数据为每个部门的 AgeMonthlyIncome 之间的关系建立线性模型在减员数据中。此外,我想使用 map 函数通过从 lm_summary 中提取实值 r.squared 元素来添加列 r_squared,从而量化拟合优度。

我的代码:

attrition %>%
  group_by(Department) %>%
  mutate(lm_summary = list(summary(lm(MonthlyIncome ~ Age)))) %>%
  mutate(r_squared = map_dfc(lm_summary, "r.squared"))

执行时出现以下错误:

essentially perfect fit: summary may be unreliableessentially perfect fit: summary may be unreliableessentially perfect fit: summary may be unreliableessentially perfect fit: summary may be unreliableNew names:
* NA -> ...1
New names:
* NA -> ...1
New names:
* NA -> ...1

这将进入无限循环。不确定我是否错误地使用了这些功能,或者我应该互换它们。

> dput(head(attrition))
structure(list(Age = c(41L, 49L, 37L, 33L, 27L, 32L), Attrition = structure(c(2L, 
1L, 2L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    BusinessTravel = structure(c(3L, 2L, 3L, 2L, 3L, 2L), .Label = c("Non-Travel", 
    "Travel_Frequently", "Travel_Rarely"), class = "factor"), 
    DailyRate = c(1102L, 279L, 1373L, 1392L, 591L, 1005L), Department = structure(c(3L, 
    2L, 2L, 2L, 2L, 2L), .Label = c("Human_Resources", "Research_Development", 
    "Sales"), class = "factor"), DistanceFromHome = c(1L, 8L, 
    2L, 3L, 2L, 2L), Education = structure(c(2L, 1L, 2L, 4L, 
    1L, 2L), .Label = c("Below_College", "College", "Bachelor", 
    "Master", "Doctor"), class = c("ordered", "factor")), EducationField = structure(c(2L, 
    2L, 5L, 2L, 4L, 2L), .Label = c("Human_Resources", "Life_Sciences", 
    "Marketing", "Medical", "Other", "Technical_Degree"), class = "factor"), 
    EnvironmentSatisfaction = structure(c(2L, 3L, 4L, 4L, 1L, 
    4L), .Label = c("Low", "Medium", "High", "Very_High"), class = c("ordered", 
    "factor")), Gender = structure(c(1L, 2L, 2L, 1L, 2L, 2L), .Label = c("Female", 
    "Male"), class = "factor"), HourlyRate = c(94L, 61L, 92L, 
    56L, 40L, 79L), JobInvolvement = structure(c(3L, 2L, 2L, 
    3L, 3L, 3L), .Label = c("Low", "Medium", "High", "Very_High"
    ), class = c("ordered", "factor")), JobLevel = c(2L, 2L, 
    1L, 1L, 1L, 1L), JobRole = structure(c(8L, 7L, 3L, 7L, 3L, 
    3L), .Label = c("Healthcare_Representative", "Human_Resources", 
    "Laboratory_Technician", "Manager", "Manufacturing_Director", 
    "Research_Director", "Research_Scientist", "Sales_Executive", 
    "Sales_Representative"), class = "factor"), JobSatisfaction = structure(c(4L, 
    2L, 3L, 3L, 2L, 4L), .Label = c("Low", "Medium", "High", 
    "Very_High"), class = c("ordered", "factor")), MaritalStatus = structure(c(3L, 
    2L, 3L, 2L, 2L, 3L), .Label = c("Divorced", "Married", "Single"
    ), class = "factor"), MonthlyIncome = c(5993L, 5130L, 2090L, 
    2909L, 3468L, 3068L), MonthlyRate = c(19479L, 24907L, 2396L, 
    23159L, 16632L, 11864L), NumCompaniesWorked = c(8L, 1L, 6L, 
    1L, 9L, 0L), OverTime = structure(c(2L, 1L, 2L, 2L, 1L, 1L
    ), .Label = c("No", "Yes"), class = "factor"), PercentSalaryHike = c(11L, 
    23L, 15L, 11L, 12L, 13L), PerformanceRating = structure(c(3L, 
    4L, 3L, 3L, 3L, 3L), .Label = c("Low", "Good", "Excellent", 
    "Outstanding"), class = c("ordered", "factor")), RelationshipSatisfaction = structure(c(1L, 
    4L, 2L, 3L, 4L, 3L), .Label = c("Low", "Medium", "High", 
    "Very_High"), class = c("ordered", "factor")), StockOptionLevel = c(0L, 
    1L, 0L, 0L, 1L, 0L), TotalWorkingYears = c(8L, 10L, 7L, 8L, 
    6L, 8L), TrainingTimesLastYear = c(0L, 3L, 3L, 3L, 3L, 2L
    ), WorkLifeBalance = structure(c(1L, 3L, 3L, 3L, 3L, 2L), .Label = c("Bad", 
    "Good", "Better", "Best"), class = c("ordered", "factor")), 
    YearsAtCompany = c(6L, 10L, 0L, 8L, 2L, 7L), YearsInCurrentRole = c(4L, 
    7L, 0L, 7L, 2L, 7L), YearsSinceLastPromotion = c(0L, 1L, 
    0L, 3L, 2L, 3L), YearsWithCurrManager = c(5L, 7L, 0L, 0L, 
    2L, 6L)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", 
"data.frame"))

如果您只对 R2 感兴趣,我认为您不需要 map 功能:您可以按部门分组,然后直接提取 R2

attrition %>%
    group_by(Department) %>%
    mutate(r_squared = summary(lm(MonthlyIncome ~ Age))[['r.squared']]) 

如果您坚持使用map函数,您必须确保您确实提供了一个函数:

attrition %>%
    group_by(Department) %>%
    mutate(lm_summary = list(summary(lm(MonthlyIncome ~ Age)))) %>%
    mutate(r_squared = purrr::map_dbl(lm_summary, function(x) x[["r.squared"]])) 

您可以使用 pluck 从每个 lm 模型中提取 "r.squared"

library(dplyr)
library(purrr)

attrition %>%
  group_by(Department) %>%
  summarise(lm_summary = list(summary(lm(MonthlyIncome ~ Age))), 
            r_squared = map_dbl(lm_summary, pluck, "r.squared"))

#  Department           lm_summary r_squared
#  <fct>                <list>         <dbl>
#1 Research_Development <smmry.lm>     0.389
#2 Sales                <smmry.lm>     0    

如果要保留数据中的所有行,可以使用 mutate 而不是 summarise

使用base R

do.call(rbind, lapply(split(attrition, attrition$Department, drop = TRUE), 
  function(x) {
            smry <- summary(lm(MonthlyIncome ~ Age, data = x))
            r_squared <- smry$r.squared
             data.frame(lm_summary = I(list(smry)), r_squared)

        }))

-输出

#                     lm_summary r_squared
#Research_Development lm(formu.... 0.3890383
#Sales                lm(formu.... 0.0000000

此外,我们可以使用 dplyr 来完成此操作。优点是我们仍然可以将模型摘要作为 listr.squared 一起获得,而无需使用另一个包

library(dplyr)
attrition %>%
   nest_by(Department) %>% 
   mutate(lm_summary = list(summary(lm(MonthlyIncome ~ Age, data = data))),
         r_squared = lm_summary$r.squared) %>%
   ungroup
# A tibble: 2 x 4
#  Department                     data lm_summary r_squared
#  <fct>                <list<tibble>> <list>         <dbl>
#1 Research_Development       [5 × 30] <smmry.lm>     0.389
#2 Sales                      [1 × 30] <smmry.lm>     0    

您也可以使用以下解决方案。尽管某些方面与已经 post 编辑的解决方案非常相似,但我认为 post 它会很好。

library(dplyr)
library(purrr)
library(tidyr)
library(broom)

attrition %>%
  group_nest(Department) %>% 
  mutate(model = map(data, ~ lm(MonthlyIncome ~ Age, data = .)), 
         summary = map(model, ~ glance(.x))) %>%
  unnest(summary) %>%
  select(Department, r.squared)

# A tibble: 2 x 2
  Department           r.squared
  <fct>                    <dbl>
1 Research_Development     0.389
2 Sales                    0