select purrr 循环中的非缺失变量
select non-missing variables in a purrr loop
考虑这个例子
mydata <- data_frame(ind_1 = c(NA,NA,3,4),
ind_2 = c(2,3,4,5),
ind_3 = c(5,6,NA,NA),
y = c(28,34,25,12),
group = c('a','a','b','b'))
> mydata
# A tibble: 4 x 5
ind_1 ind_2 ind_3 y group
<dbl> <dbl> <dbl> <dbl> <chr>
1 NA 2 5 28 a
2 NA 3 6 34 a
3 3 4 NA 25 b
4 4 5 NA 12 b
在这里我想要,对于每个 group
,回归 y
在该组中不丢失的任何变量,并将相应的 lm
对象存储在 list-column
.
即:
- 对于组
a
,这些变量对应于ind_2
和ind_3
- 组
b
,对应ind_1
和ind_2
我尝试了以下方法,但这不起作用
mydata %>% group_by(group) %>% nest() %>%
do(filtered_df <- . %>% select(which(colMeans(is.na(.)) == 0)),
myreg = lm(y~ names(filtered_df)))
有什么想法吗?谢谢!
我们可以使用map
和mutate
。我们可以一步 select
和建模 (nestdat1
),或者如果您想保留过滤后的数据 (nestdat2
),则可以在单独的步骤中使用两个 map
:
library(tidyverse)
nestdat1 <- mydata %>%
group_by(group) %>%
nest() %>%
mutate(model = data %>% map(~ select_if(., funs(!any(is.na(.)))) %>%
lm(y ~ ., data = .)))
nestdat2 <- mydata %>%
group_by(group) %>%
nest() %>%
mutate(data = data %>% map(~ select_if(., funs(!any(is.na(.))))),
model = data %>% map(~ lm(y ~ ., data = .)))
输出:
它们产生不同的 data
列:
> nestdat1 %>% pull(data)
[[1]]
# A tibble: 2 x 4
ind_1 ind_2 ind_3 y
<dbl> <dbl> <dbl> <dbl>
1 NA 2 5 28
2 NA 3 6 34
[[2]]
# A tibble: 2 x 4
ind_1 ind_2 ind_3 y
<dbl> <dbl> <dbl> <dbl>
1 3 4 NA 25
2 4 5 NA 12
> nestdat2 %>% pull(data)
[[1]]
# A tibble: 2 x 3
ind_2 ind_3 y
<dbl> <dbl> <dbl>
1 2 5 28
2 3 6 34
[[2]]
# A tibble: 2 x 3
ind_1 ind_2 y
<dbl> <dbl> <dbl>
1 3 4 25
2 4 5 12
但相同的model
列:
> nestdat1 %>% pull(model)
[[1]]
Call:
lm(formula = y ~ ., data = .)
Coefficients:
(Intercept) ind_2 ind_3
16 6 NA
[[2]]
Call:
lm(formula = y ~ ., data = .)
Coefficients:
(Intercept) ind_1 ind_2
64 -13 NA
> nestdat2 %>% pull(model)
[[1]]
Call:
lm(formula = y ~ ., data = .)
Coefficients:
(Intercept) ind_2 ind_3
16 6 NA
[[2]]
Call:
lm(formula = y ~ ., data = .)
Coefficients:
(Intercept) ind_1 ind_2
64 -13 NA
这是另一个 tidyverse
选项,如果您希望将其保留在 tibble
中,请分配给 mydata$model
:
library(tidyverse)
mydata %>%
nest(-group) %>%
pull(data) %>%
map(~lm(y ~., discard(.,anyNA)))
# [[1]]
#
# Call:
# lm(formula = y ~ ., data = discard(., anyNA))
#
# Coefficients:
# (Intercept) ind_2 ind_3
# 16 6 NA
#
#
# [[2]]
#
# Call:
# lm(formula = y ~ ., data = discard(., anyNA))
#
# Coefficients:
# (Intercept) ind_1 ind_2
# 64 -13 NA
#
#
考虑这个例子
mydata <- data_frame(ind_1 = c(NA,NA,3,4),
ind_2 = c(2,3,4,5),
ind_3 = c(5,6,NA,NA),
y = c(28,34,25,12),
group = c('a','a','b','b'))
> mydata
# A tibble: 4 x 5
ind_1 ind_2 ind_3 y group
<dbl> <dbl> <dbl> <dbl> <chr>
1 NA 2 5 28 a
2 NA 3 6 34 a
3 3 4 NA 25 b
4 4 5 NA 12 b
在这里我想要,对于每个 group
,回归 y
在该组中不丢失的任何变量,并将相应的 lm
对象存储在 list-column
.
即:
- 对于组
a
,这些变量对应于ind_2
和ind_3
- 组
b
,对应ind_1
和ind_2
我尝试了以下方法,但这不起作用
mydata %>% group_by(group) %>% nest() %>%
do(filtered_df <- . %>% select(which(colMeans(is.na(.)) == 0)),
myreg = lm(y~ names(filtered_df)))
有什么想法吗?谢谢!
我们可以使用map
和mutate
。我们可以一步 select
和建模 (nestdat1
),或者如果您想保留过滤后的数据 (nestdat2
),则可以在单独的步骤中使用两个 map
:
library(tidyverse)
nestdat1 <- mydata %>%
group_by(group) %>%
nest() %>%
mutate(model = data %>% map(~ select_if(., funs(!any(is.na(.)))) %>%
lm(y ~ ., data = .)))
nestdat2 <- mydata %>%
group_by(group) %>%
nest() %>%
mutate(data = data %>% map(~ select_if(., funs(!any(is.na(.))))),
model = data %>% map(~ lm(y ~ ., data = .)))
输出:
它们产生不同的 data
列:
> nestdat1 %>% pull(data)
[[1]]
# A tibble: 2 x 4
ind_1 ind_2 ind_3 y
<dbl> <dbl> <dbl> <dbl>
1 NA 2 5 28
2 NA 3 6 34
[[2]]
# A tibble: 2 x 4
ind_1 ind_2 ind_3 y
<dbl> <dbl> <dbl> <dbl>
1 3 4 NA 25
2 4 5 NA 12
> nestdat2 %>% pull(data)
[[1]]
# A tibble: 2 x 3
ind_2 ind_3 y
<dbl> <dbl> <dbl>
1 2 5 28
2 3 6 34
[[2]]
# A tibble: 2 x 3
ind_1 ind_2 y
<dbl> <dbl> <dbl>
1 3 4 25
2 4 5 12
但相同的model
列:
> nestdat1 %>% pull(model)
[[1]]
Call:
lm(formula = y ~ ., data = .)
Coefficients:
(Intercept) ind_2 ind_3
16 6 NA
[[2]]
Call:
lm(formula = y ~ ., data = .)
Coefficients:
(Intercept) ind_1 ind_2
64 -13 NA
> nestdat2 %>% pull(model)
[[1]]
Call:
lm(formula = y ~ ., data = .)
Coefficients:
(Intercept) ind_2 ind_3
16 6 NA
[[2]]
Call:
lm(formula = y ~ ., data = .)
Coefficients:
(Intercept) ind_1 ind_2
64 -13 NA
这是另一个 tidyverse
选项,如果您希望将其保留在 tibble
中,请分配给 mydata$model
:
library(tidyverse)
mydata %>%
nest(-group) %>%
pull(data) %>%
map(~lm(y ~., discard(.,anyNA)))
# [[1]]
#
# Call:
# lm(formula = y ~ ., data = discard(., anyNA))
#
# Coefficients:
# (Intercept) ind_2 ind_3
# 16 6 NA
#
#
# [[2]]
#
# Call:
# lm(formula = y ~ ., data = discard(., anyNA))
#
# Coefficients:
# (Intercept) ind_1 ind_2
# 64 -13 NA
#
#