使用 srvyr 包按组划分的比例
Proportions by group with srvyr package
嗨,我有一个带有权重列的数据框,如示例所示:
df <- tibble::tribble(
~id, ~edu, ~q_d1, ~q_d2_1, ~weight,
1L, 1L, 1L, 0L, 1740,
2L, 1L, 1L, 0L, 1428,
3L, 2L, 1L, 2L, 496,
4L, 2L, 1L, 2L, 550,
5L, 3L, 1L, 1L, 1762,
6L, 4L, 1L, 0L, 1004,
7L, 5L, 1L, 0L, 522,
8L, 3L, 2L, 0L, 1099,
9L, 4L, 2L, 2L, 1295
)
我使用 srvyr 包来计算组的汇总统计信息。我的脚本:
sv_design_test <- df %>%
srvyr::as_survey_design(weights = weight)
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
group_by(smartphone) %>%
summarize(proportion = srvyr::survey_mean(),
total = srvyr::survey_total(),
total_unweighted = srvyr::unweighted(n())) %>%
select(-proportion_se, -total_se )
输出:
# A tibble: 3 x 4
smartphone proportion total total_unweighted
<chr> <dbl> <dbl> <int>
1 No Internet 0.242 2394 2
2 No smartphone 0.474 4694 4
3 smartphone 0.284 2808 3
但是当我将教育 (edu) 添加到 group_by 时出现错误:
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
group_by(edu, smartphone) %>%
summarize(proportion = srvyr::survey_mean(),
total = srvyr::survey_total(),
total_unweighted = srvyr::unweighted(n())) %>%
select(-proportion_se, -total_se )
错误信息是:
Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
contrasts can be applied only to factors with 2 or more levels
看来您实际上并不需要 srvyr
df %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
group_by(smartphone) %>%
summarise(total = sum(weight),
total_unweighted = n()) %>%
mutate(proportion = prop.table(total))
# A tibble: 3 x 4
smartphone total total_unweighted proportion
<chr> <dbl> <int> <dbl>
1 No Internet 2394 2 0.242
2 No smartphone 4694 4 0.474
3 smartphone 2808 3 0.284
df %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
group_by(edu, smartphone) %>%
summarise(total = sum(weight),
total_unweighted = n()) %>%
mutate(proportion = prop.table(total))
# A tibble: 7 x 5
# Groups: edu [5]
edu smartphone total total_unweighted proportion
<int> <chr> <dbl> <int> <dbl>
1 1 No smartphone 3168 2 1
2 2 smartphone 1046 2 1
3 3 No Internet 1099 1 0.384
4 3 smartphone 1762 1 0.616
5 4 No Internet 1295 1 0.563
6 4 No smartphone 1004 1 0.437
7 5 No smartphone 522 1 1
问题
您的错误消息(关于对比的错误消息)说您需要使用因子作为分组变量。在您的原始数据框中,edu
是数字,因此您可以在创建调查设计之前将其转换为一个因子。
library(tidyverse)
library(srvyr)
# ...
sv_design_test <- df %>%
mutate(edu = as.factor(edu)) %>%
srvyr::as_survey_design(weights = weight)
然后在您创建 smartphone
之后,也将其转换为一个因数:
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone))
在您的第二条错误消息(关于长度的错误消息)中,这是因为您的 summarise
中有函数 return 不同的行数。您可以通过分别调用这些函数来验证这一点(错误消息说它是参数 3,意思是 n = unweighted(n())
,问题所在)。
这 returns 15 行:
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(prop = survey_mean(),
total = survey_total())
#> # A tibble: 15 x 6
#> edu smartphone prop prop_se total total_se
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 1 No Internet 0 0 0 0
#> 2 1 No smartphone 1 0 3168 2108.
#> 3 1 smartphone 0 0 0 0
#> 4 2 No Internet 0 0 0 0
#> 5 2 No smartphone 0 0 0 0
#> 6 2 smartphone 1 0 1046 693.
#> 7 3 No Internet 0.384 0.355 1099 1099.
#> 8 3 No smartphone 0 0 0 0
#> 9 3 smartphone 0.616 0.355 1762 1762.
#> 10 4 No Internet 0.563 0.369 1295 1295.
#> 11 4 No smartphone 0.437 0.369 1004 1004
#> 12 4 smartphone 0 0 0 0
#> 13 5 No Internet 0 0 0 0
#> 14 5 No smartphone 1 0 522 522
#> 15 5 smartphone 0 0 0 0
虽然这个return只有7个,因为edu
和smartphone
的组合只有7个出现,因此只有7个被计算在内。
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(n = unweighted(n()))
#> # A tibble: 7 x 3
#> edu smartphone n
#> <fct> <fct> <int>
#> 1 1 No smartphone 2
#> 2 2 smartphone 2
#> 3 3 No Internet 1
#> 4 3 smartphone 1
#> 5 4 No Internet 1
#> 6 4 No smartphone 1
#> 7 5 No smartphone 1
解决方案 1:在 group_by()
中使用 .drop = FALSE
通过使用 group_by()
函数的 .drop
参数,您可以强制 summarize()
生成结果,即使对于未出现在数据中的因子水平组合也是如此。
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone,
.drop = FALSE) %>%
summarize(prop= srvyr::survey_mean(),
total = srvyr::survey_total(),
total_unweighted = srvyr::unweighted(n()))
#> # A tibble: 15 x 7
#> edu smartphone prop prop_se total total_se total_unweighted
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 No Internet 0 0 0 0 0
#> 2 1 No smartphone 1 0 3168 2108. 2
#> 3 1 smartphone 0 0 0 0 0
#> 4 2 No Internet 0 0 0 0 0
#> 5 2 No smartphone 0 0 0 0 0
#> 6 2 smartphone 1 0 1046 693. 2
#> 7 3 No Internet 0.384 0.355 1099 1099. 1
#> 8 3 No smartphone 0 0 0 0 0
#> 9 3 smartphone 0.616 0.355 1762 1762. 1
#> 10 4 No Internet 0.563 0.369 1295 1295. 1
#> 11 4 No smartphone 0.437 0.369 1004 1004 1
#> 12 4 smartphone 0 0 0 0 0
#> 13 5 No Internet 0 0 0 0 0
#> 14 5 No smartphone 1 0 522 522 1
#> 15 5 smartphone 0 0 0 0 0
解决方案 2:加入
您可以制作 2 个不同的汇总数据框,然后加入它们。
我在 n()
之后添加了对 complete
的调用以填充缺失的关卡。制作两个数据框并将它们连接起来得到以下结果:
props <- sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(prop = survey_mean(),
total = survey_total())
counts <- sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(n = unweighted(n())) %>%
complete(edu, smartphone, fill = list(n = 0))
left_join(props, counts, by = c("edu", "smartphone"))
#> # A tibble: 15 x 7
#> edu smartphone prop prop_se total total_se n
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 No Internet 0 0 0 0 0
#> 2 1 No smartphone 1 0 3168 2108. 2
#> 3 1 smartphone 0 0 0 0 0
#> 4 2 No Internet 0 0 0 0 0
#> 5 2 No smartphone 0 0 0 0 0
#> 6 2 smartphone 1 0 1046 693. 2
#> 7 3 No Internet 0.384 0.355 1099 1099. 1
#> 8 3 No smartphone 0 0 0 0 0
#> 9 3 smartphone 0.616 0.355 1762 1762. 1
#> 10 4 No Internet 0.563 0.369 1295 1295. 1
#> 11 4 No smartphone 0.437 0.369 1004 1004 1
#> 12 4 smartphone 0 0 0 0 0
#> 13 5 No Internet 0 0 0 0 0
#> 14 5 No smartphone 1 0 522 522 1
#> 15 5 smartphone 0 0 0 0 0
嗨,我有一个带有权重列的数据框,如示例所示:
df <- tibble::tribble(
~id, ~edu, ~q_d1, ~q_d2_1, ~weight,
1L, 1L, 1L, 0L, 1740,
2L, 1L, 1L, 0L, 1428,
3L, 2L, 1L, 2L, 496,
4L, 2L, 1L, 2L, 550,
5L, 3L, 1L, 1L, 1762,
6L, 4L, 1L, 0L, 1004,
7L, 5L, 1L, 0L, 522,
8L, 3L, 2L, 0L, 1099,
9L, 4L, 2L, 2L, 1295
)
我使用 srvyr 包来计算组的汇总统计信息。我的脚本:
sv_design_test <- df %>%
srvyr::as_survey_design(weights = weight)
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
group_by(smartphone) %>%
summarize(proportion = srvyr::survey_mean(),
total = srvyr::survey_total(),
total_unweighted = srvyr::unweighted(n())) %>%
select(-proportion_se, -total_se )
输出:
# A tibble: 3 x 4
smartphone proportion total total_unweighted
<chr> <dbl> <dbl> <int>
1 No Internet 0.242 2394 2
2 No smartphone 0.474 4694 4
3 smartphone 0.284 2808 3
但是当我将教育 (edu) 添加到 group_by 时出现错误:
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
group_by(edu, smartphone) %>%
summarize(proportion = srvyr::survey_mean(),
total = srvyr::survey_total(),
total_unweighted = srvyr::unweighted(n())) %>%
select(-proportion_se, -total_se )
错误信息是:
Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
contrasts can be applied only to factors with 2 or more levels
看来您实际上并不需要 srvyr
df %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
group_by(smartphone) %>%
summarise(total = sum(weight),
total_unweighted = n()) %>%
mutate(proportion = prop.table(total))
# A tibble: 3 x 4
smartphone total total_unweighted proportion
<chr> <dbl> <int> <dbl>
1 No Internet 2394 2 0.242
2 No smartphone 4694 4 0.474
3 smartphone 2808 3 0.284
df %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
group_by(edu, smartphone) %>%
summarise(total = sum(weight),
total_unweighted = n()) %>%
mutate(proportion = prop.table(total))
# A tibble: 7 x 5
# Groups: edu [5]
edu smartphone total total_unweighted proportion
<int> <chr> <dbl> <int> <dbl>
1 1 No smartphone 3168 2 1
2 2 smartphone 1046 2 1
3 3 No Internet 1099 1 0.384
4 3 smartphone 1762 1 0.616
5 4 No Internet 1295 1 0.563
6 4 No smartphone 1004 1 0.437
7 5 No smartphone 522 1 1
问题
您的错误消息(关于对比的错误消息)说您需要使用因子作为分组变量。在您的原始数据框中,edu
是数字,因此您可以在创建调查设计之前将其转换为一个因子。
library(tidyverse)
library(srvyr)
# ...
sv_design_test <- df %>%
mutate(edu = as.factor(edu)) %>%
srvyr::as_survey_design(weights = weight)
然后在您创建 smartphone
之后,也将其转换为一个因数:
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone))
在您的第二条错误消息(关于长度的错误消息)中,这是因为您的 summarise
中有函数 return 不同的行数。您可以通过分别调用这些函数来验证这一点(错误消息说它是参数 3,意思是 n = unweighted(n())
,问题所在)。
这 returns 15 行:
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(prop = survey_mean(),
total = survey_total())
#> # A tibble: 15 x 6
#> edu smartphone prop prop_se total total_se
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 1 No Internet 0 0 0 0
#> 2 1 No smartphone 1 0 3168 2108.
#> 3 1 smartphone 0 0 0 0
#> 4 2 No Internet 0 0 0 0
#> 5 2 No smartphone 0 0 0 0
#> 6 2 smartphone 1 0 1046 693.
#> 7 3 No Internet 0.384 0.355 1099 1099.
#> 8 3 No smartphone 0 0 0 0
#> 9 3 smartphone 0.616 0.355 1762 1762.
#> 10 4 No Internet 0.563 0.369 1295 1295.
#> 11 4 No smartphone 0.437 0.369 1004 1004
#> 12 4 smartphone 0 0 0 0
#> 13 5 No Internet 0 0 0 0
#> 14 5 No smartphone 1 0 522 522
#> 15 5 smartphone 0 0 0 0
虽然这个return只有7个,因为edu
和smartphone
的组合只有7个出现,因此只有7个被计算在内。
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(n = unweighted(n()))
#> # A tibble: 7 x 3
#> edu smartphone n
#> <fct> <fct> <int>
#> 1 1 No smartphone 2
#> 2 2 smartphone 2
#> 3 3 No Internet 1
#> 4 3 smartphone 1
#> 5 4 No Internet 1
#> 6 4 No smartphone 1
#> 7 5 No smartphone 1
解决方案 1:在 group_by()
中使用 .drop = FALSE
通过使用 group_by()
函数的 .drop
参数,您可以强制 summarize()
生成结果,即使对于未出现在数据中的因子水平组合也是如此。
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone,
.drop = FALSE) %>%
summarize(prop= srvyr::survey_mean(),
total = srvyr::survey_total(),
total_unweighted = srvyr::unweighted(n()))
#> # A tibble: 15 x 7
#> edu smartphone prop prop_se total total_se total_unweighted
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 No Internet 0 0 0 0 0
#> 2 1 No smartphone 1 0 3168 2108. 2
#> 3 1 smartphone 0 0 0 0 0
#> 4 2 No Internet 0 0 0 0 0
#> 5 2 No smartphone 0 0 0 0 0
#> 6 2 smartphone 1 0 1046 693. 2
#> 7 3 No Internet 0.384 0.355 1099 1099. 1
#> 8 3 No smartphone 0 0 0 0 0
#> 9 3 smartphone 0.616 0.355 1762 1762. 1
#> 10 4 No Internet 0.563 0.369 1295 1295. 1
#> 11 4 No smartphone 0.437 0.369 1004 1004 1
#> 12 4 smartphone 0 0 0 0 0
#> 13 5 No Internet 0 0 0 0 0
#> 14 5 No smartphone 1 0 522 522 1
#> 15 5 smartphone 0 0 0 0 0
解决方案 2:加入
您可以制作 2 个不同的汇总数据框,然后加入它们。
我在 n()
之后添加了对 complete
的调用以填充缺失的关卡。制作两个数据框并将它们连接起来得到以下结果:
props <- sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(prop = survey_mean(),
total = survey_total())
counts <- sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(n = unweighted(n())) %>%
complete(edu, smartphone, fill = list(n = 0))
left_join(props, counts, by = c("edu", "smartphone"))
#> # A tibble: 15 x 7
#> edu smartphone prop prop_se total total_se n
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 No Internet 0 0 0 0 0
#> 2 1 No smartphone 1 0 3168 2108. 2
#> 3 1 smartphone 0 0 0 0 0
#> 4 2 No Internet 0 0 0 0 0
#> 5 2 No smartphone 0 0 0 0 0
#> 6 2 smartphone 1 0 1046 693. 2
#> 7 3 No Internet 0.384 0.355 1099 1099. 1
#> 8 3 No smartphone 0 0 0 0 0
#> 9 3 smartphone 0.616 0.355 1762 1762. 1
#> 10 4 No Internet 0.563 0.369 1295 1295. 1
#> 11 4 No smartphone 0.437 0.369 1004 1004 1
#> 12 4 smartphone 0 0 0 0 0
#> 13 5 No Internet 0 0 0 0 0
#> 14 5 No smartphone 1 0 522 522 1
#> 15 5 smartphone 0 0 0 0 0